In [13]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectFromModel

from SGDRegressor import SGDRegressor


%matplotlib inline

In [14]:

def load_data(path):
    """Read a header-less CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str, path-like or file-like
        Source handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table. No header row is consumed (``header=None``), so
        the first line of the file is treated as data.
    """
    frame = pd.read_csv(path, header=None)
    return frame

# Read the training file and convert the table to a plain NumPy array in one step.
data = np.asarray(load_data('year-prediction-msd-train.txt'))

# Column 0 is used as the regression target; every remaining column is a feature.
X = data[:, 1:]
Y = data[:, 0]

In [33]:
def normalize(X):
    """Min-max scale every column of ``X`` into the range [0, 1].

    Parameters
    ----------
    X : array-like
        Numeric data; statistics are taken along axis 0 (per column for a
        2-D input). Integer input is accepted and promoted to float.

    Returns
    -------
    numpy.ndarray
        A new float array of the same shape as ``X``. The input is never
        modified. Columns whose values are all equal come back as zeros
        instead of NaN.
    """
    values = np.asarray(X, dtype=float)
    shifted = values - values.min(axis=0)
    span = shifted.max(axis=0)
    # A constant column has zero span; dividing by it would produce 0/0 = NaN.
    # Dividing by 1 instead leaves the (already zero) shifted values untouched.
    safe_span = np.where(span == 0, 1.0, span)
    return shifted / safe_span

def scale(X):
    """Standardize ``X`` with a freshly fitted ``StandardScaler``.

    The scaler is fitted on ``X`` itself and then applied to it, so the
    statistics come from exactly the rows that are passed in.

    Parameters
    ----------
    X : array-like
        Feature matrix to standardize.

    Returns
    -------
    The transformed copy of ``X`` produced by the scaler.
    """
    return StandardScaler().fit(X).transform(X)

def pca(X, n):
    """Project ``X`` with a PCA fitted on ``X`` itself.

    Parameters
    ----------
    X : array-like
        Feature matrix to decompose.
    n : int or float
        Forwarded unchanged as ``n_components`` to ``sklearn.decomposition.PCA``.

    Returns
    -------
    The transformed data returned by ``PCA.fit_transform``.
    """
    # Use a distinct local name so the function's own name is not shadowed.
    reducer = PCA(n_components=n)
    projected = reducer.fit_transform(X)
    return projected

def mir(X, Y, sort=True):
    """Estimate the mutual information between each feature of ``X`` and ``Y``.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``mutual_info_regression``.
    Y : array-like
        Target values passed to ``mutual_info_regression``.
    sort : bool, default True
        When True (the historical behavior) the scores are returned in
        descending order, so position ``i`` is a rank and NOT a feature index.
        Pass ``sort=False`` to get the scores in feature order, which is what
        is needed when the positions are later used as column indices
        (e.g. via ``index_to_remove`` followed by ``remove_features``).

    Returns
    -------
    numpy.ndarray
        One score per feature, ordered according to ``sort``.
    """
    scores = np.asarray(mutual_info_regression(X, Y))
    if not sort:
        return scores
    # Ascending sort viewed in reverse gives the descending order callers expect.
    return np.sort(scores)[::-1]

def index_to_remove(mi, treshold):
    """Return the positions in ``mi`` whose value is strictly below ``treshold``.

    Parameters
    ----------
    mi : numpy.ndarray
        Array of scores.
    treshold : float
        Cut-off; entries equal to it are kept.

    Returns
    -------
    numpy.ndarray
        Integer indices along the first axis of ``mi`` that satisfy ``mi < treshold``.
    """
    below_cutoff = mi < treshold
    first_axis_hits = np.nonzero(below_cutoff)[0]
    return first_axis_hits
    

def remove_features(X, features):
    """Return a copy of ``X`` without the listed columns.

    Parameters
    ----------
    X : array-like
        Two-dimensional data; columns are addressed along axis 1.
    features : int, sequence of int, or anything ``np.delete`` accepts
        Column positions to drop.

    Returns
    -------
    numpy.ndarray
        New array with the selected columns removed; ``X`` is left untouched.
    """
    trimmed = np.delete(arr=X, obj=features, axis=1)
    return trimmed


def cross_validation(X, Y, max_iter=50, batch_size=20, n_splits=5, learning_rate=0.001):
    """Evaluate ``SGDRegressor`` with K-fold cross-validation.

    A new regressor is trained on every training fold and scored by RMSE on
    the matching held-out fold.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; it is indexed with the integer index arrays produced by ``KFold``.
    Y : numpy.ndarray
        Target vector, indexed the same way.
    max_iter, batch_size, learning_rate
        Forwarded unchanged to the ``SGDRegressor`` constructor.
    n_splits : int
        Number of folds handed to ``KFold``.

    Returns
    -------
    (scores, best_model) : tuple
        ``scores`` is the list of per-fold RMSE values in fold order.
        ``best_model`` is the fitted regressor with the lowest RMSE, or
        ``None`` when no fold produced a comparable error (e.g. every RMSE
        was NaN).
    """
    folds = KFold(n_splits=n_splits)
    scores = []
    best_model = None
    # Start from +inf so the first finite error always becomes the best one.
    # (The previous huge-number sentinel was bound to a misspelled name, which
    # left the comparison variable undefined and crashed on the first fold.)
    lowest_error = float('inf')

    for train, test in folds.split(X, Y):
        sgd = SGDRegressor(learning_rate=learning_rate, max_iter=max_iter, batch_size=batch_size)
        sgd.fit(X[train], Y[train])
        y_pred = sgd.predict(X[test])
        error = np.sqrt(mean_squared_error(Y[test], y_pred))
        scores.append(error)

        if error < lowest_error:
            lowest_error = error
            best_model = sgd

    # SGD_cross_validation unpacks two values, so hand back the best model as well.
    return scores, best_model

def variance(X, threshold):
    """Filter the columns of ``X`` with a ``VarianceThreshold`` selector.

    Parameters
    ----------
    X : array-like
        Feature matrix the selector is fitted on and applied to.
    threshold : float
        Forwarded unchanged as the selector's ``threshold``.

    Returns
    -------
    The reduced feature matrix returned by ``fit_transform``.
    """
    return VarianceThreshold(threshold=threshold).fit_transform(X)

def tree(X, Y):
    """Select columns of ``X`` using an extra-trees model fitted on ``(X, Y)``.

    Parameters
    ----------
    X : array-like
        Feature matrix used both to fit the forest and to be reduced.
    Y : array-like
        Target values used to fit the forest.

    Returns
    -------
    ``X`` restricted to the columns kept by ``SelectFromModel`` (default
    threshold, wrapping the already-fitted forest).
    """
    # NOTE(review): a classifier is fitted here although the rest of the
    # notebook treats Y as a regression target -- confirm this is intended.
    # No random_state is set, so the selected columns may differ between runs.
    forest = ExtraTreesClassifier().fit(X, Y)
    selector = SelectFromModel(forest, prefit=True)
    return selector.transform(X)

In [16]:
# Alternative representations derived from the raw (unscaled) feature matrix.
X_normalize = normalize(X)
X_pca = pca(X, 0.95)

# Standardize all rows, then hold out 20% of them as a fixed test split.
# NOTE(review): the scaler is fitted before the split, so its statistics
# include the held-out rows -- confirm that this is acceptable.
X_scaled = scale(X)
X_train_scaled, X_test_scaled, Y_train, Y_test = train_test_split(
    X_scaled,
    Y,
    test_size=0.2,
    random_state=0,
)

In [17]:
# Reduce the standardized features to the columns kept by the tree-based selector.
X_tree = tree(X_scaled, Y)

In [6]:
# Show the (rows, columns) of the reduced matrix to see how many features survived.
print(X_tree.shape)

(111452, 30)


In [18]:
#Normal Equation
def normal_equation(X, Y, X_test, Y_test):
    """Fit ``LinearRegression`` on ``(X, Y)`` and print its test-set metrics.

    Parameters
    ----------
    X, Y : array-like
        Training features and targets.
    X_test, Y_test : array-like
        Held-out features and targets used only for scoring.

    Returns
    -------
    None
        The R2 score, MSE and RMSE on the test data are printed, not returned.
    """
    model = linear_model.LinearRegression().fit(X, Y)
    predictions = model.predict(X_test)

    # MSE feeds two of the three printed lines, so compute it once.
    mse = mean_squared_error(Y_test, predictions)

    print("R@: %.2f "%r2_score(Y_test, predictions))
    print("MSE %.2f "%mse)
    print("RMSE %.2f "%np.sqrt(mse))
    

In [19]:
#SGD
def SGD(X, Y, X_test, Y_test):
    """Train ``SGDRegressor``, plot its training-cost curve, and print test metrics.

    Parameters
    ----------
    X, Y : array-like
        Training features and targets.
    X_test, Y_test : array-like
        Held-out features and targets used only for scoring.

    Returns
    -------
    None
        The cost curve (when a history is available) is shown with matplotlib
        and the R2 score, MSE and RMSE on the test data are printed.
    """
    sgd = SGDRegressor(learning_rate=0.001, max_iter=200, batch_size=20)
    sgd.fit(X, Y)
    y_pred = sgd.predict(X_test)

    # The original body plotted a bare `train_errors` name that is not defined
    # in this function, which raises NameError unless a stale notebook global
    # happens to exist. Read the history from the model that was just fitted.
    # NOTE(review): assumes SGDRegressor stores its per-epoch errors in a
    # `train_errors` attribute -- TODO confirm against SGDRegressor.py.
    history = getattr(sgd, 'train_errors', None)
    if history is None:
        # Legacy fallback: keep working in sessions that defined the global.
        history = globals().get('train_errors')

    if history is not None and len(history) > 0:
        plt.plot(range(0, len(history)), np.log10(history))
        plt.xlabel('Epochas')
        plt.ylabel('Cost')
        plt.title('Cost vs Iterations')

        plt.tight_layout()
        plt.show()
    else:
        # A missing history must not prevent the metrics from being reported.
        print("No training-error history available; skipping the cost plot.")

    mse = mean_squared_error(Y_test, y_pred)
    print("R@: %.2f "%r2_score(Y_test, y_pred))
    print("MSE %.2f "% mse)
    print("RMSE %.2f "% np.sqrt(mse))
    

In [34]:
def SGD_cross_validation(X, Y, max_iter=50, batch_size=20, n_splits=5, learning_rate=0.001):
    """Run ``cross_validation`` and report the mean of its fold scores.

    Parameters
    ----------
    X, Y : array-like
        Features and targets forwarded to ``cross_validation``.
    max_iter, batch_size, n_splits, learning_rate
        Forwarded unchanged to ``cross_validation``.

    Returns
    -------
    The second element of the pair returned by ``cross_validation``; the mean
    of the first element (the scores) is printed.
    """
    cv_result = cross_validation(
        X,
        Y,
        max_iter=max_iter,
        batch_size=batch_size,
        n_splits=n_splits,
        learning_rate=learning_rate,
    )
    # The result is expected to be exactly a (scores, model) pair.
    fold_scores, winner = cv_result
    print("Error ", np.asarray(fold_scores).mean())
    return winner

In [35]:
def polymonial_regression(X, Y):
    """Fit ``SGDRegressor`` on degree-2 polynomial features and print test metrics.

    The data is split 80/20 with a fixed seed, both parts are expanded with
    ``PolynomialFeatures(degree=2, include_bias=False)``, and the regressor is
    trained on the expanded training part.

    Parameters
    ----------
    X : array-like
        Feature matrix before polynomial expansion.
    Y : array-like
        Target values.

    Returns
    -------
    None
        The R2 score, MSE and RMSE on the held-out part are printed.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

    poly_features = PolynomialFeatures(degree=2, include_bias=False)
    X_train_poly = poly_features.fit_transform(X_train)
    # Apply the expansion fitted on the training split; the transformer must
    # not be re-fitted on held-out data.
    X_test_poly = poly_features.transform(X_test)

    # NOTE(review): the expanded features are not rescaled before SGD, and the
    # recorded run of this notebook printed an RMSE of about 4.6e12 --
    # presumably the optimizer diverged; consider standardizing X_train_poly.
    sgd = SGDRegressor(learning_rate=0.001, max_iter=100, batch_size=20)
    sgd.fit(X_train_poly, Y_train)

    y_pred = sgd.predict(X_test_poly)

    mse = mean_squared_error(Y_test, y_pred)
    print("R@: %.2f "%r2_score(Y_test, y_pred))
    print("MSE %.2f "% mse)
    print("RMSE %.2f "% np.sqrt(mse))

In [32]:
# Baseline: train and evaluate the SGD regressor on the standardized train/test split.
SGD(X_train_scaled, Y_train, X_test_scaled, Y_test)

epocha 0
epocha 1
epocha 2
epocha 3


KeyboardInterrupt: 

In [31]:
# Cross validation: K-fold evaluation of the SGD regressor on all standardized rows (default settings).
SGD_cross_validation(X_scaled, Y)

TypeError: 'float' object cannot be interpreted as an integer

In [82]:
# Normal equation vs. SGD on the tree-selected features.
# Both models are fitted and scored on the same fixed 80/20 split so their metrics are comparable.
X_train_n, X_test_n, Y_train_n, Y_test_n = train_test_split(X_tree, Y, test_size=0.2, random_state=0)
normal_equation(X_train_n, Y_train_n, X_test_n, Y_test_n)

SGD(X_train_n, Y_train_n, X_test_n, Y_test_n)

R@: 0.19 
MSE 94.70 
RMSE 9.73 
epocha 0
epocha 1
epocha 2
epocha 3
epocha 4
epocha 5
epocha 6
epocha 7
epocha 8
epocha 9
epocha 10
epocha 11
epocha 12
epocha 13
epocha 14
epocha 15
epocha 16
epocha 17
epocha 18
epocha 19
epocha 20
epocha 21
epocha 22
epocha 23
epocha 24
epocha 25
epocha 26
epocha 27
epocha 28
epocha 29
epocha 30
epocha 31
epocha 32
epocha 33
epocha 34
epocha 35
epocha 36
epocha 37
epocha 38
epocha 39
epocha 40
epocha 41
epocha 42
epocha 43
epocha 44
epocha 45
epocha 46
epocha 47
epocha 48
epocha 49
epocha 50
epocha 51
epocha 52
epocha 53
epocha 54
epocha 55
epocha 56
epocha 57
epocha 58
epocha 59
epocha 60
epocha 61
epocha 62
epocha 63
epocha 64
epocha 65
epocha 66
epocha 67
epocha 68
epocha 69
epocha 70
epocha 71
epocha 72
epocha 73
epocha 74
epocha 75
epocha 76
epocha 77
epocha 78
epocha 79
epocha 80
epocha 81
epocha 82
epocha 83
epocha 84
epocha 85
epocha 86
epocha 87
epocha 88
epocha 89
epocha 90
epocha 91
epocha 92
epocha 93
epocha 94
epocha 95
epocha 96
epocha 9

In [95]:
# Polynomial regression
# Project the standardized features with PCA (n_components=0.9) and print the resulting shape.
# NOTE(review): x_pca_poly is only printed; the regression below runs on X_tree -- confirm which input was intended.
x_pca_poly = pca(X_scaled, 0.9)
print(x_pca_poly.shape)
polymonial_regression(X_tree, Y)

(111452, 55)
R@: 0.25 
MSE 87.81 
RMSE 9.37 
R@: -181844494456940633522176.00 
MSE 21381116265307558515834880.00 
RMSE 4623971914415.96 


In [29]:
#Cross validation with PCA
# Use the PCA projection computed earlier (X_pca); the previous call passed the
# untransformed X, which contradicted the stated purpose of this cell.
SGD_cross_validation(X_pca, Y)

TypeError: 'float' object cannot be interpreted as an integer

In [None]:
#Cross validation with PCA on the standardized features
# X_pca_scaled is not defined anywhere else in the visible notebook, so the
# original call raised NameError; build it here from the standardized matrix.
# NOTE(review): the old label said "MIR", but no mutual-information selection
# is applied in this cell -- confirm the intended input.
X_pca_scaled = pca(X_scaled, 0.95)
SGD_cross_validation(X_pca_scaled, Y)