In [69]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

from SGDRegressor import SGDRegressor


%matplotlib inline

In [70]:

def load_data(path):
    """Read the delimited text file at ``path`` into a pandas DataFrame.

    The file is parsed with ``header=None``: the first line is treated as
    data and the columns receive integer labels.
    """
    frame = pd.read_csv(path, header=None)
    return frame

# Load the training file and keep it as a plain NumPy array.
data = np.asarray(load_data('year-prediction-msd-train.txt'))

# Column 0 becomes the target vector; every remaining column is a feature.
Y, X = data[:, 0], data[:, 1:]

In [71]:
def normalize(X):
    """Min-max rescale every column of ``X`` into the range [0, 1].

    Parameters
    ----------
    X : array-like, 2-D
        Input matrix; it is not modified.

    Returns
    -------
    numpy.ndarray
        Floating-point array of the same shape where each column has had its
        minimum subtracted and been divided by its range. A column whose
        values are all equal maps to zeros.
    """
    X = np.asarray(X)
    # Integer input cannot hold fractional results; promote it to float first.
    if not np.issubdtype(X.dtype, np.floating):
        X = X.astype(float)

    shifted = X - X.min(axis=0)
    span = shifted.max(axis=0)
    # A constant column has zero range; dividing by 1 maps it to 0 instead of
    # producing NaN from 0/0.
    safe_span = np.where(span == 0, 1, span)
    return shifted / safe_span

def scale(X):
    """Standardise ``X`` with a ``StandardScaler`` fitted on ``X`` itself.

    Returns the transformed copy of ``X``.
    """
    return StandardScaler().fit_transform(X)

def pca(X, n):
    """Fit a ``PCA`` with ``n_components=n`` on ``X`` and return the projection of ``X``."""
    reducer = PCA(n_components=n)
    projected = reducer.fit_transform(X)
    return projected

def mir(X, Y, tresholder):
    """Find the features whose mutual information with ``Y`` is below a threshold.

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix.
    Y : array-like, 1-D
        Target values.
    tresholder : float
        Features with a mutual-information score strictly below this value
        are selected for removal.

    Returns
    -------
    numpy.ndarray
        Column indices of ``X`` (in ascending order) whose score is below
        ``tresholder``; suitable as the ``features`` argument of
        ``remove_features``.
    """
    # Score the arguments that were actually passed in, not notebook globals.
    mi = np.asarray(mutual_info_regression(X, Y))

    # Threshold the scores in their original column order so the positions
    # returned are real column indices of X (thresholding a sorted copy would
    # yield ranks instead).
    index_to_remove = np.where(mi < tresholder)[0]

    return index_to_remove

def remove_features(X, features):
    """Return a copy of ``X`` without the columns listed in ``features``."""
    trimmed = np.delete(X, features, axis=1)
    return trimmed


def cross_validation(X, Y, sgd, n_splits):
    """Evaluate ``sgd`` with K-fold cross validation and return the per-fold RMSE.

    Parameters
    ----------
    X, Y : indexable arrays
        Features and targets; indexed with the integer index arrays produced
        by ``KFold.split``.
    sgd : estimator
        Object exposing ``fit(X, Y)`` and ``predict(X)``. The same instance is
        re-fitted on every fold.
    n_splits : int
        Number of folds.

    Returns
    -------
    list
        One root-mean-squared-error value per fold, in fold order.
    """
    folds = KFold(n_splits=n_splits)
    rmse_per_fold = []

    for train_idx, test_idx in folds.split(X, Y):
        sgd.fit(X[train_idx], Y[train_idx])
        predictions = sgd.predict(X[test_idx])
        fold_mse = mean_squared_error(Y[test_idx], predictions)
        rmse_per_fold.append(np.sqrt(fold_mse))

    return rmse_per_fold

In [72]:
# Standardised features and an 80/20 hold-out split of them.
X_scaled = scale(X)
X_train_scaled, X_test_scaled, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=0)

# PCA with n_components=0.95, applied to the *unscaled* features.
# NOTE(review): PCA is sensitive to feature scale; confirm that projecting the
# raw X (rather than X_scaled) is intended.
X_pca = pca(X, 0.95)

# Column indices selected by mir() with threshold 0.09.
indexes_to_remove = mir(X, Y, 0.09)

# Reuse X_scaled instead of standardising X a second time.
X_mir = remove_features(X_scaled, indexes_to_remove)

X_normalize = normalize(X)

In [81]:

# Show the dimensions of the matrix left after feature removal.
print(np.shape(X_mir))

(111452, 32)


In [73]:
#Normal Equation
def normal_equation(X, Y, X_test, Y_test):
    """Fit a ``LinearRegression`` on ``(X, Y)`` and print its test-set metrics.

    Prints the R2 score, the mean squared error and its square root for the
    predictions made on ``X_test`` against ``Y_test``. Returns ``None``.
    """
    model = linear_model.LinearRegression().fit(X, Y)
    predictions = model.predict(X_test)

    # Compute the MSE once and derive the RMSE from it.
    mse = mean_squared_error(Y_test, predictions)

    print("R@: %.2f " % r2_score(Y_test, predictions))
    print("MSE %.2f " % mse)
    print("RMSE %.2f " % np.sqrt(mse))
    

In [74]:
#SGD
def SGD(X, Y, X_test, Y_test, learning_rate=0.001, max_iter=50, batch_size=20):
    """Train the project's ``SGDRegressor`` and print its test-set metrics.

    Parameters
    ----------
    X, Y : arrays
        Training features and targets.
    X_test, Y_test : arrays
        Held-out features and targets used for the printed metrics.
    learning_rate, max_iter, batch_size
        Forwarded to ``SGDRegressor`` by keyword. The defaults reproduce the
        previously hard-coded configuration and match the defaults of
        ``SGD_cross_validation``.

    Prints the R2 score, the mean squared error and its square root.
    Returns ``None``.
    """
    sgd = SGDRegressor(learning_rate=learning_rate, max_iter=max_iter, batch_size=batch_size)

    sgd.fit(X, Y)
    y_pred = sgd.predict(X_test)

    # Compute the MSE once; the RMSE is derived from it.
    mse = mean_squared_error(Y_test, y_pred)

    print("R@: %.2f "%r2_score(Y_test, y_pred))
    print("MSE %.2f "% mse)
    print("RMSE %.2f "% np.sqrt(mse))

In [75]:
def SGD_cross_validation(X, Y, max_iter=50, batch_size=20, n_splits=5, learning_rate=0.001):
    """Cross-validate the project's ``SGDRegressor`` and print the mean fold RMSE.

    Parameters
    ----------
    X, Y : arrays
        Features and targets handed to ``cross_validation``.
    max_iter, batch_size, learning_rate
        Hyper-parameters forwarded to ``SGDRegressor``.
    n_splits : int
        Number of folds.

    Returns ``None``; the mean of the per-fold scores is printed.
    """
    # Pass hyper-parameters by keyword, as every other SGDRegressor call in
    # this notebook does, so the call does not depend on constructor order.
    sgd = SGDRegressor(learning_rate=learning_rate, max_iter=max_iter, batch_size=batch_size)
    scores = cross_validation(X, Y, sgd, n_splits)
    print("Error ", np.asarray(scores).mean())

In [79]:
def polymonial_regression(X, Y):
    """Train ``SGDRegressor`` on degree-2 polynomial features and print test metrics.

    ``X`` and ``Y`` are split 80/20 (``random_state=0``). The polynomial
    expansion is fitted on the training split and then applied to the test
    split. Prints the R2 score, the mean squared error and its square root.
    Returns ``None``.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

    poly_features = PolynomialFeatures(degree=2, include_bias=False)
    X_train_poly = poly_features.fit_transform(X_train)

    # Only transform the test split: the transformer must be fitted on the
    # training data alone, never re-fitted on held-out data.
    X_test_poly = poly_features.transform(X_test)

    # NOTE(review): the expanded features are not re-standardised before
    # training; confirm whether that is intended for this learning rate.
    sgd = SGDRegressor(learning_rate=0.001, max_iter=40, batch_size=20)
    sgd.fit(X_train_poly, Y_train)

    y_pred = sgd.predict(X_test_poly)

    # Compute the MSE once; the RMSE is derived from it.
    mse = mean_squared_error(Y_test, y_pred)

    print("R@: %.2f "%r2_score(Y_test, y_pred))
    print("MSE %.2f "% mse)
    print("RMSE %.2f "% np.sqrt(mse))

In [76]:
# Baseline: SGD trained and evaluated on the standardised hold-out split.
SGD(X=X_train_scaled, Y=Y_train, X_test=X_test_scaled, Y_test=Y_test)

shape of Y (89161,)
epocha 0
score -0.215725745908
epocha 1
score 0.0904424042137
epocha 2
score -0.0901379324509
epocha 3
score 0.182882730895
epocha 4
score 0.179909364853
epocha 5
score 0.191452704785
epocha 6
score 0.217420877428
epocha 7
score 0.217500249983
epocha 8


KeyboardInterrupt: 

In [None]:
# Cross validation on the full standardised feature matrix.
SGD_cross_validation(X=X_scaled, Y=Y)

In [80]:
# Polynomial regression on the standardised feature matrix.
polymonial_regression(X=X_scaled, Y=Y)

shape of Y (89161,)
epocha 0
score -1.41513932304e+27
epocha 1
score -2.78612674766e+26
epocha 2
score -1.31905085902e+26
epocha 3
score -4.55869007578e+25
epocha 4
score -1.97559883177e+26
epocha 5
score -1.40403493587e+25
epocha 6
score -1.03239278698e+25
epocha 7


KeyboardInterrupt: 

In [None]:
#Cross validation with PCA
# X_pca_scaled was never assigned anywhere above, so build it here from the
# standardised features; otherwise this cell raises NameError on a fresh kernel.
# NOTE(review): assumes "PCA of the scaled features" is what the name means - confirm.
X_pca_scaled = pca(X_scaled, 0.95)
SGD_cross_validation(X_pca_scaled, Y)

In [None]:
#Cross validation with MIR
# Evaluate the mutual-information-filtered matrix, not the PCA one.
SGD_cross_validation(X_mir, Y)