In [23]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

from SGDRegressor import SGDRegressor


%matplotlib inline

In [2]:

def load_data(path):
	return pd.read_csv(path, header=None)

data = load_data('year-prediction-msd-train.txt')

data  = np.asarray(data)

Y = data[:,0]
X = data[:,1:]

In [3]:
def normalize(X):
    Xc = X - X.min(axis=0)
    Xc /= Xc.max(axis=0)
    return Xc

def scale(X):
    scaler = StandardScaler()
    scaler.fit(X)
    return scaler.transform(X)

def pca(X, n):
    pca = PCA(n_components=n)
    return pca.fit_transform(X)

def mir(X, Y, tresholder):
    mi = mutual_info_regression(X, Y)
    mi_sort = sorted(mi, reverse=True)
    mi_index_sort = np.argsort(mi)[::-1]
    mi = np.asarray(mi_sort)

    index_to_remove = np.where(mi < tresholder)[0]
    
    return index_to_remove

def remove_features(X, features):
    return np.delete(X, features, axis=1)


def cross_validation(X, Y, sgd, n_splits):
    cv = KFold(n_splits=n_splits)
    scores = list()

    for train, test in cv.split(X, Y):
        sgd.fit(X[train], Y[train])
        y_pred = sgd.predict(X[test])
        scores.append(np.sqrt(mean_squared_error(Y[test], y_pred)))
        
    return scores

In [4]:
X_scaled = scale(X)
X_train_scaled, X_test_scaled, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=0)

X_pca = pca(X, 0.95)

indexes_to_remove = mir(X, Y, 0.09)

X_mir = remove_features(scale(X), indexes_to_remove)

X_normalize = normalize(X)

In [81]:

print(X_mir.shape)

(111452, 32)


In [5]:
#Normal Equation
def normal_equation(X, Y, X_test, Y_test):
    regression = linear_model.LinearRegression()

    regression.fit(X, Y)

    y_pred = regression.predict(X_test)

    print("R@: %.2f "%r2_score(Y_test, y_pred))
    print("MSE %.2f "%mean_squared_error(Y_test, y_pred))
    print("RMSE %.2f "%np.sqrt(mean_squared_error(Y_test, y_pred)))
    

In [37]:
#SGD
def SGD(X, Y, X_test, Y_test):
    sgd = SGDRegressor(learning_rate=0.001, max_iter=50, batch_size=20)
    sgd.fit(X, Y)
    y_pred = sgd.predict(X_test)

#     sgd.fit(X, Y, X_test, Y_test)
    #train_errors = sgd.getTrain_errors()
    #plt.plot(range(1, len(train_errors)), np.log10(train_errors))
    #plt.xlabel('Epochas')
    #plt.ylabel('Cost')
    #plt.title('Cost vs Iterations')
    
    #plt.tight_layout()
    #plt.show()
    
    print("R@: %.2f "%r2_score(Y_test, y_pred))
    print("MSE %.2f "% mean_squared_error(Y_test, y_pred))
    print("RMSE %.2f "% np.sqrt(mean_squared_error(Y_test, y_pred)))
    

In [7]:
def SGD_cross_validation(X, Y, max_iter=50, batch_size=20, n_splits=5, learning_rate=0.001):
    scores = cross_validation(X, Y, SGDRegressor(learning_rate, max_iter, batch_size), n_splits)
    print("Error ", np.asarray(scores).mean())

In [8]:
def polymonial_regression(X, Y):
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
    
    poly_features = PolynomialFeatures(degree=2, include_bias=False)
    X_train_poly = poly_features.fit_transform(X_train)

    X_test_poly =  poly_features.fit_transform(X_test)
    
    sgd = SGDRegressor(learning_rate=0.001, max_iter=40, batch_size=20)
    sgd.fit(X_train_poly, Y_train)

    y_pred = sgd.predict(X_test_poly)
    
    print("R@: %.2f "%r2_score(Y_test, y_pred))
    print("MSE %.2f "% mean_squared_error(Y_test, y_pred))
    print("RMSE %.2f "% np.sqrt(mean_squared_error(Y_test, y_pred)))

In [None]:
#Baseline
SGD(X_train_scaled, Y_train, X_test_scaled, Y_test)

epocha 0
epocha 1
epocha 2
epocha 3
epocha 4
epocha 5
epocha 6
epocha 7
epocha 8
epocha 9
epocha 10
epocha 11
epocha 12
epocha 13
epocha 14
epocha 15
epocha 16
epocha 17
epocha 18
epocha 19
epocha 20
epocha 21
epocha 22
epocha 23
epocha 24
epocha 25


In [9]:
#Cross validation
SGD_cross_validation(X_scaled, Y)

TypeError: 'float' object cannot be interpreted as an integer

In [11]:
#Polynomial regression
polymonial_regression(X_scaled, Y)

MemoryError: 

In [None]:
#Cross validation with PCA
SGD_cross_validation(X_pca_scaled, Y)

In [None]:
#Cross validation with MIR
SGD_cross_validation(X_pca_scaled, Y)