## Gradient Descent - Boston Dataset
The Boston dataset is one of the datasets available in sklearn.
You are given a training dataset CSV file containing the X train and Y train data. As studied in the lecture, your task is to implement the Gradient Descent algorithm and use it to produce predictions for the given test dataset.
Your task is to:
    1. Code Gradient Descent for N features and come up with predictions.
    2. Try and test with various combinations of learning rates and number of iterations.
    3. Try using Feature Scaling, and see if it helps you in getting better results. 
Read Instructions carefully -
    1. Use Gradient Descent as a training algorithm and submit results predicted.
    2. Files are in csv format, you can use genfromtxt function in numpy to load data from csv file. Similarly you can use savetxt function to save data into a file.
    3. Submit a csv file with only predictions for X test data. File name should not have spaces. File should not have any headers and should only have one column i.e. predictions. Also predictions shouldn't be in exponential form. 
    4. Your score is based on coefficient of determination.


In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the train/test CSV files and split them into feature/target arrays.

# Training file: the first row is read as column names (names=True).
train_dataset = np.genfromtxt(
    "../training_boston_x_y_train.csv",
    delimiter=",",
    names=True,
)
train_df = pd.DataFrame(train_dataset)
print(train_df.describe())

# Every column except the last becomes X_train; the last column becomes Y_train.
num_cols = train_df.shape[1]
X_train = train_df.values[:, :-1]
Y_train = train_df.values[:, -1]
print(X_train.shape)
print(Y_train.shape)

# Test file: read without a header row; every column is kept as a feature.
test_dataset = np.genfromtxt("../test_boston_x_test.csv", delimiter=",")
test_df = pd.DataFrame(test_dataset)
print(test_df.describe())
num_cols = test_df.shape[1]
X_test = test_df.values

print(X_test.shape)

# print(X_train)


             CRIM          ZN       INDUS        CHAS         NOX          RM  \
count  379.000000  379.000000  379.000000  379.000000  379.000000  379.000000   
mean     0.019628    0.002455    0.036170    0.028955    0.028775    0.032202   
std      1.067490    1.000813    1.017497    1.048995    0.999656    1.001174   
min     -0.417713   -0.487722   -1.516987   -0.272599   -1.465882   -3.880249   
25%     -0.408171   -0.487722   -0.867691   -0.272599   -0.878475   -0.571480   
50%     -0.383729   -0.487722   -0.180458   -0.272599   -0.144217   -0.103479   
75%      0.055208    0.156071    1.015999   -0.272599    0.628913    0.529069   
max      9.941735    3.804234    2.422565    3.668398    2.732346    3.555044   

              AGE         DIS         RAD         TAX     PTRATIO           B  \
count  379.000000  379.000000  379.000000  379.000000  379.000000  379.000000   
mean     0.038395   -0.001288    0.043307    0.043786    0.019218   -0.015785   
std      0.985209    1.0278

In [5]:
# Relabel the 14 columns of train_df as 0..13, then append the product of
# every unordered pair (squares included) of the first 13 columns under the
# name "<i>_<j>". Column 13 is never used as a factor.
train_df.columns = np.arange(14)
for i, j in ((a, b) for a in range(13) for b in range(a, 13)):
    train_df["{}_{}".format(i, j)] = train_df[i] * train_df[j]
train_df.shape

(379, 105)

In [3]:
from sklearn.ensemble import GradientBoostingRegressor

In [4]:
from sklearn import model_selection
"""
    n = 10
    sub_Size = 0.7
    Y_pred = np.zeros(len(X_test))
    Y_train_pred = np.zeros(int(sub_Size * len(X_train)))
    num_s = 0
    for i in range(n):
        X1, X2, Y1, Y2 = model_selection.train_test_split(X_train, Y_train, test_size = 1 - sub_Size)
        model = GradientBoostingRegressor(loss = 'huber', subsample = 0.9)
        model.fit(X1, Y1)
        #print("Training Score : ", model.score(X_train, Y_train))
        score = model.score(X1, Y1)
        if score >= 0:
            print(num_s," Training Score : ", score)
            Y_i_pred = model.predict(X_test)
            Y_pred += Y_i_pred
            Y_train_i_pred = model.predict(X1)
            Y_train_pred += Y_train_i_pred
            num_s += 1
    Y_pred /= num_s
    Y_train_pred /= num_s          
    
    #print(Y_pred, pred)
    return Y_train_pred, Y_pred
"""
def add_more_features(X_train, imp_cols_indices=None):
    """Return X_train augmented with polynomial and interaction features.

    The input is wrapped in a DataFrame and the following derived columns are
    appended after the original ones, in this order:

    1. For every exponent 3..9 and every selected column, ``x ** exponent``
       (named ``"<label>_pow_<exponent>"``).
    2. For every unordered pair (i, j) of selected columns, with i == j
       included, the nine products ``x_i ** a * x_j ** b`` for a, b in 1..3.
       The name repeats each label ``a`` / ``b`` times around an underscore,
       e.g. ``"00_1"`` for ``x_0 ** 2 * x_1``.

    Parameters
    ----------
    X_train : array-like accepted by ``pd.DataFrame``
        Feature matrix. Columns are looked up by integer position, so the
        frame built from it must have labels 0..n-1 (true for a plain array).
    imp_cols_indices : sequence of int, optional
        Positions of the columns to derive features from. ``None`` or an
        empty sequence selects every column.

    Returns
    -------
    numpy.ndarray
        The original columns followed by the derived ones.

    Notes
    -----
    A summary (``describe()``) of the augmented frame is printed.

    Because names are built by string repetition, they are not unique once
    labels have more than one digit: column 1 squared times column 11 and
    column 11 times column 11 are both called ``"11_11"``. As before, the
    feature computed later silently replaces the earlier one under that name.
    """
    X_train_df = pd.DataFrame(X_train)
    col_names = X_train_df.columns

    # None and an empty selection both mean "use every column".
    if imp_cols_indices is None or len(imp_cols_indices) == 0:
        imp_cols_indices = np.arange(len(col_names))

    # Collect the derived columns in a dict and attach them in one concat.
    # Inserting hundreds of columns one by one fragments the frame and
    # triggers pandas' PerformanceWarning. A repeated key keeps its first
    # position and takes the latest value, exactly like column assignment.
    derived = {}

    for exponent in range(3, 10):
        for i in imp_cols_indices:
            name = str(col_names[i]) + "_pow_" + str(exponent)
            derived[name] = X_train_df[i] ** exponent

    # (a, b) exponents applied to the pair (x_i, x_j); the order fixes the
    # column order of the result.
    exponent_pairs = (
        (1, 1), (2, 1), (1, 2), (2, 2),
        (3, 1), (3, 2), (3, 3), (2, 3), (1, 3),
    )
    for start, i in enumerate(imp_cols_indices):
        for j in imp_cols_indices[start:]:
            for a, b in exponent_pairs:
                name = str(col_names[i]) * a + "_" + str(col_names[j]) * b
                derived[name] = X_train_df[i] ** a * X_train_df[j] ** b

    new_df = pd.concat(
        [X_train_df, pd.DataFrame(derived, index=X_train_df.index)],
        axis=1,
    )

    print(new_df.describe())
    return new_df.values

def add_cube_features (X_train):
    """Return X_train with the cube of every column appended.

    The input is wrapped in a DataFrame; for each column a new column named
    "<label>_pow_3" holding the element-wise cube is added after the original
    columns. The result is returned as a NumPy array.

    Columns are looked up by integer position, so the frame built from
    X_train must have labels 0..n-1 (true for a plain array).
    """
    source_df = pd.DataFrame(X_train)
    labels = source_df.columns
    augmented = source_df.copy()

    for position in range(len(labels)):
        augmented[str(labels[position]) + "_pow_3"] = source_df[position] ** 3

    return augmented.values
    

In [5]:
# Expand both feature matrices with add_more_features, standardise them, and
# make the scaled versions the new X_train / X_test. The unexpanded arrays
# are kept as X_train_original / X_test_original.
from sklearn import preprocessing as pps
import copy

X_train_new = add_more_features(X_train)
X_test_new = add_more_features(X_test)

# The scaler is fitted on the training matrix only and then applied to both
# matrices, so the test rows are scaled with the training statistics.
std_scaler = pps.StandardScaler()
X_train_scaled = std_scaler.fit(X_train_new).transform(X_train_new)
X_test_scaled = std_scaler.transform(X_test_new)

X_train_original, X_test_original = copy.deepcopy(X_train), copy.deepcopy(X_test)
X_train, X_test = X_train_scaled, X_test_scaled

print(X_train.shape)
print(X_test.shape)


                0           1           2           3           4           5  \
count  379.000000  379.000000  379.000000  379.000000  379.000000  379.000000   
mean     0.019628    0.002455    0.036170    0.028955    0.028775    0.032202   
std      1.067490    1.000813    1.017497    1.048995    0.999656    1.001174   
min     -0.417713   -0.487722   -1.516987   -0.272599   -1.465882   -3.880249   
25%     -0.408171   -0.487722   -0.867691   -0.272599   -0.878475   -0.571480   
50%     -0.383729   -0.487722   -0.180458   -0.272599   -0.144217   -0.103479   
75%      0.055208    0.156071    1.015999   -0.272599    0.628913    0.529069   
max      9.941735    3.804234    2.422565    3.668398    2.732346    3.555044   

                6           7           8           9      ...       \
count  379.000000  379.000000  379.000000  379.000000      ...        
mean     0.038395   -0.001288    0.043307    0.043786      ...        
std      0.985209    1.027803    1.016265    1.019974    

In [6]:
# Reduce the expanded feature matrices to 200 principal components.
from sklearn.decomposition import PCA

# The components are fitted on the training and test rows stacked together,
# then each matrix is projected onto them separately.
pca = PCA(n_components=200)
pca.fit(np.concatenate([X_train, X_test]))

X_train, X_test = pca.transform(X_train), pca.transform(X_test)

print(X_train.shape)
print(X_test.shape)


(379, 200)
(127, 200)


In [17]:
# Fit a gradient-boosting regressor (Huber loss, 100 trees of depth 5, no
# row subsampling) and predict the test set. The printed score is computed
# on the same rows the model was fitted on, not on held-out data.
model = GradientBoostingRegressor(
    loss="huber",
    subsample=1,
    n_estimators=100,
    max_depth=5,
).fit(X_train, Y_train)
print("Training Score: ", model.score(X_train, Y_train))
Y_pred = model.predict(X_test)

Training Score:  0.999382177435


In [18]:
# Write the predictions with no header, formatted as fixed-point numbers
# with five decimals (never exponential notation).
np.savetxt("boston_dataset_pred.csv", Y_pred, fmt="%.5f")