## Gradient Descent - Boston Dataset
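Boston dataset is one of the datasets available in sklearn. Note that the code in this notebook does not load it: it reads the CCPP csv files (feature columns T, V, AP, RH and target column EP).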
You are given a training dataset csv file containing the X train and Y train data. As studied in the lecture, your task is to implement the Gradient Descent algorithm and use it to produce predictions for the given test dataset.
Your task is to:
    1. Code Gradient Descent for N features and come up with predictions.
    2. Try and test with various combinations of learning rates and number of iterations.
    3. Try using Feature Scaling, and see if it helps you in getting better results. 
Read Instructions carefully -
    1. Use Gradient Descent as a training algorithm and submit results predicted.
    2. Files are in csv format, you can use genfromtxt function in numpy to load data from csv file. Similarly you can use savetxt function to save data into a file.
    3. Submit a csv file containing only the predictions for the X test data. The file name should not contain spaces. The file should not have any headers and should have only one column, i.e. the predictions. Also, the predictions shouldn't be in exponential form.
    4. Your score is based on coefficient of determination.


In [1]:
import numpy as np
import pandas as pd

In [2]:
# Loading Datasets
#
# The training csv has a header row (names=True), so its columns keep their
# names; the test csv is read without column names (names=None).

train_dataset = np.genfromtxt(
    "../training_ccpp_x_y_train.csv",
    delimiter=",",
    names=True,
)
train_df = pd.DataFrame(train_dataset)
print(train_df.describe())

# Every column except the last one is a feature; the last column is the target.
num_cols = len(train_df.columns)
train_values = train_df.values
X_train = train_values[:, :num_cols - 1]
Y_train = train_values[:, num_cols - 1]
for loaded in (X_train, Y_train):
    print(loaded.shape)

test_dataset = np.genfromtxt(
    "../test_ccpp_x_test.csv",
    delimiter=",",
    names=None,
)
test_df = pd.DataFrame(test_dataset)
print(test_df.describe())
num_cols = len(test_df.columns)

# The test file holds features only, so the whole table is X_test.
X_test = test_df.values
print(X_test.shape)

# print(X_train)

                 T            V           AP           RH           EP
count  7176.000000  7176.000000  7176.000000  7176.000000  7176.000000
mean     19.629712    54.288154  1013.263032    73.275818   454.431293
std       7.475256    12.751468     5.964863    14.625093    17.134571
min       1.810000    25.360000   992.890000    25.560000   420.260000
25%      13.470000    41.740000  1009.010000    63.202500   439.737500
50%      20.315000    52.050000  1012.910000    74.895000   451.740000
75%      25.720000    66.540000  1017.302500    84.925000   468.667500
max      35.770000    81.560000  1033.300000   100.160000   495.760000
(7176, 4)
(7176,)
                0            1            2            3
count  2392.00000  2392.000000  2392.000000  2392.000000
mean     19.71579    54.358754  1013.247216    73.408457
std       7.38488    12.578763     5.861068    14.528135
min       3.38000    25.360000   993.740000    26.670000
25%      13.66000    41.730000  1009.300000    63.615000
5

In [3]:
from sklearn.ensemble import GradientBoostingRegressor

In [4]:
from sklearn import model_selection

def train_n_models(X_train, Y_train, X_test, n_models=3, sub_size=1.0):
    """Average the predictions of several gradient-boosting regressors.

    Each round fits a fresh ``GradientBoostingRegressor`` (Huber loss,
    ``subsample=0.95``). A model is kept only when its R^2 score on the rows
    it was fitted on is non-negative; the kept models' predictions are
    averaged.

    Parameters
    ----------
    X_train, Y_train : array-like
        Training features and targets.
    X_test : array-like
        Features to predict.
    n_models : int, default 3
        Number of models to fit.
    sub_size : float, default 1.0
        Fraction of the training rows each model is fitted on. With the
        default every model sees all rows.

    Returns
    -------
    (Y_train_pred, Y_pred)
        Averaged predictions for every row of ``X_train`` (in the original
        row order, so they line up with ``Y_train``) and for ``X_test``.

    Raises
    ------
    ValueError
        If no model reached a non-negative training score, because the
        average would otherwise be 0/0.
    """
    Y_pred = np.zeros(len(X_test))
    Y_train_pred = np.zeros(len(X_train))
    num_s = 0
    for _ in range(n_models):
        if sub_size < 1:
            # Draw a random subset of the rows for this model only.
            X1, _X_rest, Y1, _Y_rest = model_selection.train_test_split(
                X_train, Y_train, train_size=sub_size)
        else:
            # Splitting with an empty hold-out is rejected by
            # train_test_split, so use the full data directly.
            X1, Y1 = X_train, Y_train

        # The loss is selected with `loss=`; the estimator has no `ls`
        # keyword argument.
        model = GradientBoostingRegressor(loss='huber', subsample=0.95)
        model.fit(X1, Y1)
        score = model.score(X1, Y1)
        if score >= 0:
            print(num_s, " Training Score : ", score)
            Y_pred += model.predict(X_test)
            # Predict on the unshuffled training matrix so that the summed
            # predictions of different models refer to the same rows.
            Y_train_pred += model.predict(X_train)
            num_s += 1

    if num_s == 0:
        raise ValueError("no model reached a non-negative training score")

    Y_pred /= num_s
    Y_train_pred /= num_s
    return Y_train_pred, Y_pred

def add_more_features(X_train, imp_cols_indices=None):
    """Append polynomial and interaction features to a feature matrix.

    The result contains, in this order:

    1. all original columns;
    2. the cube of every selected column;
    3. for every pair of selected columns ``(a, b)`` with ``b`` at or after
       ``a`` in the selection: ``a*b``, ``a**2 * b``, ``a * b**2`` and
       ``a**2 * b**2``.

    Columns are addressed purely by position. Earlier versions stored each
    feature under a generated text label, and labels of different features
    could coincide once a column index reached 11 (``"1" * 2 == "11"``), so
    later features silently overwrote earlier ones. Building the matrix
    positionally keeps every feature.

    Parameters
    ----------
    X_train : array-like
        2-D feature matrix (anything ``pandas.DataFrame`` accepts).
    imp_cols_indices : sequence of int, optional
        Positions of the columns to expand. ``None`` or an empty sequence
        selects every column.

    Returns
    -------
    numpy.ndarray
        The original columns followed by the generated ones.
    """
    # Go through DataFrame so the input is coerced the same way as before.
    base = pd.DataFrame(X_train).values

    if imp_cols_indices is None or len(imp_cols_indices) == 0:
        imp_cols_indices = np.arange(base.shape[1])

    selected = [base[:, k] for k in imp_cols_indices]
    squared = [column ** 2 for column in selected]

    generated = [column ** 3 for column in selected]
    for a in range(len(selected)):
        for b in range(a, len(selected)):
            generated.append(selected[a] * selected[b])
            generated.append(squared[a] * selected[b])
            generated.append(selected[a] * squared[b])
            generated.append(squared[a] * squared[b])

    # column_stack keeps the 2-D base as is and turns each 1-D feature
    # into one extra column.
    return np.column_stack([base] + generated)

def add_cube_features(X_train):
    """Return X_train with the cube of every column appended.

    The input is wrapped in a DataFrame; for each column a new column holding
    its values raised to the third power is added after the existing ones.
    The combined table is returned as a NumPy array.
    """
    source_df = pd.DataFrame(X_train)
    labels = source_df.columns
    exponent = 3

    augmented = source_df.copy()
    for position, label in enumerate(labels):
        cube_label = str(label) + "_pow_" + str(exponent)
        augmented[cube_label] = source_df[position] ** 3

    return augmented.values
    

In [5]:
from sklearn import preprocessing as pps
import copy

# Keep untouched copies of the raw matrices, because X_train / X_test are
# rebound to the scaled versions at the end of this cell.
X_train_original = copy.deepcopy(X_train)
X_test_original = copy.deepcopy(X_test)

# Expand both matrices with the same generated features.
X_train_new = add_more_features(X_train)
X_test_new = add_more_features(X_test)

# The scaler is fitted on the training rows only and then applied to both sets.
std_scaler = pps.StandardScaler()
X_train_scaled = std_scaler.fit(X_train_new).transform(X_train_new)
X_test_scaled = std_scaler.transform(X_test_new)

X_train, X_test = X_train_scaled, X_test_scaled

for matrix in (X_train, X_test):
    print(matrix.shape)


(7176, 48)
(2392, 48)


In [7]:
# extract best features
#
# Project the feature matrices onto 20 principal components.
# NOTE(review): the projection is fitted on the training and test rows
# stacked together, while the StandardScaler cell fits on training rows only.
# Confirm that using test rows here is intended.

from sklearn.decomposition import PCA

pca = PCA(n_components=20)
stacked_rows = np.vstack((X_train, X_test))
pca.fit(stacked_rows)

X_train, X_test = pca.transform(X_train), pca.transform(X_test)

for reduced in (X_train, X_test):
    print(reduced.shape)


(7176, 20)
(2392, 20)


In [8]:
# Fit the models via train_n_models and keep its two results: the averaged
# predictions for the training rows and the averaged predictions for X_test.
Y_train_pred, Y_pred = train_n_models(X_train, Y_train, X_test)

0  Training Score :  0.95302892316
1  Training Score :  0.953009801315
2  Training Score :  0.952794772998


In [43]:
#print(score(Y_train, Y_train_pred))

In [9]:
# Write the test predictions to a csv file. The '%.5f' format prints each
# value in fixed-point notation with five decimals (never exponential form).
np.savetxt("ccpp_dataset_pred.csv", Y_pred, '%.5f')