In [7]:
%matplotlib inline 

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

# special matplotlib argument for improved plots
from matplotlib import rcParams

from sklearn.datasets import load_boston
boston = load_boston()

# Choose to run on the test data or submission data
test_or_submission = 'test'


# Loading test data
if test_or_submission == 'test':
    bos = pd.DataFrame(boston.data)
    bos.columns = boston.feature_names
    bos['PRICE'] = boston.target
    bos['ID'] = bos.index
    
    Y = bos[['ID','PRICE']]
    X = bos
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 5)

# Kaggle subbmission file loading:
else:
    # Renaming or adding columns to create similar format to test data
    bos_train = pd.read_csv('BostonHousingTrain.csv')
    bos_train.columns=[x.upper() for x in bos_train.columns]
    bos_train=bos_train.rename(columns = {'MEDV':'PRICE'})
    bos_train=bos_train.rename(columns = {'BLACK':'B'})

    Y_train = bos_train[['ID','PRICE']]
    X_train = bos_train
    Y_train['ID_INDEX'] = Y_train['ID'] 
    X_train['ID_INDEX'] = X_train['ID']     
    Y_train.set_index('ID_INDEX', inplace=True)
    X_train.set_index('ID_INDEX', inplace=True)
    
    bos_test = pd.read_csv('BostonHousingTest.csv')
    bos_test.columns=[x.upper() for x in bos_test.columns] 
    bos_test['ID_INDEX'] = bos_test['ID']     
    bos_test.set_index('ID_INDEX', inplace=True)
    bos_test['PRICE'] = 0
    bos_test=bos_test.rename(columns = {'BLACK':'B'})
    X_test = bos_test

print(X_train.shape)
print(Y_train.shape)

print(X_train.head())
print(Y_train.head())

print(X_test.head())

(379, 15)
(379, 2)
        CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
218  0.11069   0.0  13.89   1.0  0.550  5.951  93.8  2.8893  5.0  276.0   
117  0.15098   0.0  10.01   0.0  0.547  6.021  82.6  2.7474  6.0  432.0   
195  0.01381  80.0   0.46   0.0  0.422  7.875  32.0  5.6484  4.0  255.0   
162  1.83377   0.0  19.58   1.0  0.605  7.802  98.2  2.0407  5.0  403.0   
219  0.11425   0.0  13.89   1.0  0.550  6.373  92.4  3.3633  5.0  276.0   

     PTRATIO       B  LSTAT  PRICE   ID  
218     16.4  396.90  17.92   21.5  218  
117     17.8  394.51  10.30   19.2  117  
195     14.4  394.23   2.97   50.0  195  
162     14.7  389.61   1.92   50.0  162  
219     16.4  393.74  10.50   23.0  219  
      ID  PRICE
218  218   21.5
117  117   19.2
195  195   50.0
162  162   50.0
219  219   23.0
         CRIM    ZN  INDUS  CHAS    NOX     RM    AGE     DIS   RAD    TAX  \
226   0.38214   0.0   6.20   0.0  0.504  8.040   86.5  3.2157   8.0  307.0   
292   0.03615  80.0   4.95

In [8]:

# Remove where PRICE is 50
# Removing rows with Y = 50
removeIndex = pd.DataFrame(columns=['ID'])
removeIndex.dtype = np.int64

Y_train.drop(removeIndex['ID'], inplace=True)
X_train.drop(removeIndex['ID'], inplace=True)

# Only keep feature identified in Data Exploration and Feature Selection
# After a lot of effort with data exploration and feature selection, it turn out that 
# using all of the features gave the best results
col_to_drop = ['ID','PRICE']

X_train = X_train.drop(col_to_drop, axis = 1)
Y_train = Y_train.drop(['ID'], axis = 1)
X_test = X_test.drop(col_to_drop, axis = 1)

if test_or_submission == 'test':
    Y_test = Y_test.drop(['ID'], axis = 1)


print(X_train.head())
print(Y_train.head())
print(X_test.head())

        CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
218  0.11069   0.0  13.89   1.0  0.550  5.951  93.8  2.8893  5.0  276.0   
117  0.15098   0.0  10.01   0.0  0.547  6.021  82.6  2.7474  6.0  432.0   
195  0.01381  80.0   0.46   0.0  0.422  7.875  32.0  5.6484  4.0  255.0   
162  1.83377   0.0  19.58   1.0  0.605  7.802  98.2  2.0407  5.0  403.0   
219  0.11425   0.0  13.89   1.0  0.550  6.373  92.4  3.3633  5.0  276.0   

     PTRATIO       B  LSTAT  
218     16.4  396.90  17.92  
117     17.8  394.51  10.30  
195     14.4  394.23   2.97  
162     14.7  389.61   1.92  
219     16.4  393.74  10.50  
     PRICE
218   21.5
117   19.2
195   50.0
162   50.0
219   23.0
         CRIM    ZN  INDUS  CHAS    NOX     RM    AGE     DIS   RAD    TAX  \
226   0.38214   0.0   6.20   0.0  0.504  8.040   86.5  3.2157   8.0  307.0   
292   0.03615  80.0   4.95   0.0  0.411  6.630   23.4  5.1167   4.0  245.0   
90    0.04684   0.0   3.41   0.0  0.489  6.417   66.1  3.0923   2.0  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [10]:
# Linear Regression Model
from sklearn.linear_model import LinearRegression

final_results = {}
final_data = {}

lm = LinearRegression()
lm.fit(X_train, Y_train)

Y_pred = lm.predict(X_test)

if test_or_submission == 'test':
    mse = sklearn.metrics.mean_squared_error(Y_test, Y_pred)
    final_results['Linear'] = np.sqrt(mse)
    
Y_pred = Y_pred.reshape((-1,1))
temp_final_data = pd.DataFrame({'ID':X_test.index.values, 'MEDV':Y_pred[:,0]}, columns=['ID','MEDV'])
temp_final_data.columns = ['ID','MEDV']
final_data['Linear'] = temp_final_data


    
# L2 Regularization
# force to small coefficients
from sklearn.linear_model import Ridge 

ridge = Ridge(alpha=7) 
ridge.fit(X_train, Y_train)
Y_pred = ridge.predict(X_test)

if test_or_submission == 'test':
    mse = sklearn.metrics.mean_squared_error(Y_test, Y_pred)
    final_results['Ridge'] = np.sqrt(mse)

Y_pred = Y_pred.reshape((-1,1))
temp_final_data = pd.DataFrame({'ID':X_test.index.values, 'MEDV':Y_pred[:,0]}, columns=['ID','MEDV'])
temp_final_data.columns = ['ID','MEDV']
final_data['Ridge'] = temp_final_data


# L1 Regularization
# force coefficients to 0
from sklearn.linear_model import Lasso 

lasso = Lasso(alpha=.05)
lasso.fit(X_train, Y_train)
Y_pred = lasso.predict(X_test)

if test_or_submission == 'test':
    mse = sklearn.metrics.mean_squared_error(Y_test, Y_pred)
    final_results['Lasso'] = np.sqrt(mse)

Y_pred = Y_pred.reshape((-1,1))
temp_final_data = pd.DataFrame({'ID':X_test.index.values, 'MEDV':Y_pred[:,0]}, columns=['ID','MEDV'])
temp_final_data.columns = ['ID','MEDV']
final_data['Lasso'] = temp_final_data

# ElasticNet
# ElasticNet is a form of regularization regression that combines the properties of both 
# Ridge Regression and LASSO regression
from sklearn.linear_model import ElasticNet 
elastic = ElasticNet() 
elastic.fit(X_train,Y_train)
Y_pred = elastic.predict(X_test)

if test_or_submission == 'test':
    mse = sklearn.metrics.mean_squared_error(Y_test, Y_pred)
    final_results['Elastic'] = np.sqrt(mse)

Y_pred = Y_pred.reshape((-1,1))
temp_final_data = pd.DataFrame({'ID':X_test.index.values, 'MEDV':Y_pred[:,0]}, columns=['ID','MEDV'])
temp_final_data.columns = ['ID','MEDV']
final_data['Elastic'] = temp_final_data


# Random Forest
# Random forests or random decision forests are an ensemble learning method for classification, regression 
# and other tasks, that operate by constructing a multitude of decision trees at training time and outputting 
# the class that is the mode of the classes (classification) or mean prediction (regression) of the 
# individual trees.[1][2] Random decision forests correct for decision trees habit of overfitting to their training set.
from sklearn.ensemble import RandomForestRegressor 
rf = RandomForestRegressor()
rf.fit(X_train,Y_train)
Y_pred = rf.predict(X_test)

if test_or_submission == 'test':
    mse = sklearn.metrics.mean_squared_error(Y_test, Y_pred)
    final_results['RF'] = np.sqrt(mse)

Y_pred = Y_pred.reshape((-1,1))
temp_final_data = pd.DataFrame({'ID':X_test.index.values, 'MEDV':Y_pred[:,0]}, columns=['ID','MEDV'])
temp_final_data.columns = ['ID','MEDV']
final_data['RF'] = temp_final_data


# ADA Boost
from sklearn.ensemble import AdaBoostRegressor 

ada = AdaBoostRegressor()
ada.fit(X_train, Y_train)
Y_pred = lm.predict(X_test)

if test_or_submission == 'test':
    mse = sklearn.metrics.mean_squared_error(Y_test, Y_pred)
    final_results['ADA'] = np.sqrt(mse)

Y_pred = Y_pred.reshape((-1,1))
temp_final_data = pd.DataFrame({'ID':X_test.index.values, 'MEDV':Y_pred[:,0]}, columns=['ID','MEDV'])
temp_final_data.columns = ['ID','MEDV']
final_data['ADA'] = temp_final_data


# Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor()
gbr.fit(X_train, Y_train)
Y_pred = gbr.predict(X_test)

if test_or_submission == 'test':
    mse = sklearn.metrics.mean_squared_error(Y_test, Y_pred)
    final_results['GB'] = np.sqrt(mse)

Y_pred = Y_pred.reshape((-1,1))
temp_final_data = pd.DataFrame({'ID':X_test.index.values, 'MEDV':Y_pred[:,0]}, columns=['ID','MEDV'])
temp_final_data.columns = ['ID','MEDV']
final_data['GB'] = temp_final_data

for k,v in final_results.items():
    print(k, v)
    
# Create final submission file
final_data['GB'].to_csv('BostonHousingSubmission.csv', header=True, index=False)

# A score of 3.71 using the test submission data would put it in the top 30 on the public Leaderboard

Linear 4.9293108817
Ridge 5.07873737681
Lasso 5.11224887212
Elastic 5.3802950027
RF 3.24989303276
ADA 4.9293108817
GB 2.78285702828


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
