In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm, linear_model, datasets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel

In [2]:
# Load the housing dataset from the CSV file (path is relative to the
# notebook's working directory) into the DataFrame used by every cell below.
df = pd.read_csv('data/ParisHousing.csv')

In [3]:
# Show two digits of precision in rendered tables.
pd.options.display.precision = 2

# Summary statistics, one row per feature; the yes/no style columns are
# dropped first so that they do not appear in the summary.
(
    df
    .drop(columns=[
        'hasYard',
        'hasPool',
        'isNewBuilt',
        'hasStormProtector',
        'hasStorageRoom',
        'hasGuestRoom',
    ])
    .describe()
    .transpose()
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
squareMeters,10000.0,49900.0,28800.0,89.0,25100.0,50100.0,74600.0,100000.0
numberOfRooms,10000.0,50.4,28.8,1.0,25.0,50.0,75.0,100.0
floors,10000.0,50.3,28.9,1.0,25.0,50.0,76.0,100.0
cityCode,10000.0,50200.0,29000.0,3.0,24700.0,50700.0,75700.0,100000.0
cityPartRange,10000.0,5.51,2.87,1.0,3.0,5.0,8.0,10.0
numPrevOwners,10000.0,5.52,2.86,1.0,3.0,5.0,8.0,10.0
made,10000.0,2010.0,9.31,1990.0,2000.0,2010.0,2010.0,2020.0
basement,10000.0,5030.0,2880.0,0.0,2560.0,5090.0,7510.0,10000.0
attic,10000.0,5030.0,2890.0,1.0,2510.0,5040.0,7540.0,10000.0
garage,10000.0,553.0,262.0,100.0,328.0,554.0,777.0,1000.0


Classification on cityPartRange (1-10 value representing the desirability of a neighborhood)

In [None]:
# Target is the cityPartRange label; features are all remaining columns.
y = df['cityPartRange']
X = df.drop(columns=['cityPartRange'])

# First split: half of the rows for training, the other half held back ("_tmp").
# Stratifying on y keeps the class proportions the same in both halves.
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y,
    test_size=0.5,
    stratify=y,
    random_state=0,
)

# Second split: the held-back half is divided evenly again, giving
# 25% of all rows for validation and 25% for the final test.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_tmp, y_tmp,
    test_size=0.5,
    stratify=y_tmp,
    random_state=0,
)

In [None]:
def find_best_clf(clfs, params, X_train, X_valid, y_train, y_valid, only_best = True):
  """Grid-search several estimators and rank them by validation score.

  Each estimator in ``clfs`` is paired by position with a parameter grid in
  ``params``, wrapped in a ``GridSearchCV`` (``n_jobs=-1``, ``refit=True``),
  fitted on the training data and then scored on the validation data.

  Parameters
  ----------
  clfs : sequence of estimators
      Candidate estimators; must not be empty.
  params : sequence of dict
      Parameter grids, one per estimator, in the same order as ``clfs``.
  X_train, y_train
      Data every grid search is fitted on.
  X_valid, y_valid
      Data used to score each fitted grid search.
  only_best : bool, default True
      If True, return only the refit best estimator of the top-ranked grid
      search. If False, return the full ranking table.

  Returns
  -------
  estimator or pandas.DataFrame
      With ``only_best=True``: the ``best_estimator_`` of the grid search with
      the highest validation score. Otherwise a DataFrame with the columns
      ``'gridsearch'`` and ``'accuracy'``, sorted by ``'accuracy'`` descending
      (ties keep the input order; the index holds the original positions).

  Raises
  ------
  ValueError
      If ``clfs`` is empty or ``clfs`` and ``params`` differ in length.
  """
  clfs, params = list(clfs), list(params)
  if not clfs:
    raise ValueError('find_best_clf needs at least one estimator in clfs')
  if len(clfs) != len(params):
    raise ValueError(
        f'clfs and params must have the same length, got {len(clfs)} and {len(params)}')

  # one grid search per (estimator, parameter grid) pair
  searches = [GridSearchCV(clf, grid, n_jobs=-1, refit=True)
              for clf, grid in zip(clfs, params)]

  # fitting is a side effect, so use a plain loop rather than DataFrame.apply
  for search in searches:
    search.fit(X_train, y_train)

  # score of each refit best estimator on the validation data
  accuracies = [search.score(X_valid, y_valid) for search in searches]

  # a stable sort keeps the input order between equally scored candidates
  models = pd.DataFrame({'gridsearch': searches, 'accuracy': accuracies})
  models = models.sort_values('accuracy', ascending=False, kind='stable')

  if only_best:
    return models['gridsearch'].iloc[0].best_estimator_
  return models

In [None]:
# Candidate classifiers and their hyper-parameter grids, paired by position.
# In the kNN pipeline the scaler runs BEFORE PCA: PCA picks the directions of
# largest variance, so on unscaled data the columns with the largest numeric
# range would dominate every component.
clf = [#DecisionTreeClassifier(random_state=0),
       Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA()), ('knn', KNeighborsClassifier())])
      ]
params = [
        #{
        #    'max_depth': np.arange(1,20,2), 
        #    'min_samples_split': np.arange(2,20,2),
        #    'min_samples_leaf': np.arange(1,20,2)
        #},
        {
            'pca__n_components': np.arange(1,10),
            'knn__n_neighbors': np.arange(1,20,2),
            # knn__leaf_size is intentionally not searched: it only affects the
            # speed/memory of the neighbour tree, not which neighbours are found,
            # so tuning it multiplies the search cost without changing accuracy.
            'knn__p':[1,2]
        }
         ]
# NOTE: with the default only_best=True, find_best_clf returns the refit best
# estimator (not the GridSearchCV object), so that is what grid_search holds.
grid_search = find_best_clf(clf, params, X_train, X_valid, y_train, y_valid)

In [None]:
# Final check on the held-out test split. grid_search holds the best estimator
# returned by find_best_clf (selected on the validation split), so this score
# comes from data that was used neither for fitting nor for model selection.
grid_search.score(X_test, y_test)

Regression on price

In [4]:
# Regression target is price; every other column is used as a feature.
y = df['price']
X = df.drop(columns=['price'])

# First split: 80% of the rows for training, 20% held back ("_tmp").
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

# Second split: halve the held-back rows -> 10% validation, 10% test.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_tmp, y_tmp,
    test_size=0.5,
    random_state=0,
)

In [7]:
# TODO: consider selecting the most relevant variables instead of fitting on
# all of them.
# NOTE(review): the regression cells below report R^2 scores very close to 1
# together with MSE values in the millions or more. MSE is expressed in squared
# units of the target, so this is presumably just the large scale of price
# rather than a bad fit -- confirm (e.g. compare the MSE with the variance of y).
# X excludes price, so this matrix only shows correlations among the features,
# not between each feature and the target.
X.corr()

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom
squareMeters,1.0,0.00957,-0.00665,-0.00559,0.00111,-0.00154,0.00876,0.0166,-0.00721,-0.0107,0.00748,-0.00396,-0.000588,-0.0172,-0.00349,-0.000623
numberOfRooms,0.00957,1.0,-0.0112,0.017,0.0222,0.00904,0.00834,0.0168,0.00398,-0.00286,-0.00166,-0.014,0.0121,0.0232,-0.00476,-0.0155
hasYard,-0.00665,-0.0112,1.0,0.0155,-0.000883,0.00676,0.00502,0.00428,0.00221,-0.00837,-0.0076,-0.00856,-0.00308,-0.00463,-0.00951,-0.00728
hasPool,-0.00559,0.017,0.0155,1.0,-0.00401,0.00807,0.0146,-0.00685,0.00189,0.000188,-0.001,-0.00727,-0.0119,0.00483,0.00124,0.00112
floors,0.00111,0.0222,-0.000883,-0.00401,1.0,0.00221,-0.00492,0.00246,0.00502,0.00246,-0.00857,0.00623,-0.00027,0.0113,0.00362,-0.0212
cityCode,-0.00154,0.00904,0.00676,0.00807,0.00221,1.0,0.0113,-0.00755,0.00927,-0.000224,-0.00494,0.00265,-0.00202,-0.00221,0.00255,-0.00334
cityPartRange,0.00876,0.00834,0.00502,0.0146,-0.00492,0.0113,1.0,0.00924,0.00775,-0.00187,0.00522,0.00474,0.0107,-0.00165,-0.0113,-0.00715
numPrevOwners,0.0166,0.0168,0.00428,-0.00685,0.00246,-0.00755,0.00924,1.0,0.00686,-0.0174,0.00252,-0.000862,0.000719,0.0203,0.0317,-0.00608
made,-0.00721,0.00398,0.00221,0.00189,0.00502,0.00927,0.00775,0.00686,1.0,-0.00168,-0.000645,-0.00551,0.0138,0.00569,-0.00787,-0.00543
isNewBuilt,-0.0107,-0.00286,-0.00837,0.000188,0.00246,-0.000224,-0.00187,-0.0174,-0.00168,1.0,0.0032,-0.016,0.0201,0.00275,0.00701,0.0199


In [8]:
# Ordinary least squares: fit a plain linear model on the training split.
model = linear_model.LinearRegression().fit(X_train, y_train)

# Score (R^2 for regressors) on the training and test splits.
score_train, score_test = model.score(X_train, y_train), model.score(X_test, y_test)

# Mean squared error = (1/n) * sum of squared residuals, per split.
sq_err_train = (y_train - model.predict(X_train))**2
sq_err_test = (y_test - model.predict(X_test))**2
MSE_train = (1/y_train.size) * np.sum(sq_err_train)
MSE_test = (1/y_test.size) * np.sum(sq_err_test)

print(f'training score = {score_train}\ntesting score = {score_test}')
print(f'MSE train = {MSE_train}\nMSE test = {MSE_test}')

training score = 0.9999995647424506
testing score = 0.9999995640450483
MSE train = 3628693.3065122473
MSE test = 3579048.2974175224


In [9]:
# k-nearest-neighbours regression: 5 neighbours, Euclidean distance,
# fitted on the (unscaled) training features.
model = KNeighborsRegressor(metric='euclidean', n_neighbors=5).fit(X_train, y_train)

# Score (R^2 for regressors) on the training and test splits.
score_train, score_test = model.score(X_train, y_train), model.score(X_test, y_test)

# Mean squared error = (1/n) * sum of squared residuals, per split.
sq_err_train = (y_train - model.predict(X_train))**2
sq_err_test = (y_test - model.predict(X_test))**2
MSE_train = (1/y_train.size) * np.sum(sq_err_train)
MSE_test = (1/y_test.size) * np.sum(sq_err_test)

print(f'training score = {score_train}\ntesting score = {score_test}')
print(f'MSE train = {MSE_train}\nMSE test = {MSE_test}')

training score = 0.9995415967132242
testing score = 0.9993693826862334
MSE train = 3821656719.1349316
MSE test = 5177162948.909237


In [15]:
# Lasso (L1-penalised) linear regression; the iteration cap is raised to 2000.
model = linear_model.Lasso(max_iter=2000).fit(X_train, y_train)

# Score (R^2 for regressors) on the training and test splits.
score_train, score_test = model.score(X_train, y_train), model.score(X_test, y_test)

# Mean squared error = (1/n) * sum of squared residuals, per split.
sq_err_train = (y_train - model.predict(X_train))**2
sq_err_test = (y_test - model.predict(X_test))**2
MSE_train = (1/y_train.size) * np.sum(sq_err_train)
MSE_test = (1/y_test.size) * np.sum(sq_err_test)

print(f'training score = {score_train}\ntesting score = {score_test}')
print(f'MSE train = {MSE_train}\nMSE test = {MSE_test}')
# One fitted coefficient per feature column, in the column order of X_train.
print(f'coefficients = {model.coef_}')

training score = 0.9999995647400114
testing score = 0.9999995640294094
MSE train = 3628713.6412790245
MSE test = 3579176.6880394295
coefficients = [ 1.00000164e+02 -5.70522368e-02  3.00934289e+03  2.98187141e+03
  5.43680462e+01 -8.43634974e-04  5.33938599e+01 -2.97155614e+00
 -1.11867029e+00  1.08817727e+02  1.24877209e+02 -1.75901789e-04
 -8.08140944e-03  1.14279869e-01  5.90596549e+00 -9.14783933e+00]


In [14]:
# Ridge (L2-penalised) linear regression with the library's default settings.
model = linear_model.Ridge().fit(X_train, y_train)

# Score (R^2 for regressors) on the training and test splits.
score_train, score_test = model.score(X_train, y_train), model.score(X_test, y_test)

# Mean squared error = (1/n) * sum of squared residuals, per split.
sq_err_train = (y_train - model.predict(X_train))**2
sq_err_test = (y_test - model.predict(X_test))**2
MSE_train = (1/y_train.size) * np.sum(sq_err_train)
MSE_test = (1/y_test.size) * np.sum(sq_err_test)

print(f'training score = {score_train}\ntesting score = {score_test}')
print(f'MSE train = {MSE_train}\nMSE test = {MSE_test}')
# One fitted coefficient per feature column, in the column order of X_train.
print(f'coefficients = {model.coef_}')

training score = 0.9999995647423179
testing score = 0.9999995640571346
MSE train = 3628694.4123696517
MSE test = 3578949.073229415
coefficients = [ 1.00000165e+02 -5.74626438e-02  3.01184252e+03  2.98427544e+03
  5.43697416e+01 -8.44035948e-04  5.35120030e+01 -3.11392225e+00
 -1.13161782e+00  1.12765900e+02  1.28809549e+02 -1.50176586e-04
 -8.09188207e-03  1.14080012e-01  9.99553728e+00 -9.26475732e+00]


In [None]:
#kernel regression