In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm, linear_model, datasets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel

In [2]:
# Read the housing data into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv('data/ParisHousing.csv')

In [3]:
# Limit the floating-point precision used when frames are displayed.
pd.set_option('display.precision', 2)

# Summary statistics with one row per remaining column; the columns the
# author treats as binary flags are left out of the summary.
(
    df.drop(columns=['hasYard', 'hasPool', 'isNewBuilt', 'hasStormProtector',
                     'hasStorageRoom', 'hasGuestRoom'])
      .describe()
      .transpose()
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
squareMeters,10000.0,49900.0,28800.0,89.0,25100.0,50100.0,74600.0,100000.0
numberOfRooms,10000.0,50.4,28.8,1.0,25.0,50.0,75.0,100.0
floors,10000.0,50.3,28.9,1.0,25.0,50.0,76.0,100.0
cityCode,10000.0,50200.0,29000.0,3.0,24700.0,50700.0,75700.0,100000.0
cityPartRange,10000.0,5.51,2.87,1.0,3.0,5.0,8.0,10.0
numPrevOwners,10000.0,5.52,2.86,1.0,3.0,5.0,8.0,10.0
made,10000.0,2010.0,9.31,1990.0,2000.0,2010.0,2010.0,2020.0
basement,10000.0,5030.0,2880.0,0.0,2560.0,5090.0,7510.0,10000.0
attic,10000.0,5030.0,2890.0,1.0,2510.0,5040.0,7540.0,10000.0
garage,10000.0,553.0,262.0,100.0,328.0,554.0,777.0,1000.0


Classification on cityPartRange (1-10 value representing the desirability of a neighborhood)

In [4]:
# Target is cityPartRange; every other column is used as a feature.
y = df['cityPartRange']
X = df.drop(columns=['cityPartRange'])

# Stage 1: keep half of the rows for training and park the other half in
# "_tmp", stratifying on the target.
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, test_size=.5, random_state=0, stratify=y)

# Stage 2: cut "_tmp" in half again -> 25% validation and 25% test overall.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_tmp, y_tmp, test_size=.5, random_state=0, stratify=y_tmp)

In [5]:
def find_best_clf(clfs, params, X_train, X_valid, y_train, y_valid, only_best = True):
  """Grid-search every candidate estimator and rank them on the validation split.

  Each estimator in ``clfs`` is wrapped in a ``GridSearchCV`` together with the
  parameter grid at the same position in ``params``, fitted on the training
  data (refitting the best configuration), and then scored on the validation
  data with ``GridSearchCV.score``.

  Parameters
  ----------
  clfs : sequence of estimators
      Candidate models (plain estimators or pipelines).
  params : sequence of dict
      One parameter grid per entry of ``clfs``, in the same order.
  X_train, y_train
      Data used to fit each grid search.
  X_valid, y_valid
      Held-out data used to compare the tuned models.
  only_best : bool, default True
      If True, return only the refitted best estimator of the top-ranked
      search. If False, return the whole ranking.

  Returns
  -------
  estimator or pandas.DataFrame
      The best estimator when ``only_best`` is True; otherwise a DataFrame
      with columns ``gridsearch`` and ``accuracy`` sorted by descending
      validation score. Its index is the position of each model in ``clfs``.

  Raises
  ------
  ValueError
      If ``clfs`` is empty or ``clfs`` and ``params`` differ in length.
  """
  if len(clfs) != len(params):
    raise ValueError('clfs and params must have the same length')
  if len(clfs) == 0:
    raise ValueError('at least one classifier is required')

  # Pair every estimator with its own grid; n_jobs=-1 uses all available cores.
  searches = [GridSearchCV(clf, grid, n_jobs=-1, refit=True)
              for clf, grid in zip(clfs, params)]

  # Fit with an explicit loop: fitting is a side effect, not a row-wise
  # transformation, so DataFrame.apply is the wrong tool for it.
  for search in searches:
    search.fit(X_train, y_train)

  # Validation score of each refitted best estimator.
  accuracies = [search.score(X_valid, y_valid) for search in searches]

  # Rank without mutating in place so the function stays easy to reason about.
  models = pd.DataFrame({'gridsearch': searches, 'accuracy': accuracies})
  models = models.sort_values('accuracy', ascending=False)

  if only_best:
    return models['gridsearch'].iloc[0].best_estimator_
  return models

In [None]:
# Candidate classifiers for cityPartRange.
# In both pipelines the features are standardized BEFORE PCA: PCA picks the
# directions of largest variance, so on unscaled data the components would be
# dominated by whichever raw columns have the largest numeric range.
clfs = [DecisionTreeClassifier(random_state=0),
        Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA()),
                        ('knn', KNeighborsClassifier())]),
        Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA()),
                        ('reg', linear_model.LogisticRegression(max_iter=1000))])
       ]

# One parameter grid per classifier, in the same order as `clfs`.
params = [
        {
            'max_depth': np.arange(1,20,2),
            'min_samples_split': np.arange(2,20,2),
            'min_samples_leaf': np.arange(1,20,2)
        },
        {
            # leaf_size is intentionally not searched: it only tunes the speed
            # and memory of the neighbour-search tree, not which neighbours are
            # found, so searching it made the grid 10x larger for no gain.
            'pca__n_components': np.arange(1,10),
            'knn__n_neighbors': np.arange(1,20,2),
            'knn__p':[1,2]
        },
        {
            'pca__n_components': np.arange(1,10),
            'reg__C': [.01, 1, 100, 1000],
            'reg__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
        }
         ]

# only_best=False -> keep the full ranking of grid searches, not just the winner.
grid_search = find_best_clf(clfs, params, X_train, X_valid, y_train, y_valid, False)

In [None]:
# Display the result of the find_best_clf(..., False) call: the fitted grid
# searches with their validation accuracy, best first.
grid_search

Regression on price

In [None]:
# Regression target is price. All non-price columns (df.drop(columns='price'))
# were used originally; the predictor set was then narrowed to the variable(s)
# below ("from r" -- presumably chosen in a separate R analysis).
y = df['price']
X = df[['squareMeters']]

# Stage 1: 80% of the rows for training, 20% parked in "_tmp".
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, test_size=.2, random_state=0)

# Stage 2: split "_tmp" evenly -> 10% validation and 10% test overall.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_tmp, y_tmp, test_size=.5, random_state=0)

In [None]:
# Notes from experimentation: using squareMeters alone or squareMeters plus
# other significant variables gives a very large MSE and practically the same
# score either way. Plotting price against squareMeters shows the relationship
# is essentially a straight line.

# Only the last expression of a cell is rendered automatically, so the two
# diagnostics are displayed explicitly instead of being silently discarded.
display(X.corr())            # correlation between the selected feature(s)
display(df.isnull().sum())   # number of missing values per column

fig, ax = plt.subplots()
ax.scatter(X, y)
ax.set_xlabel(X.columns[0])
ax.set_ylabel(y.name)
ax.set_title(f'{y.name} vs {X.columns[0]}')
plt.show()

# The large MSE is probably a consequence of the magnitude of the price values
# (MSE is expressed in squared price units); the near-linear relationship seen
# above also explains the extremely high scores.

In [None]:
# Ordinary least squares: fit on the training split.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

# Estimator score on the training split and on the held-out test split.
score_train, score_test = (model.score(X_train, y_train),
                           model.score(X_test, y_test))

# Mean squared error by hand: (1/n) * sum of squared residuals.
resid_train = y_train - model.predict(X_train)
resid_test = y_test - model.predict(X_test)
MSE_train = (1/y_train.size) * np.sum(resid_train**2)
MSE_test = (1/y_test.size) * np.sum(resid_test**2)

print(f'training score = {score_train}\ntesting score = {score_test}')
print(f'MSE train = {MSE_train}\nMSE test = {MSE_test}')

In [None]:
# k-nearest-neighbours regression: 5 neighbours, Euclidean distance.
model = KNeighborsRegressor(n_neighbors=5, metric='euclidean')
model.fit(X_train, y_train)

# Estimator score on the training split and on the held-out test split.
score_train, score_test = (model.score(X_train, y_train),
                           model.score(X_test, y_test))

# Mean squared error by hand: (1/n) * sum of squared residuals.
resid_train = y_train - model.predict(X_train)
resid_test = y_test - model.predict(X_test)
MSE_train = (1/y_train.size) * np.sum(resid_train**2)
MSE_test = (1/y_test.size) * np.sum(resid_test**2)

print(f'training score = {score_train}\ntesting score = {score_test}')
print(f'MSE train = {MSE_train}\nMSE test = {MSE_test}')

In [None]:
# Lasso (L1-penalised) regression with a raised iteration cap.
model = linear_model.Lasso(max_iter=2000)
model.fit(X_train, y_train)

# Estimator score on the training split and on the held-out test split.
score_train, score_test = (model.score(X_train, y_train),
                           model.score(X_test, y_test))

# Mean squared error by hand: (1/n) * sum of squared residuals.
resid_train = y_train - model.predict(X_train)
resid_test = y_test - model.predict(X_test)
MSE_train = (1/y_train.size) * np.sum(resid_train**2)
MSE_test = (1/y_test.size) * np.sum(resid_test**2)

print(f'training score = {score_train}\ntesting score = {score_test}')
print(f'MSE train = {MSE_train}\nMSE test = {MSE_test}')
# Fitted coefficient(s), one per feature column.
print(f'coefficients = {model.coef_}')

In [None]:
# Ridge (L2-penalised) regression with penalty strength alpha=0.1.
model = linear_model.Ridge(alpha=0.1)
model.fit(X_train, y_train)

# Estimator score on the training split and on the held-out test split.
score_train, score_test = (model.score(X_train, y_train),
                           model.score(X_test, y_test))

# Mean squared error by hand: (1/n) * sum of squared residuals.
resid_train = y_train - model.predict(X_train)
resid_test = y_test - model.predict(X_test)
MSE_train = (1/y_train.size) * np.sum(resid_train**2)
MSE_test = (1/y_test.size) * np.sum(resid_test**2)

print(f'training score = {score_train}\ntesting score = {score_test}')
print(f'MSE train = {MSE_train}\nMSE test = {MSE_test}')
# Fitted coefficient(s), one per feature column.
print(f'coefficients = {model.coef_}')

In [None]:
#kernel regression