In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LassoLars
from sklearn.metrics import r2_score, mean_squared_log_error

In [2]:
train_data = pd.read_csv('../data/train.csv', delimiter=',').drop(['Id'], axis=1)

In [3]:
def split_target_numerical_and_categorical_columns(df, target_name):
    target_column = train_data[target_name]
    numerical_columns = train_data.drop(columns=[target_name]).select_dtypes(include=[np.number])
    categorical_columns = train_data.drop(columns=[target_name]).select_dtypes(exclude=[np.number])
    return target_column, numerical_columns, categorical_columns

In [4]:
def one_hot_encode_categorical_data(categorical_data):
    return pd.get_dummies(categorical_data)

In [5]:
def filter_columns_corr_more_than_x(x, df, target_series, target_name):
    df_and_target = pd.concat([df, train_data[target_name]], axis=1)
    df_and_target_corr = df_and_target.corr().abs().drop(columns=[target_name])
    return df.loc[:, (df_and_target_corr.loc[target_name,:]>=x)]

In [6]:
target_column, numerical_columns, categorical_columns = split_target_numerical_and_categorical_columns(train_data, 'SalePrice')
print(target_column.shape)
print(numerical_columns.shape)
print(categorical_columns.shape)

(1460,)
(1460, 36)
(1460, 43)


In [7]:
non_number_data_description = categorical_columns.describe()
non_number_data_description

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
count,1460,1460,91,1460,1460,1460,1460,1460,1460,1460,...,1379,1379,1379,1379,1460,7,281,54,1460,1460
unique,5,2,2,4,4,2,5,3,25,9,...,6,3,5,5,3,3,4,4,9,6
top,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,Attchd,Unf,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal
freq,1151,1454,50,925,1311,1459,1052,1382,225,1260,...,870,605,1311,1326,1340,3,157,49,1267,1198


In [8]:
one_hot_categorical_columns = one_hot_encode_categorical_data(categorical_columns)
one_hot_categorical_columns.sum(axis=0).head()

MSZoning_C (all)      10
MSZoning_FV           65
MSZoning_RH           16
MSZoning_RL         1151
MSZoning_RM          218
dtype: int64

In [57]:
scaler_cat = MinMaxScaler()
scaled_cat_train_data = scaler_cat.fit_transform(one_hot_categorical_columns.fillna(0))
scaled_cat_train_data = pd.DataFrame(scaled_cat_train_data, columns=one_hot_categorical_columns.columns)

scaler_num = MinMaxScaler()
scaled_num_train_data = scaler_num.fit_transform(numerical_columns.fillna(0))
scaled_num_train_data = pd.DataFrame(scaled_num_train_data, columns=numerical_columns.columns)

In [58]:
train_cat_data = filter_columns_corr_more_than_x(0.1, scaled_cat_train_data, train_data['SalePrice'], 'SalePrice')
num_cat_columns = train_cat_data.columns
print(train_cat_data.shape)

train_num_data = filter_columns_corr_more_than_x(0.1, scaled_num_train_data, train_data['SalePrice'], 'SalePrice')
num_data_columns = train_num_data.columns
print(train_num_data.shape)

processed_train_data = pd.concat([train_cat_data, train_num_data], axis=1).fillna(0)
print(processed_train_data.shape)
print(type(processed_train_data))

(1460, 107)
(1460, 26)
(1460, 133)
<class 'pandas.core.frame.DataFrame'>


## Feature Selection

In [41]:
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

In [88]:
def chi2_feature_selection(K, X, y):
    test = SelectKBest(score_func=chi2, k=K)
    fit = test.fit(X, y)
    np.set_printoptions(precision=3)
    score = list(zip(fit.scores_, X.columns.values))
    sorted_score = sorted(score, reverse=True)[:K]
    chosen_columns = [x[1] for x in a]
    return sorted_score, X.loc[:, chosen_columns]

In [89]:
def RFE_feature_selection(K, X, y):
    model = LogisticRegression()
    rfe = RFE(model, K)
    fit = rfe.fit(X, y)
    return X.loc[:, fit.support_].columns, X.loc[:, fit.support_]

In [90]:
def PCA_feature_selection(K, X):
    pca = PCA(n_components=K)
    fit = pca.fit(X)
    return fit.explained_variance_ratio_, fit.components_

In [91]:
def tree_feature_importance(K, X, y):
    model = ExtraTreesClassifier()
    model.fit(X, y)
    score = list(zip(model.feature_importances_, X.columns.values))
    sorted_score = sorted(score, reverse=True)[:K]
    chosen_columns = [x[1] for x in a]
    return sorted_score, X.loc[:, chosen_columns]

In [95]:
data_for_feature_extraction = pd.concat([one_hot_categorical_columns, numerical_columns], axis=1).fillna(0)

_, chi2_features = chi2_feature_selection(3, processed_train_data, train_data['SalePrice'])
#RFE_feature_selection(5, processed_train_data, train_data['SalePrice'])
_, PCA_features = PCA_feature_selection(3, processed_train_data)
_, tree_features = tree_feature_importance(3, processed_train_data, train_data['SalePrice'])

In [96]:
print(chi2_features.shape)
print(PCA_features.shape)
print(tree_features.shape)
print(PCA_features)

(1460, 5)
(3, 133)
(1460, 5)
[[-5.477e-03  5.643e-02 -7.433e-02 -2.421e-02  8.027e-02  1.029e-02
  -9.324e-02 -2.228e-02  1.016e-02  1.827e-02 -7.499e-03 -2.618e-02
  -2.746e-02 -1.905e-02 -4.433e-03 -7.257e-02  2.448e-02  4.656e-02
  -5.145e-02 -2.332e-02  4.222e-02  1.264e-02  1.395e-02 -2.224e-02
  -2.155e-02  4.025e-02 -1.504e-03 -1.905e-02 -6.275e-02  8.813e-02
  -8.284e-03  1.477e-02  1.554e-04  4.875e-04 -9.485e-03  9.832e-03
  -6.427e-02  2.099e-01 -7.145e-02  1.029e-02 -6.167e-02  2.076e-01
  -7.272e-02  7.485e-02 -1.383e-01  6.374e-02  2.447e-02 -5.385e-03
   2.417e-01 -2.608e-01 -1.260e-02  3.800e-02 -6.597e-02 -1.902e-01
   2.655e-01 -8.033e-03  6.555e-02 -1.482e-02  2.022e-01 -2.402e-01
  -1.988e-02  2.057e-02  6.065e-02  3.242e-02 -8.540e-02 -4.060e-02
  -4.556e-02  1.666e-01 -4.956e-02  4.618e-02  2.175e-01 -1.993e-02
  -4.221e-02 -1.547e-01 -4.830e-02  4.830e-02 -4.502e-02 -1.318e-02
   5.999e-02  4.299e-02 -1.547e-02  2.124e-01 -2.399e-01  2.889e-02
   1.097e-02  7.094

['1stFlrSF', 'LotArea', 'GrLivArea', 'BsmtUnfSF', 'TotalBsmtSF']

## Split data to train and validation

In [None]:
split_on_row = int(0.8*processed_train_data.shape[0])
X_train = processed_train_data.iloc[:split_on_row, :]
y_train = train_data.iloc[:split_on_row, :]['SalePrice']
X_validation = processed_train_data.iloc[split_on_row:, :]
y_validation = train_data.iloc[split_on_row:, :]['SalePrice']

## Train model

In [None]:
models = {'Ridge Regression': Ridge(alpha=0.1, fit_intercept=True, normalize=True), 
            'Lasso': Lasso(alpha=2), 
            'Elastic Net': ElasticNet(alpha=0.1),
            'LARS Lasso': LassoLars(alpha=3),
            'Bagging Regressor': BaggingRegressor()
         }

In [None]:
def train_model(model, X_train, y_train):
    model.fit(X_train, y_train)
    return model

def predict_with_model(model, X_validation):
    return model.predict(X_validation)

def calc_r2_score(y_validation, prediction):
    return round(r2_score(y_validation, prediction),4)

def calc_RMSE_score(y_validation, prediction):
    RMSE_score = np.sqrt(mean_squared_log_error(y_validation, prediction, sample_weight=None, multioutput='uniform_average'))
    return round(RMSE_score,4)

def train_predict_get_score(model, X_train, y_train, X_validation, y_validation):
    model = train_model(model, X_train, y_train)
    prediction = pd.Series(predict_with_model(model, X_validation))
    model_r2 = calc_r2_score(y_validation, prediction)
    return model_r2, calc_RMSE_score(prediction, y_validation)

In [None]:
def create_score_for_models(models, X_train, y_train, X_validation, y_validation):
    score_of_models = {}
    for name, model in models.items():
        models_r2_score, models_RMSE_score = train_predict_get_score(model, X_train, y_train, X_validation, y_validation)
        score_of_models[name] = {'r2 score': models_r2_score, 'RMSE score': models_RMSE_score}
    return score_of_models

In [None]:
score_train_predict_data_equal = create_score_for_models(models, X_train, y_train, X_train, y_train)
print(score_train_predict_data_equal)
score_on_validation_data = create_score_for_models(models, X_train, y_train, X_validation, y_validation)
print(score_on_validation_data)

In [None]:
def run_knn(n_neighbors, X_train, y_train, X_validation, y_validation):
    score_on_train = {}
    score_on_validation = {}

    for i, weights in enumerate(['uniform', 'distance']):
        nn_model = KNeighborsRegressor(n_neighbors, weights = weights, p=1)
        nn_model.fit(X_train, y_train)
        train_prediction = nn_model.predict(X_train)
        score_on_train[weights] = r2_score(y_train, train_prediction)

        nn_prediction = nn_model.predict(X_validation)
        score_on_validation[weights] = np.sqrt(mean_squared_log_error(y_validation, nn_prediction, sample_weight=None, multioutput='uniform_average'))#r2_score(y_validation, nn_prediction)
    return score_on_validation, nn_prediction

In [None]:
knn_r2 = pd.DataFrame(columns = ['no_of_neighbours', 'r2'])
for n_neighbors in range(1,21):
    score, prediction = run_knn(n_neighbors, X_train, y_train, X_validation, y_validation)
    result = pd.DataFrame([[n_neighbors, score['distance']]],
                    columns = ['no_of_neighbours', 'r2'])
    knn_r2 = knn_r2.append(result)

f, ax = plt.subplots(figsize=(10, 8))
ax.plot(knn_r2['no_of_neighbours'], knn_r2['r2'], '-bo')
ax.grid(True)

In [None]:
n_neighbors = 14
nn_regression_r2, nn_regression_prediction = run_knn(n_neighbors, X_train, y_train, X_validation, y_validation)
print(np.sqrt(mean_squared_log_error(y_validation, nn_regression_prediction, sample_weight=None, multioutput='uniform_average')))