### Benchmark model with 10 best features

In [2]:
import pandas as pd
from helper_functions import *
import importlib
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression

In [6]:
# This time we train on the complete set
train_df = pd.read_csv('data/train.csv')
X = train_df.drop('SalePrice', axis=1)
y = train_df['SalePrice']

# We remove categorical features first
X = X.select_dtypes(exclude=['object'])

# Remove columns with null values
X = X.drop(X.columns[X.isnull().any(axis=0)], axis=1)

# Because of the skewed distribution we take the log of the sale price
y = np.log1p(y)

In [7]:
# The ten best features are selected
sel = SelectKBest(f_regression, k=10)
sel.fit(X, y)

SelectKBest(k=10, score_func=<function f_regression at 0x000001555546FBF8>)

In [8]:
feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': sel.scores_})
feature_scores.sort_values('Score', ascending=False).head(10)

Unnamed: 0,Feature,Score
3,OverallQual,2930.799393
14,GrLivArea,1408.121694
23,GarageCars,1258.349493
24,GarageArea,1071.7338
10,TotalBsmtSF,873.711794
11,1stFlrSF,807.335413
17,FullBath,798.100328
5,YearBuilt,764.779835
6,YearRemodAdd,685.840721
21,TotRmsAbvGrd,582.89432


In [9]:
from sklearn.model_selection import train_test_split

# We make a train and a test split
top_columns = feature_scores.sort_values('Score', ascending=False).head(10)['Feature']
X_train, X_test, y_train, y_test = train_test_split(X[top_columns], y, test_size=.3, random_state=42)

In [10]:
# Now we perform a simple linear regression
reg = LinearRegression(normalize=True)
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [11]:
# Make predictions
y_hat = reg.predict(X_test)

In [16]:
print_benchmark(y_test, y_hat)

R2-score: 0.834722700415
RMSE (log): 0.1674463871816331


In [12]:
# We now train the model on the whole test set 
reg = LinearRegression(normalize=True)
reg.fit(X[top_columns], y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [13]:
# Make prediction on the test set
test_df = pd.read_csv('data/test.csv')
X = test_df[top_columns]

In [15]:
# We handle some null values in the test set
X.loc[X['GarageArea'].isnull(), 'GarageArea'] = 0
X.loc[X['GarageCars'].isnull(), 'GarageCars'] = 0
X.loc[X['TotalBsmtSF'].isnull(), 'TotalBsmtSF'] = np.mean(X['TotalBsmtSF'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [16]:
y_hat = reg.predict(X)

In [344]:
write_submission(test_df, y_hat)

File written to C:\Source\predicting-house-prices\submissions\20180710220107.csv


Kaggle score:  0.16473