In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rcParams
rcParams['figure.figsize'] = 10,8
import seaborn as sb

In [2]:
train = pd.read_csv('F:/Data_Science/Datasets/houseprices_train.csv')
test = pd.read_csv('F:/Data_Science/Datasets/houseprices_test.csv')
print(train.shape, test.shape)

(1460, 82) (1459, 80)


In [3]:
#Adding source
train['source'] = 'train'
test['source'] = 'test'
print(train.shape, test.shape)

(1460, 83) (1459, 81)


In [4]:
df = pd.concat([train, test])
df.shape

(2919, 83)

In [5]:
#Removing unwanted columns
del_col = ['Alley','1stFlrSF','2ndFlrSF','Fireplaces','FireplaceQu','LotFrontage','LandContour','BldgType','HouseStyle'
           ,'RoofStyle','RoofMatl','Exterior1st'
           ,'Exterior2nd','MasVnrType','MasVnrArea','ExterCond','Foundation','BsmtCond','BsmtExposure','BsmtFinType1'
           ,'BsmtFinSF1','BsmtFinType2','BsmtFinSF2','BsmtUnfSF','Heating','CentralAir','Electrical','LowQualFinSF'
           ,'BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','KitchenAbvGr','Functional','GarageType','GarageYrBlt'
           ,'GarageFinish','GarageCars','GarageArea','GarageQual','GarageCond','PavedDrive','WoodDeckSF','OpenPorchSF'
           ,'EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','PoolQC','Fence','MiscFeature','MiscVal','SaleType'
           ,'SaleCondition']
df.drop(del_col, axis=1, inplace=True)
df.shape

(2919, 29)

In [6]:
#All null values of BsmtQual is no Basement
rows = df['BsmtQual'].isnull()
print ('Before: ', df['BsmtQual'].isnull().sum())
df.loc[rows,'BsmtQual'] = 'None'
print ('After: ', df['BsmtQual'].isnull().sum())

Before:  81
After:  0


In [7]:
#TotalBsmtSF = 0 values
zero = (df['BsmtQual'] == 'None')
print ('Before:', df['TotalBsmtSF'].isnull().sum())
df.loc[zero, 'TotalBsmtSF'] = 0
print ('After:', df['TotalBsmtSF'].isnull().sum())

Before: 1
After: 0


In [8]:
#Filling missing values for KitchenQual, MSZoning, Utilities
df['KitchenQual'].fillna(df['KitchenQual'].mode()[0], inplace= True)

df['MSZoning'].fillna(df['MSZoning'].mode()[0], inplace= True)

df['Utilities'].fillna(df['Utilities'].mode()[0], inplace= True)

In [9]:
#Removing few categorical variable because most of them have 90% same value
df.drop(['Condition1', 'Condition2', 'LandSlope', 'Street', 'Utilities'], axis=1, inplace=True)
df.shape

(2919, 24)

In [10]:
rows = df['YearRemodAdd'] > df['YrSold'] 
df.loc[rows,'YrSold'] = 2008
df.loc[rows,['YearRemodAdd','YrSold','YearBuilt']]

Unnamed: 0,YearRemodAdd,YrSold,YearBuilt
523,2008,2008,2007
835,2008,2008,2007
1089,2009,2008,2008


In [18]:
rows = df['YearRemodAdd'] > df['YrSold']
df.loc[rows,'YearRemodAdd'] = 2008
df.loc[rows,['YearRemodAdd','YrSold','YearBuilt']]


Unnamed: 0,YearRemodAdd,YrSold,YearBuilt
1089,2008,2008,2008


In [19]:
df['BlgAge'] = df['YrSold'] - df['YearRemodAdd']

In [20]:
df.to_csv('F:/Data_Science/Datasets/ames_cleaned.csv')

In [21]:
#dropping MoSold and YrSold since we calculated the age
df.drop(['MoSold','YrSold'], axis=1, inplace=True)
df.shape

(2919, 23)

In [22]:
#Integer conversions (Label Encoder)
from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()

In [23]:
cat = df.dtypes[df.dtypes == 'object'].index
cat

Index(['BsmtQual', 'ExterQual', 'HeatingQC', 'KitchenQual', 'LotConfig',
       'LotShape', 'MSZoning', 'Neighborhood', 'source'],
      dtype='object')

In [24]:
#Convert object type variables into integers
obj = ['BsmtQual', 'ExterQual', 'HeatingQC', 'KitchenQual', 'LotConfig',
       'LotShape', 'MSZoning', 'Neighborhood']
for i in obj:
    df[i] = lc.fit_transform(df[i])
    
df.shape

(2919, 23)

In [31]:
df.drop('BlgAge', axis=1, inplace=True)
df.shape

(2919, 22)

In [34]:
#One hot encoding
#df.columns
col_encod = ['BedroomAbvGr', 'BsmtQual', 'ExterQual', 'HeatingQC', 'KitchenQual', 'LotConfig', 'LotShape',
       'MSSubClass', 'MSZoning', 'Neighborhood', 'OverallCond', 'OverallQual',
       'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd']
df = pd.get_dummies(df, columns=col_encod)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 0 to 1458
Columns: 300 entries, BldgAge to YearRemodAdd_2010
dtypes: float64(3), int64(3), object(1), uint8(293)
memory usage: 1017.7+ KB


In [36]:
#Dividing back into test and train dataset
train = df.loc[df['source'] == 'train']
test = df.loc[df['source'] == 'test']
print(train.shape, test.shape)

(1460, 300) (1459, 300)


In [37]:
test.drop(['source'], axis = 1, inplace=True)
train.drop(['source'], axis = 1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [38]:
#Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score

In [43]:
target = 'SalePrice'
predictors = [x for x in train.columns if x not in ['SalePrice', 'Id']]

In [49]:
#Fit algorithm on data
lin_reg = LinearRegression(normalize=True)
lin_reg.fit(train[predictors], train[target])

#predict training set
train_pred = lin_reg.predict(train[predictors])


kf = KFold(n_splits=10, shuffle=True, random_state=1)
cv_results = cross_val_score(lin_reg, train[predictors], train[target], cv=kf, scoring='mean_squared_error', n_jobs=-1)
cv_score = np.sqrt(np.abs(cv_results))

 #Print model report:
print ('\nModel Report')
print ("RMSE : %.4g" % np.sqrt(metrics.mean_squared_error(train[target].values, train_pred)))
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score)))
print("Score: ", lin_reg.score(train[predictors]))


Model Report
RMSE : 2.789e+04
CV Score : Mean - 7.881e+17 | Std - 7.306e+17 | Min - 2.694e+04 | Max - 2.116e+18


TypeError: score() missing 1 required positional argument: 'y'

In [50]:
print("Score: ", lin_reg.score(train[predictors], train[target]))

Score:  0.876650464421


In [64]:
cv_results = cross_val_score(lin_reg, train[predictors], train[target], cv=kf, scoring='r2', n_jobs=-1)
cv_results = cv_results.mean()

In [65]:
'{:0.4}'.format(cv_results)

'-1.796e+26'