In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rcParams
rcParams['figure.figsize'] = 12,10
import seaborn as sb

In [16]:
df = pd.read_csv('F:/Data_Science/Datasets/Ames_cleaned.csv')
df.shape

(2919, 71)

In [17]:
train = df.loc[df['source'] == 'train']
test = df.loc[df['source'] == 'test']
print(train.shape, test.shape)

(1460, 71) (1459, 71)


In [18]:
num_feat = train.dtypes[train.dtypes != 'object'].index
print('Total of numeric features: ', len(num_feat))
cat_feat = train.dtypes[train.dtypes == 'object'].index
print('Total of categorical features: ', len(cat_feat))

Total of numeric features:  30
Total of categorical features:  41


In [19]:
target = 'SalePrice'

In [5]:
#We will select numeric featues based on correlation > 30%
pred_num = ['SalePrice','OverallQual','GrLivArea','GarageCars','GarageArea','TotalBsmtSF','1stFlrSF','TotalFullBath',
           'TotRmsAbvGrd','YearBuilt','YearRemodAdd','MasVnrArea','Fireplaces','BsmtFinSF1','LotFrontage','WoodDeckSF',
           '2ndFlrSF','Porch']
corr = df[pred_num].corr()
sb.heatmap(corr, vmax=1, square=True)

KeyError: "['TotalFullBath'] not in index"

In [20]:
#Feature Engineering
df.columns

Index(['1stFlrSF', '2ndFlrSF', 'BedroomAbvGr', 'BldgType', 'BsmtCond',
       'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFinType1',
       'BsmtFinType2', 'BsmtQual', 'BsmtUnfSF', 'CentralAir', 'Condition1',
       'Condition2', 'Electrical', 'ExterCond', 'ExterQual', 'Exterior1st',
       'Exterior2nd', 'Fence', 'FireplaceQu', 'Fireplaces', 'Foundation',
       'FullBath', 'Functional', 'GarageArea', 'GarageCars', 'GarageCond',
       'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt', 'GrLivArea',
       'HalfBath', 'Heating', 'HeatingQC', 'HouseStyle', 'Id', 'KitchenAbvGr',
       'KitchenQual', 'LandContour', 'LandSlope', 'LotArea', 'LotConfig',
       'LotFrontage', 'LotShape', 'MSSubClass', 'MSZoning', 'MasVnrArea',
       'MasVnrType', 'MoSold', 'Neighborhood', 'OverallCond', 'OverallQual',
       'PavedDrive', 'RoofMatl', 'RoofStyle', 'SaleCondition', 'SalePrice',
       'SaleType', 'Street', 'TotRmsAbvGrd', 'TotalBsmtSF', 'Utilities',
       'WoodDeckSF', 'YearBui

In [22]:
df['TotalSF'] = df['1stFlrSF'] + df['2ndFlrSF'] + df['TotalBsmtSF']

In [23]:
#total rooms
df['TotalRooms'] = df['BedroomAbvGr'] + df['TotRmsAbvGrd']
df.shape

(2919, 73)

In [24]:
# Drop Feature engineered columns
col_fe = ['1stFlrSF','2ndFlrSF','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BedroomAbvGr','TotRmsAbvGrd']
df.drop(col_fe, axis=1, inplace=True)
df.shape

(2919, 65)

In [26]:
train = df.loc[df['source'] == 'train']
test = df.loc[df['source'] == 'test']
print(train.shape, test.shape)

(1460, 65) (1459, 65)


In [27]:
num_feat = train.dtypes[train.dtypes != 'object'].index
print('Total of numeric features: ', len(num_feat))
cat_feat = train.dtypes[train.dtypes == 'object'].index
print('Total of categorical features: ', len(cat_feat))

Total of numeric features:  24
Total of categorical features:  41


In [28]:
target = 'SalePrice'

In [29]:
corr = train.corr()
corr_abs = corr.abs()

nr_num_cols = len(num_feat)

ser_corr = corr_abs.nlargest(nr_num_cols, target)[target]
print(ser_corr)

SalePrice       1.000000
OverallQual     0.790982
TotalSF         0.782260
GrLivArea       0.708624
GarageCars      0.640409
GarageArea      0.623431
FullBath        0.582934
YearBuilt       0.522897
YearRemodAdd    0.507101
MasVnrArea      0.473461
Fireplaces      0.466929
TotalRooms      0.444828
LotFrontage     0.334820
WoodDeckSF      0.324413
Porch           0.296678
LotArea         0.263843
GarageYrBlt     0.261366
HalfBath        0.250628
KitchenAbvGr    0.135907
MSSubClass      0.084284
OverallCond     0.077856
MoSold          0.046432
YrSold          0.028923
Id              0.021917
Name: SalePrice, dtype: float64


In [34]:
# Highest value Frequency percentage in categorical variables 
for i in list(cat_feat):
    pct = df[i].value_counts()[0] / 2919
    print('Highest value Percentage of {}: {:3f}'.format(i, pct))

Highest value Percentage of BldgType: 0.830764
Highest value Percentage of BsmtCond: 0.892771
Highest value Percentage of BsmtExposure: 0.652278
Highest value Percentage of BsmtFinType1: 0.318602
Highest value Percentage of BsmtFinType2: 0.881466
Highest value Percentage of BsmtQual: 0.439534
Highest value Percentage of CentralAir: 0.932854
Highest value Percentage of Condition1: 0.860226
Highest value Percentage of Condition2: 0.989723
Highest value Percentage of Electrical: 0.915382
Highest value Percentage of ExterCond: 0.869476
Highest value Percentage of ExterQual: 0.615964
Highest value Percentage of Exterior1st: 0.351490
Highest value Percentage of Exterior2nd: 0.347722
Highest value Percentage of Fence: 0.804385
Highest value Percentage of FireplaceQu: 0.486468
Highest value Percentage of Foundation: 0.448099
Highest value Percentage of Functional: 0.931483
Highest value Percentage of GarageCond: 0.909215
Highest value Percentage of GarageFinish: 0.421377
Highest value Percenta

In [32]:
#Drop columns based on corr < 30%
col_drop = ['Porch','LotArea','GarageYrBlt','HalfBath','KitchenAbvGr','MSSubClass','OverallCond','MoSold','YrSold']
df.drop(col_drop, axis=1, inplace=True)
df.shape

(2919, 56)

In [66]:
# Drop columns which have frequency of value more than 80% of all values 
col_drop = ['BldgType','BsmtCond','BsmtFinType2','CentralAir','Condition1','Condition2','Electrical','ExterCond',
           'Fence','Functional','GarageCond','GarageQual','Heating','LandContour','LandSlope','PavedDrive','RoofMatl',
           'SaleCondition','SaleType','Street','Utilities']
df.drop(col_drop, axis=1, inplace=True)
df.shape

(2919, 35)

In [67]:
df.columns

Index(['BsmtExposure', 'BsmtFinType1', 'BsmtQual', 'ExterQual', 'Exterior1st',
       'Exterior2nd', 'FireplaceQu', 'Fireplaces', 'Foundation', 'FullBath',
       'GarageArea', 'GarageCars', 'GarageFinish', 'GarageType', 'GrLivArea',
       'HeatingQC', 'HouseStyle', 'Id', 'KitchenQual', 'LotConfig',
       'LotFrontage', 'LotShape', 'MSZoning', 'MasVnrArea', 'MasVnrType',
       'Neighborhood', 'OverallQual', 'RoofStyle', 'SalePrice', 'WoodDeckSF',
       'YearBuilt', 'YearRemodAdd', 'source', 'TotalRooms', 'TotalSF'],
      dtype='object')

In [68]:
col_encod = ['BsmtExposure','BsmtFinType1','BsmtQual','ExterQual','Exterior1st','Exterior2nd','FireplaceQu',
            'Foundation','GarageFinish','GarageType','HeatingQC','HouseStyle','KitchenQual','LotConfig','LotShape','MSZoning',
            'MasVnrType','Neighborhood','RoofStyle','YearBuilt','YearRemodAdd']
df = pd.get_dummies(df, columns=col_encod)
df.shape

(2919, 333)

In [69]:
#Dividing back into test and train dataset
train = df.loc[df['source'] == 'train']
test = df.loc[df['source'] == 'test']
print(train.shape, test.shape)

(1460, 333) (1459, 333)


In [70]:
test.drop(['source'], axis = 1, inplace=True)
train.drop(['source'], axis = 1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [74]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, make_scorer


In [18]:
#split the data to train the model
y = train.SalePrice
X_train,X_test,y_train,y_test = train_test_split(train,y,test_size = 0.3,random_state= 0)

X_train.shape,X_test.shape,y_train.shape,y_test.shape

((1022, 347), (438, 347), (1022,), (438,))

In [75]:
predictors = [x for x in train.columns if x not in ['SalePrice', 'Id']]
target = 'SalePrice'

In [36]:
n_folds = 5
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
scorer = make_scorer(mean_squared_error,greater_is_better = False)
def rmse_CV_train(model):
    kf = KFold(n_folds,shuffle=True,random_state=42)
    rmse = np.sqrt(cross_val_score(model,X,y,scoring ="mean_squared_error",cv=kf, n_jobs=-1))
    return (rmse)
def rmse_CV_test(model):
    kf = KFold(n_folds,shuffle=True,random_state=42)
    rmse = np.sqrt(cross_val_score(model,X_test,y_test,scoring ="mean_squared_error",cv=kf, n_jobs=-1))
    return (rmse)

In [27]:
lr = LinearRegression()
lr.fit(X,y)
test_pre = lr.predict(X_test)
train_pre = lr.predict(X_train)
print('rmse on train is {:.4f}'.format(rmse_CV_train(lr).mean()))
print('rmse on test is {:.4f}'.format(rmse_CV_test(lr).mean()))

rmse on train is 0.0001
rmse on test is 0.0000


In [28]:
ridge = RidgeCV(alphas = [0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60])
ridge.fit(X_train,y_train)
alpha = ridge.alpha_
print('best alpha',alpha)

print("Try again for more precision with alphas centered around " + str(alpha))
ridge = RidgeCV(alphas = [alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, alpha * .85, 
                          alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15,
                          alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4],cv = 5)
ridge.fit(X_train, y_train)
alpha = ridge.alpha_
print("Best alpha :", alpha)
print("Ridge RMSE on Training set :{:.4f}".format(rmse_CV_train(ridge).mean()))
print("Ridge RMSE on Test set :{:.4f}".format(rmse_CV_test(ridge).mean()))
y_train_rdg = ridge.predict(X_train)
y_test_rdg = ridge.predict(X_test)

best alpha 0.01
Try again for more precision with alphas centered around 0.01
Best alpha : 0.006
Ridge RMSE on Training set :0.0000
Ridge RMSE on Test set :0.0000


In [76]:
#Fit algorithm on data
lin_reg = LinearRegression(normalize=True)
lin_reg.fit(train[predictors], train[target])

#predict training set
train_pred = lin_reg.predict(train[predictors])


kf = KFold(n_splits=5, shuffle=True, random_state=1)
cv_results = cross_val_score(lin_reg, train[predictors], train[target], cv=kf, scoring='mean_squared_error', n_jobs=-1)
cv_score = np.sqrt(np.abs(cv_results))

 #Print model report:
print ('\nModel Report')
print ("RMSE : %.4g" % np.sqrt(metrics.mean_squared_error(train[target].values, train_pred)))
print ("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score)))
print("Score: ", lin_reg.score(train[predictors], train[target]))


Model Report
RMSE : 2.659e+04
CV Score : Mean - 4.537e+17 | Std - 4.293e+17 | Min - 6.866e+13 | Max - 1.253e+18
Score:  0.887904054097


In [41]:
test[target] = lin_reg.predict(test[predictors])
test[['Id','SalePrice']].to_csv('C:/Hari Docs/Dataset/Ames_predict_1.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
