In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [4]:
ames = pd.read_csv('train.csv')
ames_test = pd.read_csv('test.csv')

In [5]:
g = ames.columns.to_series().groupby(ames.dtypes).groups
d = {k.name: v for k, v in g.items()}
d

{'float64': Index(['Lot Frontage', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2',
        'Bsmt Unf SF', 'Total Bsmt SF', 'Bsmt Full Bath', 'Bsmt Half Bath',
        'Garage Yr Blt', 'Garage Cars', 'Garage Area'],
       dtype='object'),
 'int64': Index(['Id', 'PID', 'MS SubClass', 'Lot Area', 'Overall Qual', 'Overall Cond',
        'Year Built', 'Year Remod/Add', '1st Flr SF', '2nd Flr SF',
        'Low Qual Fin SF', 'Gr Liv Area', 'Full Bath', 'Half Bath',
        'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces',
        'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch',
        'Screen Porch', 'Pool Area', 'Misc Val', 'Mo Sold', 'Yr Sold',
        'SalePrice'],
       dtype='object'),
 'object': Index(['MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour',
        'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
        'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
        'Exterior 1st', 'Exterior 2n

In [6]:
ames_num = ames[['Lot Frontage', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2',
        'Bsmt Unf SF', 'Total Bsmt SF', 'Bsmt Full Bath', 'Bsmt Half Bath',
        'Garage Yr Blt', 'Garage Cars', 'Garage Area', 'Id', 'PID', 'MS SubClass', 'Lot Area', 'Overall Qual', 'Overall Cond',
        'Year Built', 'Year Remod/Add', '1st Flr SF', '2nd Flr SF',
        'Low Qual Fin SF', 'Gr Liv Area', 'Full Bath', 'Half Bath',
        'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces',
        'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch',
        'Screen Porch', 'Pool Area', 'Misc Val', 'Mo Sold', 'Yr Sold' ]]

ames_cat = ames[['MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour',
        'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
        'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
        'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual',
        'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
        'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC',
        'Central Air', 'Electrical', 'Kitchen Qual', 'Functional',
        'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual',
        'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature',
        'Sale Type', 'Sale Condition']]

In [7]:
ames_cat['Sale Condition'].value_counts()

Normal     1696
Partial     164
Abnorml     132
Family       29
Alloca       19
AdjLand      11
Name: Sale Condition, dtype: int64

In [8]:
# 1=abnormal and 0=not abnormal

In [9]:
ames_cat['Sale Condition'] = [1 if x=='Abnorml' else 0 for x in ames_cat['Sale Condition']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [10]:
ames_cat['Sale Condition'].value_counts()

0    1919
1     132
Name: Sale Condition, dtype: int64

In [11]:
X = ames_num
y = ames_cat['Sale Condition']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1538, 38) (513, 38) (1538,) (513,)


In [13]:
X_train.isnull().sum()[X_train.isnull().sum()>0]

Lot Frontage      253
Mas Vnr Area       16
BsmtFin SF 1        1
BsmtFin SF 2        1
Bsmt Unf SF         1
Total Bsmt SF       1
Bsmt Full Bath      2
Bsmt Half Bath      2
Garage Yr Blt      83
Garage Cars         1
Garage Area         1
dtype: int64

In [14]:
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy='median')

X_train = imputer.fit_transform(X_train)

In [15]:
X_train

array([[   50.,     0.,   452., ...,     0.,     4.,  2010.],
       [   80.,   251.,  1271., ...,     0.,     4.,  2008.],
       [   94.,     0.,     0., ...,     0.,     4.,  2010.],
       ..., 
       [   90.,     0.,   315., ...,     0.,    10.,  2009.],
       [   68.,   450.,   194., ...,     0.,     7.,  2006.],
       [  104.,   860.,     0., ...,     0.,     7.,  2009.]])

In [16]:
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)

X_train

array([[-0.90702082, -0.57681092, -0.01443692, ..., -0.09403024,
        -0.78982578,  1.68453116],
       [ 0.48732651,  0.86220815,  1.74359278, ..., -0.09403024,
        -0.78982578,  0.16097402],
       [ 1.13802192, -0.57681092, -0.98468041, ..., -0.09403024,
        -0.78982578,  1.68453116],
       ..., 
       [ 0.95210895, -0.57681092, -0.30851514, ..., -0.09403024,
         1.37165473,  0.92275259],
       [-0.07041242,  2.00310375, -0.56824847, ..., -0.09403024,
         0.29091448, -1.36258313],
       [ 1.60280436,  4.35369267, -0.98468041, ..., -0.09403024,
         0.29091448,  0.92275259]])

In [17]:
X_train = pd.DataFrame(X_train, columns=X.columns)

In [18]:
from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression, ElasticNet
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.feature_selection import RFE, SelectKBest, f_classif, f_regression

In [19]:
logreg = LogisticRegression(C=1000000)
logreg.fit(X_train, y_train)

LogisticRegression(C=1000000, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [20]:
logreg.coef_.shape

(1, 38)

In [21]:
sx_list = []
for i, col in enumerate(X_train):
    if logreg.coef_[0,i]!=0:
        sx_list.append(col)
        print(col, logreg.coef_[0,i])

Lot Frontage -0.0245796150147
Mas Vnr Area 0.191236447639
BsmtFin SF 1 -22.3177071146
BsmtFin SF 2 -7.99418714547
Bsmt Unf SF -21.1697937594
Total Bsmt SF 21.5630845364
Bsmt Full Bath -0.167222849294
Bsmt Half Bath -0.0252668801314
Garage Yr Blt 0.130219449538
Garage Cars -0.30647909692
Garage Area 0.179427414478
Id 0.686674561002
PID -0.186661787499
MS SubClass 0.224667667812
Lot Area 0.0179549695281
Overall Qual -0.320275286094
Overall Cond -0.312858154847
Year Built -0.211098520125
Year Remod/Add -0.135694080267
1st Flr SF 0.239657880114
2nd Flr SF 0.0483852220085
Low Qual Fin SF 0.0716603390259
Gr Liv Area 0.235564496817
Full Bath -0.477142382621
Half Bath -0.265042814258
Bedroom AbvGr -0.0648022538289
Kitchen AbvGr -0.0458807224927
TotRms AbvGrd 0.00445772890154
Fireplaces -0.193948503144
Wood Deck SF -0.240730953267
Open Porch SF 0.161499567486
Enclosed Porch 0.0152277541942
3Ssn Porch 0.0214984778923
Screen Porch -0.106632347536
Pool Area -1.26340940483
Misc Val -0.624125731394


In [22]:
sx_list

['Lot Frontage',
 'Mas Vnr Area',
 'BsmtFin SF 1',
 'BsmtFin SF 2',
 'Bsmt Unf SF',
 'Total Bsmt SF',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Garage Yr Blt',
 'Garage Cars',
 'Garage Area',
 'Id',
 'PID',
 'MS SubClass',
 'Lot Area',
 'Overall Qual',
 'Overall Cond',
 'Year Built',
 'Year Remod/Add',
 '1st Flr SF',
 '2nd Flr SF',
 'Low Qual Fin SF',
 'Gr Liv Area',
 'Full Bath',
 'Half Bath',
 'Bedroom AbvGr',
 'Kitchen AbvGr',
 'TotRms AbvGrd',
 'Fireplaces',
 'Wood Deck SF',
 'Open Porch SF',
 'Enclosed Porch',
 '3Ssn Porch',
 'Screen Porch',
 'Pool Area',
 'Misc Val',
 'Mo Sold',
 'Yr Sold']

In [23]:
selected_X= X_train[sx_list]

In [24]:
selected_X.head()

Unnamed: 0,Lot Frontage,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Bsmt Full Bath,Bsmt Half Bath,Garage Yr Blt,Garage Cars,...,Fireplaces,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold
0,-0.907021,-0.576811,-0.014437,-0.216069,-0.943632,-1.023438,1.073722,-0.25419,-1.548494,-1.016855,...,-0.933304,-0.711576,-0.718603,1.803317,-0.106626,-0.286408,-0.065889,-0.09403,-0.789826,1.684531
1,0.487327,0.862208,1.743593,-0.287442,-0.518582,1.196261,1.073722,-0.25419,0.6367,0.283164,...,0.618503,2.458116,-0.125376,-0.388782,-0.106626,-0.286408,-0.065889,-0.09403,-0.789826,0.160974
2,1.138022,-0.576811,-0.98468,-0.287442,0.80203,-0.344577,-0.846493,-0.25419,0.040738,-2.316875,...,-0.933304,0.245312,-0.718603,-0.388782,-0.106626,-0.286408,-0.065889,-0.09403,-0.789826,1.684531
3,-0.070412,1.854042,0.938634,-0.287442,-0.284463,0.588858,1.073722,-0.25419,-0.038723,0.283164,...,0.618503,3.400052,3.463646,-0.388782,-0.106626,-0.286408,-0.065889,-0.09403,0.290914,0.160974
4,-0.070412,-0.576811,0.661728,0.2657,-0.666327,0.133306,-0.846493,3.694739,-0.038723,0.283164,...,-0.933304,0.634048,-0.718603,-0.388782,-0.106626,-0.286408,-0.065889,-0.09403,-1.150073,0.160974


In [25]:
print('All features')
scores = cross_val_score(logreg, X_train, y_train)
print(scores, '\n', 'Mean: ', scores.mean())

print('Feature Selection via Regularization')
scores = cross_val_score(logreg, selected_X, y_train)
print(scores, '\n', 'Mean: ', scores.mean())

All features
[ 0.9337232   0.93177388  0.92773438] 
 Mean:  0.931077150341
Feature Selection via Regularization
[ 0.9337232   0.93177388  0.92773438] 
 Mean:  0.931077150341


In [26]:
#Must impute and standardize X_test

In [27]:
X_test = imputer.transform(X_test)
X_test = pd.DataFrame(X_test, columns=X.columns)

In [28]:
ss = StandardScaler()
ss.fit(X_test)
X_test = ss.transform(X_test)

In [29]:
X_test = pd.DataFrame(X_test, columns=X.columns)
X_test.head()

Unnamed: 0,Lot Frontage,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Bsmt Full Bath,Bsmt Half Bath,Garage Yr Blt,Garage Cars,...,Fireplaces,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold
0,-0.778696,-0.533505,-0.888271,-0.301859,1.02814,0.068631,-0.734695,-0.245322,-1.941232,-1.015501,...,-0.904091,-0.80732,-0.695382,-0.346888,-0.103662,-0.292714,-0.058279,-0.090551,-1.248586,0.201214
1,0.876234,-0.533505,0.786785,-0.301859,-0.98334,-0.329248,1.168833,-0.245322,-1.293734,-1.015501,...,-0.904091,-0.80732,-0.695382,-0.346888,-0.103662,2.900487,-0.058279,-0.090551,-0.114245,-1.328012
2,1.460327,-0.533505,1.035443,-0.301859,-0.830459,0.068631,1.168833,-0.245322,0.64876,0.320273,...,2.327955,0.703342,-0.695382,-0.346888,-0.103662,-0.292714,-0.058279,-0.090551,-0.114245,0.201214
3,-1.216765,-0.533505,2.366897,-0.301859,-1.295654,0.899758,3.072362,-0.245322,-0.085071,-2.351275,...,-0.904091,2.079123,-0.695382,-0.346888,-0.103662,-0.292714,-0.058279,-0.090551,0.641982,-0.563399
4,2.433815,2.172005,2.63138,-0.301859,0.108669,2.579695,1.168833,-0.245322,0.950926,1.656048,...,0.711932,1.063023,0.15704,-0.346888,-0.103662,-0.292714,-0.058279,-0.090551,0.263869,0.965827


In [30]:
selected_x_test = X_test[sx_list]
selected_x_test.shape

(513, 38)

In [31]:
y_test.shape

(513,)

In [32]:
print('All features')
scores = cross_val_score(logreg, X_test, y_test)
print(scores, '\n', 'Mean: ', scores.mean())


print('Feature Selection via Regularization')
scores = cross_val_score(logreg, selected_x_test, y_test)
print(scores, '\n', 'Mean: ', scores.mean())

All features
[ 0.92982456  0.89473684  0.94152047] 
 Mean:  0.922027290448
Feature Selection via Regularization
[ 0.92982456  0.89473684  0.94152047] 
 Mean:  0.922027290448


### Predicting

In [33]:
ames_test.shape

(879, 80)

In [34]:
ames_test.isnull().sum()[ames_test.isnull().sum()> 0]

Lot Frontage      160
Alley             821
Mas Vnr Type        1
Mas Vnr Area        1
Bsmt Qual          25
Bsmt Cond          25
Bsmt Exposure      25
BsmtFin Type 1     25
BsmtFin Type 2     25
Electrical          1
Fireplace Qu      422
Garage Type        44
Garage Yr Blt      45
Garage Finish      45
Garage Qual        45
Garage Cond        45
Pool QC           875
Fence             707
Misc Feature      838
dtype: int64

In [35]:
ames_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 80 columns):
Id                 879 non-null int64
PID                879 non-null int64
MS SubClass        879 non-null int64
MS Zoning          879 non-null object
Lot Frontage       719 non-null float64
Lot Area           879 non-null int64
Street             879 non-null object
Alley              58 non-null object
Lot Shape          879 non-null object
Land Contour       879 non-null object
Utilities          879 non-null object
Lot Config         879 non-null object
Land Slope         879 non-null object
Neighborhood       879 non-null object
Condition 1        879 non-null object
Condition 2        879 non-null object
Bldg Type          879 non-null object
House Style        879 non-null object
Overall Qual       879 non-null int64
Overall Cond       879 non-null int64
Year Built         879 non-null int64
Year Remod/Add     879 non-null int64
Roof Style         879 non-null object
Roof M

In [36]:
g = ames_test.columns.to_series().groupby(ames.dtypes).groups
d = {k.name: v for k, v in g.items()}
d

{'float64': Index(['Lot Frontage', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2',
        'Bsmt Unf SF', 'Total Bsmt SF', 'Bsmt Full Bath', 'Bsmt Half Bath',
        'Garage Yr Blt', 'Garage Cars', 'Garage Area'],
       dtype='object'),
 'int64': Index(['Id', 'PID', 'MS SubClass', 'Lot Area', 'Overall Qual', 'Overall Cond',
        'Year Built', 'Year Remod/Add', '1st Flr SF', '2nd Flr SF',
        'Low Qual Fin SF', 'Gr Liv Area', 'Full Bath', 'Half Bath',
        'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces',
        'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch',
        'Screen Porch', 'Pool Area', 'Misc Val', 'Mo Sold', 'Yr Sold'],
       dtype='object'),
 'object': Index(['MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour',
        'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
        'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
        'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', '

In [37]:
ames_test_num = ames_test[['Lot Frontage', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2',
        'Bsmt Unf SF', 'Total Bsmt SF', 'Bsmt Full Bath', 'Bsmt Half Bath',
        'Garage Yr Blt', 'Garage Cars', 'Garage Area', 'Id', 'PID', 'MS SubClass', 'Lot Area', 'Overall Qual', 'Overall Cond',
        'Year Built', 'Year Remod/Add', '1st Flr SF', '2nd Flr SF',
        'Low Qual Fin SF', 'Gr Liv Area', 'Full Bath', 'Half Bath',
        'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces',
        'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch',
        'Screen Porch', 'Pool Area', 'Misc Val', 'Mo Sold', 'Yr Sold']]

In [38]:
ames_test_num.shape

(879, 38)

In [39]:
ames_test_ss = imputer.transform(ames_test_num)
ames_test_ss = pd.DataFrame(ames_test_ss)

In [40]:
ss = StandardScaler()
ss.fit(ames_test_ss)
ames_test_ss = ss.transform(ames_test_ss)

In [41]:
ames_test_ss = pd.DataFrame(ames_test_ss, columns=ames_test_num.columns)

In [42]:
ames_test_ss.shape

(879, 38)

In [43]:
y_hat = logreg.predict(ames_test_ss)

In [44]:
PRED = pd.DataFrame(y_hat, columns=['Sale Condition'])
PRED['Sale Condition'].value_counts()

0    843
1     36
Name: Sale Condition, dtype: int64

In [45]:
ID = ames_test[['Id']]

In [46]:
Prediction = pd.concat([ID, PRED], axis=1)

In [47]:
Prediction.head()

Unnamed: 0,Id,Sale Condition
0,2658,0
1,2718,0
2,2414,0
3,1989,0
4,625,0


In [48]:
y_test.value_counts()

0    486
1     27
Name: Sale Condition, dtype: int64

In [49]:
33/480

0.06875

In [50]:
y_train.value_counts()

0    1433
1     105
Name: Sale Condition, dtype: int64

In [51]:
99/1439

0.06879777623349548

In [52]:
39/840


0.04642857142857143

In [53]:
Prediction.to_csv('predictions_cl.csv', index=False)

In [54]:
logreg = LogisticRegression()
cross_val_score(logreg, X_test, y_test, cv=5).mean()

0.93769018436351392

In [55]:
from sklearn import svm

clf = svm.SVC()
print(cross_val_score(clf, X_train, y_train, cv=5).mean())


clf = svm.SVC(kernel= 'linear')
print(cross_val_score(clf, X_train, y_train, cv=5).mean())



0.931729345573
0.931729345573


In [56]:
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [57]:
clf.score(X_test, y_test)

0.94736842105263153

In [None]:
plt.figure(figsize=(10,10))
sns.pairplot(vis, y_train)

In [69]:
vis = clf.predict(X_train)

In [67]:
y_test.shape

(513,)

In [59]:
y_hat2 = clf.predict(ames_test_ss)

In [None]:
PRED2 = pd.DataFrame(y_hat2, columns=['Sale Condition'])

In [None]:
Prediction2 = pd.concat([ID, PRED2], axis=1)

In [None]:
Prediction2.to_csv('predictions_cl2.csv', index=False)

In [None]:
knn = KNeighborsClassifier(n_neighbors=5, p=1)
knn.fit(X_train, y_train)

In [None]:
knn.score(X_train, y_train)

In [None]:
knn.score(X_test, y_test)

In [None]:
knn.predict(ames_test_ss).sum()

In [None]:
knn_best = knn_gs.best_estimator_
print(knn_gs.best_params_)
print(knn_gs.best_score_)

In [None]:
knn_gs.predict(X_test).sum()

In [None]:

from sklearn.linear_model import LogisticRegression

lr_params = {
    'penalty':['l1','l2'],
    'C':np.logspace(-4, 2, 40),
    'solver':['liblinear']
}

lr_gs = GridSearchCV(LogisticRegression(), lr_params, cv=5, verbose=1)
lr_gs.fit(X_train, y_train)

In [None]:
best_lr = lr_gs.best_estimator_
print(lr_gs.best_params_)
print(lr_gs.best_score_)

In [None]:
lr_gs.predict(ames_test_ss).sum()

In [None]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=10, random_state=0)
clf.fit(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

In [None]:
Prediction3 = clf.predict(ames_test_ss)

In [None]:
Prediction3 = pd.DataFrame(Prediction3, columns=['Sale Condition'])

In [None]:
Prediction3 = pd.concat([ID, Prediction3], axis=1)

In [None]:
Prediction3.to_csv('prediction_cl3.csv', index=False)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier

In [None]:
clf = DecisionTreeClassifier(max_depth=20)
bagger = BaggingClassifier(clf, n_estimators=10)

print('Decision Tree Score: ', cross_val_score(clf, X_train, y_train, cv=10).mean())
print('Bagging Score: ', cross_val_score(bagger, X_train, y_train, cv=10).mean())

In [None]:
cross_val_score(bagger, X_test, y_test, cv=10).mean()

In [None]:
bagger.score(X_test, y_test)

In [None]:
bagger.predict(ames_test_ss).sum()

In [None]:
bagger