In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score

%matplotlib inline

In [2]:
train = pd.read_csv('./datasets/train.csv')
test = pd.read_csv('./datasets/test.csv')

train = train[(train['Gr Liv Area']<4000) & (train['Lot Area']<25000)]

In [3]:
Nominal = ['Sale Type','Neighborhood','Bldg Type', 'House Style','Foundation'] # no sale condition from jse site
# Bldg Type & Sale Type
# split bldg type - 1Fam, TwnhmE, Else
# split Sale Type - New, Oth&CWD, Else
# I want to use neighborhood, but will need extra time to group based on neighborhood prices (pivots)
Numerical = ['Lot Area','Year Built', 'Year Remod/Add', 
             'BsmtFin SF 1','BsmtFin SF 2','Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF',
             '2nd Flr SF','Gr Liv Area', 'Bsmt Full Bath',
             'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
             'Kitchen AbvGr','TotRms AbvGrd',
             'Garage Area','Wood Deck SF', 'Open Porch SF','Screen Porch',
             'Mo Sold', 'Yr Sold' ,'SalePrice']
Ordinal = ['Overall Qual','Overall Cond','Paved Drive']
# only use overall 'overall qual'
Target = ['SalePrice']
Ids = ['Id']

### examples of stuff

In [4]:
'straw' in 'strawberry jam'

True

In [5]:
if 'straw' in ['strawberry', 'jam']:
    print(True)
else:
    print(False)

False


In [6]:
def bin_quality(val):
    if val < 5:
        return 0
    else:
        return 1
    
#train['new+colname'].apply

#train['new_col'] =
train['Overall Qual'].apply(bin_quality)
# train['Overall Qual'].value_counts()

# plt.scatter(train['Overall Qual'].apply(bin_quality),train['SalePrice'],color='pink');

def sale_type_cat(val):
    if val in ['New']:
        return 1
    elif val in ['Oth','CWD']:
        return 2
    else:
        return 0
        
# train['Sale Type'].apply(sale_type_cat)

### make things work

In [7]:
final_train = train[['Id','Lot Area', 'Year Built', 'Year Remod/Add', 'Gr Liv Area',
                  'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath',
                  'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 
                   'Wood Deck SF','Open Porch SF','Screen Porch',
                   'Mo Sold', 'Yr Sold' ,'Overall Cond', 'Bldg Type','Sale Type','SalePrice']]

final_train['Tot_bath_abv_grd'] = final_train['Full Bath']+0.5*final_train['Half Bath']
final_train['Tot_bath_bsmt']=final_train['Bsmt Full Bath']+0.5*final_train['Bsmt Half Bath']
final_train['Outdoor Liv Area']=final_train['Wood Deck SF']+final_train['Open Porch SF']+final_train['Screen Porch']

final_train['Overall Cond Bi']=final_train['Overall Cond'].map(lambda x: 1 if x >= 5 else 0)
final_train['Bldg Type Bi']=final_train['Bldg Type'].map(lambda x: 1 if x in ['1Fam','TwnhmE'] else 0)
final_train['Sale Type Bi']=final_train['Sale Type'].apply(sale_type_cat)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.ht

In [8]:
final_train.head(2)

Unnamed: 0,Id,Lot Area,Year Built,Year Remod/Add,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,...,Overall Cond,Bldg Type,Sale Type,SalePrice,Tot_bath_abv_grd,Tot_bath_bsmt,Outdoor Liv Area,Overall Cond Bi,Bldg Type Bi,Sale Type Bi
0,109,13517,1976,2005,1479,0.0,0.0,2,1,3,...,8,1Fam,WD,130500,2.5,0.0,44,1,1,0
1,544,11492,1996,1997,2122,1.0,0.0,2,1,4,...,5,1Fam,WD,220000,2.5,1.0,74,1,1,0


In [9]:
final_features = ['Id', 'Lot Area', 'Year Built', 'Year Remod/Add', 'Gr Liv Area',
             'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Mo Sold', 'Yr Sold',
             'Outdoor Liv Area',  'Tot_bath_abv_grd', 'Tot_bath_bsmt','Overall Cond Bi',
             'Bldg Type Bi','Sale Type Bi','SalePrice']
final_feats_train = final_train[final_features]

In [10]:
final_feats_train.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [11]:
final_feats_train.shape

(2020, 17)

### Try following steps 
 * test/train split 
 * polynomial features
 * standard scaler
 * fit Lasso/Ridge

In [12]:
mod_feats = ['Lot Area', 'Year Built', 'Year Remod/Add', 'Gr Liv Area',
             'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Mo Sold', 'Yr Sold',
             'Outdoor Liv Area',  'Tot_bath_abv_grd', 'Tot_bath_bsmt','Overall Cond Bi',
             'Bldg Type Bi','Sale Type Bi']
X = final_feats_train[mod_feats]
y = final_feats_train['SalePrice']

In [13]:
pf = PolynomialFeatures(degree = 3, include_bias=False)
pf = pf.fit(X)
X_poly = pf.transform(X)

In [14]:
X_poly_train, X_poly_test, y_poly_train, y_poly_test = train_test_split(X_poly,y,test_size = 0.2, random_state=42)

In [15]:
ss = StandardScaler()
ss.fit(X_poly_train)

X_poly_train_ss = ss.transform(X_poly_train)
X_poly_test_ss = ss.transform(X_poly_test)

In [16]:
y_log_train = np.log(y_poly_train)

In [17]:
lasso = LassoCV(max_iter=10000)
ridge = RidgeCV()

In [18]:
print(cross_val_score(lasso, X_poly_train_ss,y_log_train, cv = 5))
print(cross_val_score(lasso, X_poly_train_ss,y_log_train,cv = 5).mean())

# print(cross_val_score(ridge, X_poly_train_ss,y_log_train, cv = 5))
# print(cross_val_score(ridge, X_poly_train_ss,y_log_train,cv = 5).mean())



[0.77357225 0.76719432 0.85115802 0.84069183 0.86440367]




0.8194040171851839


In [19]:
lasso.fit(X_poly_train_ss,y_log_train)

print(lasso.score(X_poly_train_ss,y_log_train))

lasso_pred = lasso.predict(X_poly_test_ss)
lasso_pred = np.exp(lasso_pred)
r2_score(y_poly_test,lasso_pred)



0.8382858620143523


0.8517652570001318

In [20]:
ridge.fit(X_poly_train_ss,y_log_train)

print(ridge.score(X_poly_train_ss,y_log_train))

ridge_pred = ridge.predict(X_poly_test_ss)
ridge_pred = np.exp(ridge_pred)
r2_score(y_poly_test,ridge_pred)

0.8635714228091043


0.8289077609070893

Somehow this is really taxing on the computer so will not try for this iteration

```python
enet_alphas = np.arange(0.01, 5.0, 0.005)

# Set up our l1 ratio. (What does this do?) 
# enet_ratio = np.arange(0.01, 1.0, 0.01)  
# this will weight lasso more heavily as ratio -> 1

# Instantiate model.
enet_model = ElasticNetCV(l1_ratio=enet_ratio, cv=5, max_iter=10000)

# Fit model using optimal alpha.
enet_model = enet_model.fit(X_poly_train_ss, y_log_train)

print(enet_model.score(X_poly_train_ss,y_log_train))

enet_pred = enet_model.predict(X_poly_test_ss)
enet_pred = np.exp(enet_pred)
r2_score(y_poly_test,enet_pred)
```

### now to fit on testing data and create predictions
* create all transformations
* create all polynomials
* scale 
* predict
* save predictions 
* submit

In [21]:
final_test = test[['Id','Lot Area', 'Year Built', 'Year Remod/Add', 'Gr Liv Area',
                  'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath',
                  'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 
                   'Wood Deck SF','Open Porch SF','Screen Porch',
                   'Mo Sold', 'Yr Sold' ,'Overall Cond', 'Bldg Type','Sale Type']]

final_test['Tot_bath_abv_grd'] = final_test['Full Bath']+0.5*final_test['Half Bath']
final_test['Tot_bath_bsmt']=final_test['Bsmt Full Bath']+0.5*final_test['Bsmt Half Bath']
final_test['Outdoor Liv Area']=final_test['Wood Deck SF']+final_test['Open Porch SF']+final_test['Screen Porch']

final_test['Overall Cond Bi']=final_test['Overall Cond'].map(lambda x: 1 if x >= 5 else 0)
final_test['Bldg Type Bi']=final_test['Bldg Type'].map(lambda x: 1 if x in ['1Fam','TwnhmE'] else 0)
final_test['Sale Type Bi']=final_test['Sale Type'].apply(sale_type_cat)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.ht

In [22]:
final_test_mod = final_test[mod_feats]

In [23]:
final_test_ss = ss.transform(pf.transform(final_test_mod))

In [24]:
final_pred = lasso.predict(final_test_ss)
final_pred = np.exp(final_pred)

In [25]:
pd.DataFrame({
    'Id': final_test['Id'],
    'SalePrice': final_pred
})

Unnamed: 0,Id,SalePrice
0,2658,114032.766682
1,2718,118558.938369
2,2414,253460.226472
3,1989,109958.359027
4,625,157112.422560
5,333,107715.316832
6,1327,132892.217712
7,858,152692.721197
8,95,189754.493445
9,1568,157557.692246


In [27]:
pd.DataFrame({
    'Id': final_test['Id'],
    'SalePrice': final_pred
}).to_csv('./Submissions/katy_chow_submission2_housingproject2_20181201.csv', index = False)