In [1]:
## import data
import pandas as pd
import numpy as np

train = pd.read_csv("train.csv").set_index("Id")
test = pd.read_csv("test.csv").set_index("Id")
combine = pd.concat([train, test])

# seperate based on neighborhoood median SalesPrice
worst_neighbor_df = combine[combine.Neighborhood.isin(['MeadowV', 'IDOTRR', 'BrDale', 'OldTown','Edwards', 'BrkSide', 'Sawyer', 'Blueste'])]
med_neighbor_df = combine[combine.Neighborhood.isin(['SWISU', 'NAmes', 'NPkVill', 'Mitchel', 'SawyerW', 'Gilbert', 'NWAmes', 'Blmngtn'])]
best_neighbor_df = combine[combine.Neighborhood.isin(['CollgCr', 'ClearCr', 'Crawfor', 'Veenker', 'Somerst', 'Timber', 'StoneBr', 'NoRidge', 'NridgHt'])]


In [2]:
print(worst_neighbor_df.shape)
print(med_neighbor_df.shape)
print(best_neighbor_df.shape)
862+1077+980

(862, 80)
(1077, 80)
(980, 80)


2919

In [3]:
# process data before model fitting
from preprocessfinal import impute
onehot_worst, encodedDic = impute(worst_neighbor_df, True) # process data and onehot encode
onehot_med, encodedDic = impute(med_neighbor_df, True) # process data and onehot encode
onehot_best, encodedDic = impute(best_neighbor_df, True) # process data and onehot encode

# seperate onehot data into train and test
train_onehot_worst = onehot_worst.iloc[onehot_worst.index < min(test.index),]
test_onehot_worst = onehot_worst.iloc[onehot_worst.index >= min(test.index),].drop('SalePrice', axis = 1) # salesprice col were all NA 

train_onehot_med = onehot_med.iloc[onehot_med.index < min(test.index),]
test_onehot_med = onehot_med.iloc[onehot_med.index >= min(test.index),].drop('SalePrice', axis = 1) # salesprice col were all NA 

train_onehot_best = onehot_best.iloc[onehot_best.index < min(test.index),]
test_onehot_best = onehot_best.iloc[onehot_best.index >= min(test.index),].drop('SalePrice', axis = 1) # salesprice col were all NA 

print(train_onehot_worst.shape)
print(test_onehot_worst.shape)
print(train_onehot_med.shape)
print(test_onehot_med.shape)
print(train_onehot_best.shape)
print(test_onehot_best.shape)
print(417+445+536+541+507+473) 

#split train data frame into x var and y var for model testing
x_onehot_worst = train_onehot_worst.drop('SalePrice', axis=1)
x_onehot_med = train_onehot_med.drop('SalePrice', axis=1)
x_onehot_best = train_onehot_best.drop('SalePrice', axis=1)

y_log = pd.DataFrame(np.log(train.SalePrice))
y_log_worst = y_log.iloc[y_log.index.isin(train_onehot_worst.index),] # convert y to normal distribution for regression models
y_log_med = y_log.iloc[y_log.index.isin(train_onehot_med.index),] # convert y to normal distribution for regression models
y_log_best = y_log.iloc[y_log.index.isin(train_onehot_best.index),] # convert y to normal distribution for regression models

print(y_log_worst.shape)
print(y_log_med.shape)
print(y_log_best.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-doc

(417, 229)
(445, 228)
(536, 225)
(541, 224)
(507, 217)
(473, 216)
2919
(417, 1)
(536, 1)
(507, 1)


In [4]:
from sklearn.model_selection import GridSearchCV # search for the best lambda
from sklearn import linear_model

lasso_worst = linear_model.Lasso(normalize=True) # create a ridge regression instance

# find the best alpha (lambda) for lasso 
grid_param = [{'alpha': np.logspace(-4.5, 2, 100)}]
para_search_lasso_worst = GridSearchCV(estimator=lasso_worst, param_grid=grid_param, scoring='neg_mean_squared_error', cv=10, return_train_score=True)
para_search_lasso_worst.fit(x_onehot_worst, y_log_worst)

print(para_search_lasso_worst.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(para_search_lasso_worst.best_score_)))

{'alpha': 0.0011905772393787845}
Lowest RMSE found:  0.18540119029199814


In [5]:
lasso_med = linear_model.Lasso(normalize=True) # create a ridge regression instance

# find the best alpha (lambda) for lasso 
grid_param = [{'alpha': np.logspace(-4.5, 2, 100)}]
para_search_lasso_med = GridSearchCV(estimator=lasso_med, param_grid=grid_param, scoring='neg_mean_squared_error', cv=10, return_train_score=True)
para_search_lasso_med.fit(x_onehot_med, y_log_med)

print(para_search_lasso_med.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(para_search_lasso_med.best_score_)))

{'alpha': 0.00022570197196339215}
Lowest RMSE found:  0.09904998644870235


In [6]:
lasso_best = linear_model.Lasso(normalize=True) # create a ridge regression instance

# find the best alpha (lambda) for lasso 
grid_param = [{'alpha': np.logspace(-4.5, 2, 100)}]
para_search_lasso_best = GridSearchCV(estimator=lasso_best, param_grid=grid_param, scoring='neg_mean_squared_error', cv=10, return_train_score=True)
para_search_lasso_best.fit(x_onehot_best, y_log_best)

print(para_search_lasso_best.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(para_search_lasso_best.best_score_)))


{'alpha': 0.00022570197196339215}
Lowest RMSE found:  0.10183032494655822


In [42]:
from sklearn.metrics import mean_squared_error
# fit best ridge equation to all train data 
lasso_y_worst = para_search_lasso_worst.best_estimator_.predict(x_onehot_worst)
print("RMSE: ", np.sqrt(np.mean((y_log_worst.values-lasso_y_worst)**2)))
print("RMSE: ", np.sqrt(mean_squared_error(y_log_worst.values, lasso_y_worst)))

lasso_y_med = para_search_lasso_med.best_estimator_.predict(x_onehot_med)
print("RMSE: ", np.sqrt(np.mean((y_log_med.values-lasso_y_med)**2)))
print("RMSE: ", mean_squared_error(y_log_med.values, lasso_y_med))

lasso_y_best = para_search_lasso_best.best_estimator_.predict(x_onehot_best)
print("RMSE: ", np.sqrt(np.mean((y_log_best.values-lasso_y_best)**2)))
print("RMSE: ", mean_squared_error(y_log_best.values, lasso_y_best))


RMSE:  0.38880211110670043
RMSE:  0.16594978199486426
RMSE:  0.3355265626860251
RMSE:  0.006975697521277645
RMSE:  0.45634835784886807
RMSE:  0.007200614006508818


In [32]:
lasso_worst_test_y = para_search_lasso_worst.best_estimator_.predict(test_onehot_worst)
lasso_med_test_y = para_search_lasso_med.best_estimator_.predict(test_onehot_med)
lasso_best_test_y = para_search_lasso_best.best_estimator_.predict(test_onehot_best)

#convert predicted y to datafram to combine later on for submission
worst = pd.DataFrame(lasso_worst_test_y).set_index(test_onehot_worst.index)
med = pd.DataFrame(lasso_med_test_y).set_index(test_onehot_med.index)
best = pd.DataFrame(lasso_best_test_y).set_index(test_onehot_best.index)
lasso_neighbor_pred_y = pd.concat([worst,med,best])
lasso_neighbor_pred_y = np.expm1(lasso_neighbor_pred_y)

In [34]:
lasso_neighbor_pred_y.to_csv('(16) lasso_neighbor_submission.csv')

In [33]:
lasso_neighbor_pred_y

Unnamed: 0_level_0,0
Id,Unnamed: 1_level_1
1472,101682.045566
1473,98519.249636
1493,162246.588738
1531,116348.148622
1532,102429.478817
1533,137735.431149
1534,125174.763697
1535,142412.814113
1536,116916.713488
1537,72635.111573
