
#  Kaggle Submission

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LinearRegression
import pickle
import numpy as np
np.random.seed(42)

## Load in trained model and scaler

In [6]:
with open('../assets/gs_model.pkl', 'rb') as f:
    gs = pickle.load(f)

with open('../assets/columns.pkl','rb') as f:
    columns = pickle.load(f)

## Load kaggle data

In [7]:
kaggle = pd.read_csv('../datasets/test.csv', index_col='Id')

## Clean data in same method as training data

In [8]:
kaggle['Lot Frontage'] = kaggle.apply(lambda row: round(row['Lot Area']*0.003 +43.332,1)
                                      if np.isnan(row['Lot Frontage']) else row['Lot Frontage'],axis =1)

In [9]:
kaggle['Total_Bath'] = kaggle['Full Bath'] + 0.5 * kaggle['Half Bath']+ kaggle['Bsmt Full Bath'] + 0.5 * kaggle['Bsmt Half Bath']

In [10]:
kaggle.fillna(kaggle[['Fireplace Qu', 'Garage Finish','Garage Qual','Garage Cond',
                    'Garage Type', 'Bsmt Cond', 'Bsmt Qual', 'Bsmt Exposure', 
                    'BsmtFin Type 1', 'BsmtFin Type 2','Mas Vnr Type']].fillna('None'),inplace=True)

kaggle.fillna(kaggle[['Bsmt Half Bath','Bsmt Full Bath','BsmtFin SF 1','BsmtFin SF 2',
                    'Bsmt Unf SF','Total Bsmt SF','Mas Vnr Area','Garage Cars',
                    'Garage Area']].fillna(0),inplace=True)

In [11]:
kaggle['Garage Yr Blt']= kaggle.apply(lambda row: row['Year Built'] 
                                     if np.isnan(row['Garage Yr Blt']) else row['Garage Yr Blt'],axis =1)

In [12]:
kaggle["Total_Sqft"] = kaggle["Gr Liv Area"] + kaggle["Total Bsmt SF"]
kaggle['Lot Area'] = np.log(kaggle['Lot Area'])
kaggle['Lot Frontage'] = np.log(kaggle['Lot Area'])

### There are some categorical features can be converted into ordered numbers when the information shows the ordered relationship. For example, when the categorical feature contain  'po', 'Fa', 'Ex', they can be encoded as 1, 2, 3. 

kaggle = kaggle.replace({"Bsmt Cond" : {"None" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Bsmt Qual" : {"None" : 0, "Po" : 1, "Fa" : 2, "TA": 3, "Gd" : 4, "Ex" : 5},
                       "Exter Cond" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                       "Exter Qual" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                       "Firep laceQu" : {"None" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Garage Cond" : {"None" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Garage Qual" : {"None" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Heating QC" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Kitchen Qual" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Garage Finish" : {"None" : 0, "Unf" : 1, "RFn" : 2, "Fin" : 3}})
Feature = kaggle[['Overall Qual','Total_Sqft','Gr Liv Area','Exter Qual']]


In [13]:
poly = PolynomialFeatures()
Feature = kaggle[['Overall Qual','Total_Sqft','Gr Liv Area','Exter Qual']]
Feature_poly = poly.fit_transform(Feature)
poly_col_names = poly.get_feature_names(['Overall Qual','Total_Sqft','Gr Liv Area','Exter Qual'])
X_TopFea_Poly = pd.DataFrame(Feature_poly, columns=poly_col_names, index=kaggle.index).iloc[:,5:]

kaggle = pd.concat([X_TopFea_Poly,kaggle],axis=1)

In [14]:
kaggle_dummies = pd.get_dummies(kaggle)

In [15]:
diff = (columns).difference(kaggle_dummies.columns)

In [16]:
for x in diff:
    kaggle_dummies[x] = 0

In [17]:
kaggle = kaggle_dummies[columns]

## Make predictions

In [19]:
preds = gs.predict(kaggle)

## Align predictions back with index and set up header

In [20]:
submission = pd.DataFrame(preds, index=kaggle.index, columns=['SalePrice'])

## Sort index (required for proper submission)

In [22]:
submission.sort_index(inplace=True)

## Save to csv to submit

In [24]:
submission.to_csv('../datasets/ridge_kaggle.csv')

## Use command line `head` to check data is correct format

In [25]:
!head ../datasets/ridge_kaggle.csv

Id,SalePrice
2,116986.2793339451
4,261049.48629398775
6,188714.11628508594
7,213538.1267971736
17,191693.2130128413
18,344220.8137857348
22,193218.97558871322
27,121464.23490429565
31,86436.7802346754
