In [1]:
from sklearn.linear_model    import LinearRegression, LassoCV
from sklearn.metrics         import r2_score, mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score

import pandas as pd
import numpy  as np

In [2]:
# Read the training data and the Kaggle submission set from the project folder.
train = pd.read_csv(filepath_or_buffer='./Project_2_DataSet/train.csv')
kaggle = pd.read_csv(filepath_or_buffer='./Project_2_DataSet/test.csv')

## isolate numerical features and drop `'Id', 'PID', 'SalePrice'`

In [3]:
# Names of the numeric predictor columns in the training frame.
# `Id`, `PID` and the target `SalePrice` are excluded so they are never fed to
# the model as features.
# The public `select_dtypes` API is used instead of the private
# `DataFrame._get_numeric_data`; 'bool' is included because the private helper
# also treats boolean columns as numeric.
num_features = list(
    train.select_dtypes(include=['number', 'bool'])
         .drop(['Id', 'PID', 'SalePrice'], axis=1)
         .columns
)

## make sure column names for kaggle and train set match

In [4]:
# Column names present in exactly one of the two frames.
set(train.columns).symmetric_difference(kaggle.columns)

{'SalePrice'}

## fill numerical `null` values with `-999`

In [5]:
# Replace missing values in every numeric feature with the sentinel -999,
# applying the same fill to the training and Kaggle frames.
train[num_features] = train[num_features].fillna(-999)
kaggle[num_features] = kaggle[num_features].fillna(-999)

In [6]:
# Display the (rows, columns) of the Kaggle frame after the numeric fill.
kaggle.shape

(879, 80)

In [7]:
# Display the (rows, columns) of the training frame after the numeric fill.
train.shape

(2051, 81)

## isolate categorical features

In [8]:
# Names of the object-dtype columns in the training frame; these are treated
# as the categorical features.
cat_cols = train.select_dtypes(include='object').columns.tolist()

## - fill `null` values in the categorical columns with `'N/A'`
## - give each categorical column the same set of categories in both data sets so the dummy columns match

In [9]:
# Give every categorical column an identical, explicitly ordered set of
# categories in both frames, so that one-hot encoding produces the same dummy
# columns (in the same order) for `train` and `kaggle`.
for col in cat_cols:
    # Missing values become their own explicit level.
    kaggle[col]  = kaggle[col].fillna('N/A')
    train[col]   = train[col].fillna('N/A')

    train_values = sorted(train[col].unique())
    test_values  = sorted(kaggle[col].unique())

    # Union of the levels seen in either frame. A sorted list is used rather
    # than a bare set: set iteration order for strings can differ between
    # interpreter sessions, which would make the category order (and thus the
    # dummy-column order) non-reproducible across kernel restarts.
    categories   = sorted(set(train_values) | set(test_values))

    kaggle[col]  = pd.Categorical(kaggle[col], categories=categories)
    train[col]   = pd.Categorical(train[col], categories=categories)

In [10]:
# One-hot encode the categorical columns of each frame.
test_dummies = pd.get_dummies(data=kaggle[cat_cols])
train_dummies = pd.get_dummies(data=train[cat_cols])

## merge numerical and categorical columns, and set my `X` and `y` values

In [11]:
# Target vector.
y = train['SalePrice']

# Feature matrices: dummy-encoded categoricals followed by the numeric
# features, built the same way for the training and Kaggle frames.
X = pd.concat([train_dummies, train[num_features]], axis='columns')
X_kaggle = pd.concat([test_dummies, kaggle[num_features]], axis='columns')

In [12]:
# Display the (rows, columns) of the assembled training feature matrix.
X.shape

(2051, 314)

## make sure my columns in both `train` and `test` sets match

In [13]:
# Feature names present in only one of the two matrices; an empty set means
# the training and Kaggle matrices expose the same columns.
set(X.columns).symmetric_difference(X_kaggle.columns)

set()

## LassoCV Pipeline

In [14]:
from sklearn.pipeline        import Pipeline
from sklearn.preprocessing   import StandardScaler, PolynomialFeatures
from sklearn.model_selection import cross_val_score, train_test_split

#### Use Polynomial Feature Engineering to Create New Features, Scale the Data to Decrease Magnitude, and Use LASSO to Zero Out Unnecessary New Features

In [None]:
# Three-stage pipeline: polynomial feature expansion -> standardisation ->
# LassoCV, each with the settings shown here.
steps = [
    ('pf', PolynomialFeatures()),
    ('ss', StandardScaler()),
    ('lc', LassoCV(max_iter=10000, n_jobs=-1, verbose=2)),
]
pipe = Pipeline(steps=steps)

# No hyper-parameters are searched: the grid is empty, so the search
# evaluates only the pipeline exactly as configured above.
grid_params = dict()

gs = GridSearchCV(estimator=pipe, param_grid=grid_params, verbose=2)
gs.fit(X, y)

# Report the cross-validated score and the (empty) winning parameter set.
print(gs.best_score_, gs.best_params_, sep='\n')

## get predictions 

In [None]:
# Predict sale prices for the Kaggle feature matrix with the fitted search
# object `gs` (requires the fitting cell above to have been run).
pred = gs.predict(X_kaggle)

In [None]:
# Store the predictions on the Kaggle frame as a new `SalePrice` column.
# NOTE(review): this mutates `kaggle` in place, so it now carries a column it
# did not have when it was loaded.
kaggle['SalePrice'] = pred

## format predictions into dataframe alongside `Id` column, 
## and export as `.csv`

In [None]:
import datetime

# Timestamp used to tag the exported predictions file.
# Formatted explicitly so it contains no spaces or colons, which
# `str(datetime.now())` would include and which are not valid in file names
# on every platform (e.g. ':' on Windows). Resolution is one second.
now = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

In [None]:
# Display the timestamped name stem built from `now`.
f'predictions_{now}'

In [None]:
# Write the submission file: one row per Kaggle record with its `Id` and the
# predicted `SalePrice`, without the DataFrame index.
# The `.csv` extension is added explicitly so the output is recognisable as CSV.
kaggle[['Id', 'SalePrice']].to_csv(f'predictions_{now}.csv', index=False)

In [15]:
#pd.read_csv('predictions_.csv')