In [32]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
df = pd.read_csv('datasets/train.csv', index_col = 'PID')
kaggle_test = pd.read_csv('datasets/test.csv', index_col = 'PID')

In [3]:
df.drop(columns = ['Lot Frontage', 'Garage Yr Blt'], inplace = True)
kaggle_test.drop(columns = ['Lot Frontage', 'Garage Yr Blt'], inplace = True)

In [4]:
dum_locations = pd.get_dummies(df, columns = ['Neighborhood', 'Condition 1', 'Condition 2'], drop_first = True)

In [22]:
dum_locations.head(3)

Unnamed: 0_level_0,Id,MS SubClass,MS Zoning,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,...,Condition 1_RRAn,Condition 1_RRNe,Condition 1_RRNn,Condition 2_Feedr,Condition 2_Norm,Condition 2_PosA,Condition 2_PosN,Condition 2_RRAe,Condition 2_RRAn,Condition 2_RRNn
PID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
533352170,109,60,RL,13517,Pave,,IR1,Lvl,AllPub,CulDSac,...,0,0,0,0,1,0,0,0,0,0
531379050,544,60,RL,11492,Pave,,IR1,Lvl,AllPub,CulDSac,...,0,0,0,0,1,0,0,0,0,0
535304180,153,20,RL,7922,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,0,0,1,0,0,0,0,0


In [23]:
df = dum_locations._get_numeric_data()
# kaggle_test= pd.concat([kaggle_locs,
#             kaggle_test._get_numeric_data()],
#             axis =1)

In [24]:
df.fillna(0, inplace = True)
# kaggle_test.fillna(0, inplace = True)

In [25]:
# df.dropna(inplace = True)
# kaggle_test.dropna(inplace = True)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2051 entries, 533352170 to 527162130
Data columns (total 78 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Id                    2051 non-null   int64  
 1   MS SubClass           2051 non-null   int64  
 2   Lot Area              2051 non-null   int64  
 3   Overall Qual          2051 non-null   int64  
 4   Overall Cond          2051 non-null   int64  
 5   Year Built            2051 non-null   int64  
 6   Year Remod/Add        2051 non-null   int64  
 7   Mas Vnr Area          2051 non-null   float64
 8   BsmtFin SF 1          2051 non-null   float64
 9   BsmtFin SF 2          2051 non-null   float64
 10  Bsmt Unf SF           2051 non-null   float64
 11  Total Bsmt SF         2051 non-null   float64
 12  1st Flr SF            2051 non-null   int64  
 13  2nd Flr SF            2051 non-null   int64  
 14  Low Qual Fin SF       2051 non-null   int64  
 15  Gr Liv A

In [9]:
# kaggle_test = kaggle_test[kaggle_test['Id'] != 0.0]

In [10]:
# kaggle_test.shape

In [27]:
X = df.drop(columns = ['SalePrice'])
y = df['SalePrice']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

### Without a pipeline:
 - Scale
 - fit & evaluate

In [29]:
# ss = StandardScaler()
# ss.fit(X_train)

# Z_train = ss.transform(X_train)
# Z_test = ss.transform(X_test)

In [14]:
alphas = np.linspace(200, 700, num = 100)

In [15]:
# lcv = LassoCV(alphas = alphas)
# lcv.fit(Z_train, y_train)

In [16]:
# lcv.score(Z_train, y_train), lcv.score(Z_test, y_test)

In [17]:
# lcv_preds = lcv.predict(Z_test)

In [18]:
# lcv_resids = y_test - lcv_preds

In [19]:
# lcv.alpha_

In [20]:
# lcv.alphas_[:5]

### With a pipeline:
 - Scale
 - Fit & Evaluate LASSO Model

In [50]:
pipe = Pipeline([
    
    # ('ohe', OneHotEncoder(drop:'first')),
    # ('poly', PolynomialFeatures()),
    ('vt', VarianceThreshold()),
    ('ss', StandardScaler()),
    ('lcv', LassoCV())
])

In [51]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vt', VarianceThreshold(threshold=0.0)),
                ('ss',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('lcv',
                 LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001,
                         fit_intercept=True, max_iter=1000, n_alphas=100,
                         n_jobs=None, normalize=False, positive=False,
                         precompute='auto', random_state=None,
                         selection='cyclic', tol=0.0001, verbose=False))],
         verbose=False)

In [48]:
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.8201641511908436, 0.8577143600984481)

In [45]:
lcv_pipe_preds = pipe.predict(X_test)

In [14]:
# kaggle_preds = pipe.predict(kaggle_test)

In [61]:
# kaggle_test_preds_df = pd.DataFrame({'Id' : kaggle_test['Id'],
#                                      'SalePrice' : kaggle_preds
#                                     })

In [62]:
# submission = pd.DataFrame({
#         'Id' : kaggle_test['Id'].astype(int),
#         'SalePrice' : kaggle_preds
# })

In [63]:
# submission.shape

(878, 2)

In [64]:
# submission.to_csv('submission/pipe_trial.csv', index = False)