In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OrdinalEncoder
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from scipy.stats import chi2_contingency
import time

In [3]:
# Load the training data; the path is relative to the notebook's working directory.
ames = pd.read_csv('datasets/train.csv')
# Preview the first rows to check that the columns parsed as expected.
ames.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [7]:
# First categorical variable for the contingency-table experiment: the 'MS Zoning' column.
x = ames['MS Zoning']

In [8]:
# Second categorical variable for the contingency-table experiment: the 'Street' column.
y = ames['Street']

In [12]:
# Contingency table of counts: one row per level of `x`, one column per level of `y`.
cm = pd.crosstab(x,y)
cm

Street,Grvl,Pave
MS Zoning,Unnamed: 1_level_1,Unnamed: 2_level_1
A (agr),0,2
C (all),3,16
FV,0,101
I (all),1,0
RH,0,14
RL,2,1596
RM,1,315


In [13]:
# Chi-square test of independence on the contingency table. The result holds, in order,
# the test statistic, the p-value, the degrees of freedom and the expected counts.
# NOTE(review): several expected counts in the displayed output are far below 5, so the
# chi-square approximation (and the p-value) should be read with caution for this pair.
chi2 = chi2_contingency(cm)
chi2

(427.91055121232523,
 2.7802256790878933e-89,
 6,
 array([[6.82593857e-03, 1.99317406e+00],
        [6.48464164e-02, 1.89351536e+01],
        [3.44709898e-01, 1.00655290e+02],
        [3.41296928e-03, 9.96587031e-01],
        [4.77815700e-02, 1.39522184e+01],
        [5.45392491e+00, 1.59254608e+03],
        [1.07849829e+00, 3.14921502e+02]]))

In [15]:
# First element of the test result: the chi-square statistic.
chi2[0]

427.91055121232523

In [26]:
# Total number of observations: column sums first, then the sum of those sums.
n = cm.sum().sum()
n

2051

In [20]:
# Phi-squared: the chi-square statistic divided by the number of observations.
phi2 = chi2[0]/n
phi2

0.2086350810396515

In [21]:
# (rows, columns) of the contingency table, i.e. the number of observed levels of each variable.
cm.shape

(7, 2)

In [None]:
def cramers_v(x, y):
    """Bias-corrected Cramér's V between two categorical variables.

    Parameters
    ----------
    x, y : array-like or pandas.Series
        Paired categorical observations; they are cross-tabulated with
        ``pd.crosstab``.

    Returns
    -------
    float
        Square root of the bias-corrected phi-squared divided by the smaller
        of the corrected (rows - 1) and (columns - 1) terms.
    """
    table = pd.crosstab(x, y)
    n_obs = table.sum().sum()
    n_rows, n_cols = table.shape

    # Phi-squared is the chi-square statistic scaled by the sample size.
    phi_sq = chi2_contingency(table)[0] / n_obs

    # Subtract the expected bias and clamp at zero so the square root stays real.
    bias = ((n_cols - 1) * (n_rows - 1)) / (n_obs - 1)
    phi_sq_adj = max(0, phi_sq - bias)

    # Matching small-sample corrections for the table dimensions.
    rows_adj = n_rows - ((n_rows - 1) ** 2) / (n_obs - 1)
    cols_adj = n_cols - ((n_cols - 1) ** 2) / (n_obs - 1)

    denominator = min((cols_adj - 1), (rows_adj - 1))
    return np.sqrt(phi_sq_adj / denominator)

In [None]:
def cramers_V(var1, var2):
    """Cramér's V (uncorrected) between two categorical variables.

    V = sqrt(chi2 / (n * min(rows - 1, cols - 1))), which lies in [0, 1].
    The previous implementation omitted the square root and therefore
    returned V squared.

    Parameters
    ----------
    var1, var2 : array-like or pandas.Series
        Paired categorical observations; they are cross-tabulated with
        ``pd.crosstab``.

    Returns
    -------
    float
        Cramér's V, or NaN when either variable has a single level (the
        statistic is undefined because min(rows, cols) - 1 is zero).
    """
    # Cross table of observed counts as a plain array.
    crosstab = np.array(pd.crosstab(var1, var2, rownames=None, colnames=None))
    # Chi-square test statistic.
    # NOTE(review): chi2_contingency applies Yates' continuity correction to
    # 2x2 tables by default, which lowers V for binary pairs — confirm whether
    # correction=False is wanted here.
    stat = chi2_contingency(crosstab)[0]
    obs = np.sum(crosstab)  # number of observations
    mini = min(crosstab.shape) - 1  # smaller table dimension, minus one
    if mini == 0:
        # Same value the 0/0 division used to produce, without the RuntimeWarning.
        return np.nan
    return np.sqrt(stat / (obs * mini))

| Model No. | Parameters                                                                                                                                                                                                                                                                                                                         | Selected Features                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        | Validation                                                                                             | Kaggle Result                                                            |
|-----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------|
| 1         | **Data Cleaning**<br>maj_threshold = 90%<br>mul_threshold_init = 90%<br>mul_threshold_final = 90%<br><br>**Additional Features**<br>*Root terms*<br>root_variable = False<br>root_after_pf = False<br><br>*Log terms*<br>log_variable = False<br>log_after_pf = False<br><br>*Polynomial features*<br>poly_on = False              | **30 Selected Features are:**<br>['Overall Qual', 'Neighborhood', 'Exter Qual', 'Gr Liv Area',<br>       'Kitchen Qual', 'Bsmt Qual', 'Garage Area', 'Garage Cars',<br>       'Total Bsmt SF', '1st Flr SF', 'Year Built', 'Garage Finish',<br>       'Year Remod/Add', 'Fireplace Qu', 'Full Bath', 'Foundation',<br>       'TotRms AbvGrd', 'Garage Type', 'Mas Vnr Area', 'MS SubClass',<br>       'Fireplaces', 'Heating QC', 'Mas Vnr Type', 'Bsmt Exposure',<br>       'BsmtFin SF 1', 'Exterior 1st', 'BsmtFin Type 1', 'Exterior 2nd',<br>       'Sale Type', 'Open Porch SF']                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   | **Best model:**<br>- Lasso(alpha=1018.9)<br><br>**Validation set RMSE:**<br>- Validation set: 36,959.6 | **Actual RMSE from Kaggle**<br>- Private: 32,865.0<br>- Public: 32,689.3 |
| 2         | **Data Cleaning**<br>maj_threshold = 90%<br>mul_threshold_init = 90%<br>mul_threshold_final = 90%<br><br>**Additional Features**<br>*Root terms*<br>root_variable = False<br>root_after_pf = False<br><br>*Log terms*<br>log_variable = False<br>log_after_pf = False<br><br>*Polynomial features*<br>poly_on = **True**           | **30 Selected Features are:**<br>['Neighborhood Gr Liv Area', 'Overall Qual Gr Liv Area',<br>       'Overall Qual^2', 'Exter Qual Gr Liv Area',<br>       'Neighborhood 1st Flr SF', 'Overall Qual Garage Cars',<br>       'Gr Liv Area Kitchen Qual', 'Gr Liv Area Garage Cars',<br>       'Overall Qual 1st Flr SF', '1st Flr SF Kitchen Qual',<br>       'Total Bsmt SF Garage Cars', 'Neighborhood Fireplace Qu',<br>       'Neighborhood Garage Finish', 'Full Bath Kitchen Qual',<br>       'Overall Qual Full Bath', 'Foundation Gr Liv Area',<br>       'Exter Qual Fireplace Qu', 'MS Zoning Overall Qual',<br>       'Exter Qual Garage Type', 'Kitchen Qual Garage Type',<br>       'Exter Qual Full Bath', 'Heating QC Gr Liv Area',<br>       'Garage Type Garage Area', 'Kitchen Qual Garage Finish',<br>       'Gr Liv Area Garage Type', 'Overall Qual Garage Qual',<br>       'Fireplace Qu Garage Area', 'Gr Liv Area Garage Finish',<br>       'Overall Qual Garage Type', 'MS SubClass Kitchen Qual']                                                                                                                                | **Best model:**<br>- Lasso(alpha=1010.5)<br><br>**Validation set RMSE:**<br>- Validation set: 33,178.6 | **Actual RMSE from Kaggle**<br>- Private: 30,041.4<br>- Public: 28,955.5 |
| 3         | **Data Cleaning**<br>maj_threshold = 90%<br>mul_threshold_init = 90%<br>mul_threshold_final = 90%<br><br>**Additional Features**<br>*Root terms*<br>root_variable = **True**<br>root_after_pf = False<br><br>*Log terms*<br>log_variable = False<br>log_after_pf = False<br><br>*Polynomial features*<br>poly_on = False           | **30 Selected Features are:**<br>['Overall Qual', 'Neighborhood', 'Exter Qual', 'Gr Liv Area_root',<br>       'Kitchen Qual', 'Bsmt Qual', 'Garage Area', 'Garage Cars',<br>       'Total Bsmt SF', '1st Flr SF_root', 'Year Built', 'Garage Finish',<br>       'Year Remod/Add', 'Fireplace Qu', 'Full Bath', 'Foundation',<br>       'TotRms AbvGrd', 'Garage Type', 'Mas Vnr Area', 'Fireplaces_root',<br>       'MS SubClass', 'Heating QC', 'Mas Vnr Type', 'Open Porch SF_root',<br>       'Bsmt Exposure', 'BsmtFin SF 1', 'Exterior 1st', 'BsmtFin Type 1',<br>       'Exterior 2nd', 'Sale Type']                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               | **Best model:**<br>- Lasso(alpha=245.5)<br><br>**Validation set RMSE:**<br>- Validation set: 35,426.6  | **Actual RMSE from Kaggle**<br>- Private: 33,145.0<br>- Public: 33,773.1 |
| 4         | **Data Cleaning**<br>maj_threshold = 90%<br>mul_threshold_init = 90%<br>mul_threshold_final = 90%<br><br>**Additional Features**<br>*Root terms*<br>root_variable = **True**<br>root_after_pf = False<br><br>*Log terms*<br>log_variable = False<br>log_after_pf = False<br><br>*Polynomial features*<br>poly_on = **True**        | **30 Selected Features are:**<br>['Overall Qual Gr Liv Area_root', 'Neighborhood Gr Liv Area',<br>       'Overall Qual 1st Flr SF_root', 'Exter Qual Gr Liv Area',<br>       'Neighborhood 1st Flr SF', 'Overall Qual Garage Cars',<br>       'Gr Liv Area Kitchen Qual', 'Gr Liv Area Garage Cars',<br>       'Bsmt Qual 1st Flr SF_root', 'Overall Qual Lot Frontage_root',<br>       'Overall Qual Garage Type_root', 'Exter Qual Fireplace Qu_root',<br>       'Total Bsmt SF Garage Cars', 'Overall Qual Sale Type_root',<br>       'Overall Qual Lot Area_root', 'Kitchen Qual Fireplace Qu_root',<br>       'Overall Qual Fireplace Qu_root',<br>       'Overall Qual Garage Finish_root', 'Exter Qual Lot Area_root',<br>       'Garage Area Fireplace Qu_root', 'Overall Qual MS SubClass_root',<br>       'Neighborhood Bsmt Exposure_root', 'Garage Type Garage Area',<br>       'Gr Liv Area MS SubClass_root', 'Overall Qual Mas Vnr Type_root',<br>       'Gr Liv Area Mas Vnr Type_root', 'Kitchen Qual Lot Area_root',<br>       'Gr Liv Area Bsmt Exposure_root', 'Gr Liv Area Fireplace Qu_root',<br>       'MS SubClass Garage Area'] | **Best model:**<br>- Lasso(alpha=871.2)<br><br>**Validation set RMSE:**<br>- Validation set: 33,178.2  | **Actual RMSE from Kaggle**<br>- Private: 29,442.0<br>- Public: 28,684.8 |
| 5         | **Data Cleaning**<br>maj_threshold = 90%<br>mul_threshold_init = 90%<br>mul_threshold_final = 90%<br><br>**Additional Features**<br>*Root terms*<br>root_variable = **True**<br>root_after_pf = **True**<br><br>*Log terms*<br>log_variable = False<br>log_after_pf = False<br><br>*Polynomial features*<br>poly_on = **True**     | **30 Selected Features are:**<br>['Neighborhood Gr Liv Area', 'Overall Qual Gr Liv Area',<br>       'Overall Qual^2', 'Exter Qual Gr Liv Area',<br>       'Neighborhood 1st Flr SF', 'Overall Qual Garage Cars',<br>       'Gr Liv Area Kitchen Qual', 'Gr Liv Area Garage Cars',<br>       'Overall Qual 1st Flr SF', '1st Flr SF Kitchen Qual',<br>       'Total Bsmt SF Garage Cars', 'Neighborhood Fireplace Qu',<br>       'Neighborhood Garage Finish', 'Full Bath Kitchen Qual',<br>       'Overall Qual Full Bath', 'Foundation Gr Liv Area',<br>       'Exter Qual Fireplace Qu', 'MS Zoning Overall Qual',<br>       'Exter Qual Garage Type', 'Kitchen Qual Garage Type',<br>       'Exter Qual Full Bath', 'Heating QC Gr Liv Area',<br>       'Garage Type Garage Area', 'Kitchen Qual Garage Finish',<br>       'Gr Liv Area Garage Type', 'Overall Qual Garage Qual',<br>       'Fireplace Qu Garage Area', 'Gr Liv Area Garage Finish',<br>       'Overall Qual Garage Type', 'MS SubClass Kitchen Qual']                                                                                                                                | **Best model:**<br>- Lasso(alpha=1010.6)<br><br>**Validation set RMSE:**<br>- Validation set: 33,178.6 | **Actual RMSE from Kaggle**<br>- Private: 30,041.4<br>- Public: 28,955.5 |
| 6         | **Data Cleaning**<br>maj_threshold = 90%<br>mul_threshold_init = 90%<br>mul_threshold_final = 90%<br><br>**Additional Features**<br>*Root terms*<br>root_variable = False<br>root_after_pf = False<br><br>*Log terms*<br>log_variable = **True**<br>log_after_pf = False<br><br>*Polynomial features*<br>poly_on = False           | **30 Selected Features are:**<br>['Overall Qual', 'Neighborhood', 'Exter Qual', 'Gr Liv Area',<br>       'Kitchen Qual', 'Bsmt Qual', 'Garage Area', 'Garage Cars',<br>       'Total Bsmt SF', '1st Flr SF', 'Year Built', 'Garage Finish',<br>       'Year Remod/Add', 'Fireplace Qu', 'Full Bath', 'Foundation',<br>       'TotRms AbvGrd', 'Garage Type', 'Mas Vnr Area', 'MS SubClass',<br>       'Fireplaces', 'Heating QC', 'Mas Vnr Type', 'Bsmt Exposure',<br>       'BsmtFin SF 1', 'Exterior 1st', 'BsmtFin Type 1', 'Exterior 2nd',<br>       'Sale Type', 'Lot Area_log']                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    | **Best model:**<br>- Ridge(alpha=91.2)<br><br>**Validation set RMSE:**<br>- Validation set: 36,094.0   | **Actual RMSE from Kaggle**<br>- Private: 32,645.5<br>- Public: 32,660.6 |
| 7         | **Data Cleaning**<br>maj_threshold = 90%<br>mul_threshold_init = 90%<br>mul_threshold_final = 90%<br><br>**Additional Features**<br>*Root terms*<br>root_variable = False<br>root_after_pf = False<br><br>*Log terms*<br>log_variable = **True**<br>log_after_pf = False<br><br>*Polynomial features*<br>poly_on = **True**        | **30 Selected Features are:**<br>['Neighborhood Gr Liv Area', 'Overall Qual Lot Area_log',<br>       'Overall Qual Gr Liv Area', 'Exter Qual Gr Liv Area',<br>       'Neighborhood 1st Flr SF', 'Overall Qual Garage Cars',<br>       'Gr Liv Area Kitchen Qual', 'Gr Liv Area Garage Cars',<br>       'Overall Qual 1st Flr SF', '1st Flr SF Kitchen Qual',<br>       'Total Bsmt SF Garage Cars', 'Neighborhood Fireplace Qu',<br>       'Neighborhood Garage Finish', 'Bsmt Qual TotRms AbvGrd_log',<br>       'Overall Qual Full Bath', 'Foundation Gr Liv Area',<br>       'Exter Qual Fireplace Qu', 'MS Zoning Overall Qual',<br>       'Exter Qual Garage Type', 'Kitchen Qual Garage Type',<br>       'Heating QC Gr Liv Area', 'Garage Type Garage Area',<br>       'Kitchen Qual Garage Finish', 'Gr Liv Area Garage Type',<br>       '1st Flr SF_log Gr Liv Area_log', 'Overall Qual Garage Qual',<br>       'Fireplace Qu Garage Area', 'Gr Liv Area Garage Finish',<br>       'Overall Qual Garage Type', 'MS SubClass Kitchen Qual']                                                                                                      | **Best model:**<br>- Lasso(alpha=1288.5)<br><br>**Validation set RMSE:**<br>- Validation set: 32,563.2 | **Actual RMSE from Kaggle**<br>- Private: 30,073.7<br>- Public: 29,436.8 |
| 8         | **Data Cleaning**<br>maj_threshold = 90%<br>mul_threshold_init = 90%<br>mul_threshold_final = 90%<br><br>**Additional Features**<br>*Root terms*<br>root_variable = False<br>root_after_pf = False<br><br>*Log terms*<br>log_variable = **True**<br>log_after_pf = **True**<br><br>*Polynomial features*<br>poly_on = **True**     | **30 Selected Features are:**<br>['Neighborhood Gr Liv Area', 'Overall Qual Gr Liv Area',<br>       'Overall Qual^2', 'Exter Qual Gr Liv Area',<br>       'Neighborhood 1st Flr SF', 'Overall Qual Garage Cars',<br>       'Gr Liv Area Kitchen Qual', 'Gr Liv Area Garage Cars',<br>       'Overall Qual 1st Flr SF', '1st Flr SF Kitchen Qual',<br>       'Total Bsmt SF Garage Cars', 'Neighborhood Fireplace Qu',<br>       'Neighborhood Garage Finish', 'Full Bath Kitchen Qual',<br>       'Overall Qual Full Bath', 'Foundation Gr Liv Area',<br>       'Exter Qual Fireplace Qu', 'MS Zoning Overall Qual',<br>       'Exter Qual Garage Type', 'Kitchen Qual Garage Type',<br>       'Exter Qual Full Bath', 'Heating QC Gr Liv Area',<br>       'Garage Type Garage Area', 'Kitchen Qual Garage Finish',<br>       'Gr Liv Area Garage Type', 'Overall Qual Garage Qual',<br>       'Fireplace Qu Garage Area', 'Gr Liv Area Garage Finish',<br>       'Overall Qual Garage Type', 'MS SubClass Kitchen Qual']                                                                                                                                | **Best model:**<br>- Lasso(alpha=1010.6)<br><br>**Validation set RMSE:**<br>- Validation set: 33,178.6 | **Actual RMSE from Kaggle**<br>- Private: 29,367.2<br>- Public: 28,887.5 |
|           |                                                                                                                                                                                                                                                                                                                                    |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |                                                                                                        |                                                                          |

In [None]:
# NOTE(review): this cell held only the tail of a list literal (the opening bracket
# was missing), which is a SyntaxError on Restart & Run All. The bracket is restored
# so the cell evaluates to a plain list of the original strings.
# Presumably these are pasted interaction-term names; entries such as
# 'index Neighborhood' and 'PID Exter Qual' involve identifier-like columns — confirm
# those were meant to be candidate features, or delete this scratch cell.
['Neighborhood Gr Liv Area', 'Neighborhood Roof Style',
       'Mas Vnr Type Gr Liv Area', 'Neighborhood Bsmt Exposure',
       'Neighborhood BsmtFin SF 1', 'Neighborhood Mas Vnr Area',
       'Land Contour Neighborhood', 'Land Contour Garage Area',
       'Roof Style Fireplace Qu', 'Lot Frontage Full Bath',
       'Lot Area Neighborhood', 'Overall Qual Overall Cond',
       'Neighborhood Open Porch SF', 'Neighborhood Mo Sold',
       'Lot Frontage Garage Type', 'Lot Frontage Exterior 1st',
       'Lot Frontage Heating QC', 'Neighborhood Wood Deck SF',
       'Neighborhood Bsmt Full Bath', 'Land Contour Fireplace Qu',
       'Lot Config Neighborhood', 'Lot Frontage BsmtFin Type 1',
       'Exter Qual Mo Sold', 'index Neighborhood',
       'Condition 1 Heating QC', 'Neighborhood Bsmt Unf SF',
       'PID Exter Qual', 'Overall Cond Bsmt Qual',
       'Mas Vnr Area Bsmt Full Bath', 'Garage Area Mo Sold',
       'Neighborhood Half Bath', 'Lot Shape Kitchen Qual',
       'Land Contour Mas Vnr Type', 'Land Contour BsmtFin SF 1',
       'Mas Vnr Area Bsmt Unf SF', 'Condition 1 Exterior 1st',
       'Lot Area Mas Vnr Type', 'Bsmt Unf SF Fireplaces',
       'Garage Finish Mo Sold', 'Lot Frontage Sale Type']

In [None]:
# Scratch expression: evaluates to a tuple of four column names and is not assigned to anything.
# NOTE(review): assign it to a named variable if it is needed later, otherwise delete the cell.
'Electrical', 'Garage Finish', 'Garage Qual', 'Garage Cond'

In [None]:
# For Categorical Variables

def categorical_features(df, thresh):
    """List the columns of `df` that look categorical.

    A column qualifies when its dtype is ``object`` or ``int64`` and it has at
    most `thresh` distinct values (a missing value counts as one of them,
    because ``Series.unique`` keeps NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    thresh : int
        Largest number of distinct values a column may have and still qualify.

    Returns
    -------
    list
        Qualifying column names, in the frame's column order.
    """
    def _qualifies(series):
        kind = series.dtype
        if not (kind == 'object' or kind == 'int64'):
            return False
        return len(series.unique()) <= thresh

    return [name for name in df.columns if _qualifies(df[name])]

# Treat object/int64 columns with at most 30 distinct values as categorical.
cat_features = categorical_features(ames, 30)
# Display the selected column names as an array.
np.array(cat_features)

In [None]:
def countplotter(df, y, ax):
    """Draw a horizontal count plot of one categorical column.

    Categories are ordered by their mean 'SalePrice', ascending. The data is
    taken from `df`; previously the argument was ignored and the global
    `ames` frame was always plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain column `y` and a 'SalePrice' column.
    y : str
        Name of the categorical column placed on the y-axis.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    """
    category_order = (
        df.groupby(y)[['SalePrice']].mean().sort_values(by='SalePrice').index
    )
    sns.countplot(
        data=df,
        y=y,
        order=category_order,
        ax=ax,
        edgecolor=sns.color_palette("dark:black", 3),
    )
    
def boxplotter(df, x, y, ax):
    """Draw horizontal box plots of a numeric column, one box per category.

    Categories are ordered by the mean of `x` within each category, ascending.
    The data is taken from `df`; previously the argument was ignored and the
    global `ames` frame was always plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain columns `x` and `y`.
    x : str
        Name of the numeric column shown on the x-axis.
    y : str
        Name of the categorical column shown on the y-axis.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    """
    category_order = df.groupby(y)[[x]].mean().sort_values(by=x).index
    sns.boxplot(
        data=df,
        x=x,
        y=y,
        order=category_order,
        orient='h',
        ax=ax,
    )
    ax.set_xlabel(x)
    
def categorical_plot(df, cat_feat):
    """Plot each categorical feature against 'SalePrice'.

    For every column in `cat_feat` a new figure is created with two panels that
    share the y-axis: box plots of 'SalePrice' per category on the left (wide
    panel) and a count plot of the categories on the right (narrow panel).

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the categorical columns and 'SalePrice'.
    cat_feat : iterable of str
        Names of the categorical columns to plot.
    """
    # One figure per feature can exceed matplotlib's open-figure limit, so the
    # warning is silenced before any figure is created (it used to be set
    # inside the loop, after the figure already existed).
    plt.rcParams.update({'figure.max_open_warning': 0})

    for col in cat_feat:
        number_of_cat = len(df[col].unique())
        # Half a unit of height per category, but never 0: a single-category
        # column would otherwise request an invalid zero-height figure.
        graph_height = max(1, number_of_cat // 2)
        fig, axes = plt.subplots(
            nrows=1,
            ncols=2,
            figsize=(14, graph_height),
            sharey=True,
            gridspec_kw={'width_ratios': [3, 1]},
        )
        # With nrows=1 and ncols=2 `axes` is already a flat array of two Axes.
        boxplotter(df, 'SalePrice', col, axes[0])
        countplotter(df, col, axes[1])

In [None]:
### For Numerical Variable

# Every column that was not picked up as categorical is treated as numerical.
# NOTE(review): this is a plain complement of `cat_features`, so it presumably also keeps
# identifier-like columns (the preview shows 'Id' and 'PID'), 'SalePrice' itself, and any
# non-numeric column with more distinct values than the threshold — confirm that is intended.
num_features = [col for col in ames.columns if col not in cat_features]
np.array(num_features)

In [None]:
def scatter_plot(df, num_feat, target='SalePrice'):
    """Draw one scatter plot per numerical feature against the target column.

    The original body called ``sns.scatterplot()`` with no arguments, so every
    figure was empty and `df` was never used. Each feature is now plotted on
    the x-axis against `target` on the y-axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the target column.
    num_feat : iterable of str
        Names of the columns to plot on the x-axis.
    target : str, default 'SalePrice'
        Name of the column plotted on the y-axis.
    """
    for col in num_feat:
        if col == target:
            # Plotting the target against itself carries no information.
            continue
        fig, ax = plt.subplots(nrows=1, figsize=(6, 6))
        sns.scatterplot(data=df, x=col, y=target, ax=ax)
        ax.set_title('{} vs {}'.format(target, col))

In [None]:
# Correlation between 'PID' and 'SalePrice': the off-diagonal entry of the 2x2 correlation matrix.
ames[['PID','SalePrice']].corr().iloc[1,0]

In [None]:
# Absolute correlation of every numerical feature with 'SalePrice', rounded to 3 decimals.
corr_dic = {}
for col in num_features:
    if col == 'SalePrice':
        # Skip the target itself.
        continue
    # Off-diagonal entry of the 2x2 matrix is the feature/target correlation.
    pair_corr = ames[[col, 'SalePrice']].corr()
    corr_dic[col] = round(abs(pair_corr.iloc[1, 0]), 3)

In [None]:
# Tabulate the absolute correlations: one row per variable.
corr = pd.DataFrame(corr_dic.items(), columns=['Numerical Variables','SalePrice Correlation (abs)'])
# Display sorted in ascending order, so the strongest correlations appear last.
# The sorted frame is only displayed; `corr` itself keeps its original order.
corr.sort_values(by='SalePrice Correlation (abs)')