In [1]:
pip install spacy 

Note: you may need to restart the kernel to use updated packages.


In [2]:
import spacy


In [3]:
nlp = spacy.load("en_core_web_lg")

In [4]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

# 1. Importing the data 

In [2]:
data=pd.read_csv("test.csv")

In [3]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Price
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2010,WD,Normal,13300000
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,,,Gar2,12500,6,2010,WD,Normal,12250000
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,3,2010,WD,Normal,12250000
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,12215000
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,0,,,,0,1,2010,WD,Normal,11410000


In [4]:
print(data.shape)

(1459, 81)


In [5]:
print(data.columns)

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

# 2. Dealing with the missing values 

In [6]:
mask=data.isna()

In [7]:
count_na=mask.sum()

In [8]:
count_na

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Price              0
Length: 81, dtype: int64

In [9]:
# dropping columns with more than 500 missing values 

In [10]:
columns_to_drop = count_na[count_na > 500].index.tolist()

In [11]:
data_clean = data.drop(columns_to_drop, axis=1)

In [12]:
data_clean.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Price
0,1461,20,RH,80.0,11622,Pave,Reg,Lvl,AllPub,Inside,...,0,0,120,0,0,6,2010,WD,Normal,13300000
1,1462,20,RL,81.0,14267,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,0,12500,6,2010,WD,Normal,12250000
2,1463,60,RL,74.0,13830,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,3,2010,WD,Normal,12250000
3,1464,60,RL,78.0,9978,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,6,2010,WD,Normal,12215000
4,1465,120,RL,43.0,5005,Pave,IR1,HLS,AllPub,Inside,...,0,0,144,0,0,1,2010,WD,Normal,11410000


In [13]:
data_clean.isna().sum().sort_values(ascending=False)


LotFrontage     227
GarageFinish     78
GarageYrBlt      78
GarageQual       78
GarageCond       78
               ... 
CentralAir        0
Electrical        0
1stFlrSF          0
2ndFlrSF          0
Price             0
Length: 76, dtype: int64

In [14]:
print(data_clean.dtypes)

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Price              int64
Length: 76, dtype: object


In [15]:
# filling all na values with mode value 

In [16]:
data_clean1=data_clean.fillna(data_clean.mode().iloc[0])

In [17]:
data_clean1.isna().sum().sort_values(ascending=False)


Id              0
FullBath        0
Fireplaces      0
Functional      0
TotRmsAbvGrd    0
               ..
MasVnrType      0
Exterior2nd     0
Exterior1st     0
RoofMatl        0
Price           0
Length: 76, dtype: int64

# 3 . Encoding the categorical variables 

In [18]:
categorical_vars = data_clean1.select_dtypes(include=['object']).columns

In [19]:
categorical_vars 

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')

In [20]:
# encoding them 

In [21]:
for var in categorical_vars:
    data_clean1[var] = data_clean1[var].astype('category').cat.codes
    

In [22]:
print(data_clean1.dtypes)

Id                 int64
MSSubClass         int64
MSZoning            int8
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType            int8
SaleCondition       int8
Price              int64
Length: 76, dtype: object


In [23]:
data_clean1.head(20)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Price
0,1461,20,2,80.0,11622,1,3,3,0,4,...,0,0,120,0,0,6,2010,8,4,13300000
1,1462,20,3,81.0,14267,1,0,3,0,0,...,0,0,0,0,12500,6,2010,8,4,12250000
2,1463,60,3,74.0,13830,1,0,3,0,4,...,0,0,0,0,0,3,2010,8,4,12250000
3,1464,60,3,78.0,9978,1,0,3,0,4,...,0,0,0,0,0,6,2010,8,4,12215000
4,1465,120,3,43.0,5005,1,0,1,0,4,...,0,0,144,0,0,1,2010,8,4,11410000
5,1466,60,3,75.0,10000,1,0,3,0,0,...,0,0,0,0,0,4,2010,8,4,10850000
6,1467,20,3,60.0,7980,1,0,3,0,4,...,0,0,0,0,500,3,2010,8,4,10150000
7,1468,60,3,63.0,8402,1,0,3,0,4,...,0,0,0,0,0,5,2010,8,4,10150000
8,1469,20,3,85.0,10176,1,3,3,0,4,...,0,0,0,0,0,2,2010,8,4,9870000
9,1470,20,3,70.0,8400,1,3,3,0,0,...,0,0,0,0,0,4,2010,8,4,9800000


# 4. creating the contingency tables 

#1. coningeny tables between two categorical variables 

In [24]:
# Neighborhood and house style  

In [25]:
contingency_table = pd.crosstab(data['Neighborhood'], data['HouseStyle'])

In [26]:
print(contingency_table)

HouseStyle    1.5Fin  1.5Unf  1Story  2.5Unf  2Story  SFoyer  SLvl
Neighborhood                                                      
Blmngtn            0       0      11       0       0       0     0
Blueste            0       0       3       0       5       0     0
BrDale             0       0       0       0      14       0     0
BrkSide           31       2      12       1       4       0     0
ClearCr            2       0      11       0       2       0     1
CollgCr            0       0      67       0      45       2     3
Crawfor           12       0      22       3      13       0     2
Edwards           21       0      49       1      10       8     5
Gilbert            0       0      15       0      59       0    12
IDOTRR            20       1      20       1      11       1     2
MeadowV            0       0       2       0      14       2     2
Mitchel            1       0      35       0      11      13     5
NAmes             14       2     172       0      13       6  

In [27]:
# 2 zoning vs type of dwelling 

In [28]:
contingency_table1 = pd.crosstab(data['MSZoning'], data['MSSubClass'])

In [29]:
print(contingency_table1)

MSSubClass  20   30   40   45   50   60   70   75   80   85   90   120  150  \
MSZoning                                                                      
C (all)       1    6    0    0    3    0    3    0    0    0    0    0    0   
FV           21    0    0    0    0   18    0    0    0    0    0   14    0   
RH            1    1    0    0    1    0    0    0    0    0    1    4    0   
RL          508   28    2    2   71  256   27    3   57   27   49   58    1   
RM           10   34    0    4   68    2   37    4    3    1    7   19    0   

MSSubClass  160  180  190  
MSZoning                   
C (all)       0    0    2  
FV           21    0    0  
RH            0    0    2  
RL           10    0   15  
RM           34    7   12  


In [30]:
# 3 heating quality and central air 

In [31]:
contingency_table2 = pd.crosstab(data['CentralAir'], data['HeatingQC'])

In [32]:
contingency_table2

HeatingQC,Ex,Fa,Gd,Po,TA
CentralAir,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
N,8,18,16,1,58
Y,744,25,217,1,371


In [33]:
# 4 fireplace quality and number of fire places 

In [34]:
contingency_table3=pd.crosstab(data['FireplaceQu'],data['Fireplaces'])

In [35]:
contingency_table3

Fireplaces,1,2,3,4
FireplaceQu,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ex,18,1,0,0
Fa,35,6,0,0
Gd,303,58,3,0
Po,26,0,0,0
TA,236,39,3,1


In [36]:
# 4 type of roof and roof material 

In [37]:
contingency_table4=pd.crosstab(data['RoofStyle'],data['RoofMatl'])

In [38]:
contingency_table4

RoofMatl,CompShg,Tar&Grv,WdShake,WdShngl
RoofStyle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Flat,0,7,0,0
Gable,1164,5,0,0
Gambrel,11,0,0,0
Hip,262,0,3,0
Mansard,3,0,1,0
Shed,2,0,0,1


In [39]:
# 5 elctrical system , heating quality and air conditioning 

In [40]:
contingency_table5=pd.crosstab(data['Electrical'],data['CentralAir'])

In [41]:
contingency_table5

CentralAir,N,Y
Electrical,Unnamed: 1_level_1,Unnamed: 2_level_1
FuseA,28,66
FuseF,12,11
FuseP,4,1
SBrkr,57,1280


# 5 Using Chi-square to determine whethr there is a significant difference between the variables 

In [42]:
import pandas as pd
from scipy.stats import chi2_contingency

In [43]:
# 1 for the first contingency table 

In [44]:
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)

# print the results
print('Chi-square statistic:', chi2_stat)
print('P-value:', p_val)
print('Degrees of freedom:', dof)
print('Expected frequencies:', expected)

Chi-square statistic: 907.7995948640149
P-value: 4.591814585050864e-111
Degrees of freedom: 144
Expected frequencies: [[1.20630569e+00 3.76970528e-02 5.61686086e+00 9.80123372e-02
  3.21932831e+00 3.46812886e-01 4.74982865e-01]
 [8.77313228e-01 2.74160384e-02 4.08498972e+00 7.12816998e-02
  2.34132968e+00 2.52227553e-01 3.45442084e-01]
 [1.53529815e+00 4.79780672e-02 7.14873201e+00 1.24742975e-01
  4.09732694e+00 4.41398218e-01 6.04523646e-01]
 [5.48320768e+00 1.71350240e-01 2.55311857e+01 4.45510624e-01
  1.46333105e+01 1.57642221e+00 2.15901302e+00]
 [1.75462646e+00 5.48320768e-02 8.16997944e+00 1.42563400e-01
  4.68265936e+00 5.04455106e-01 6.90884167e-01]
 [1.28307060e+01 4.00959561e-01 5.97429746e+01 1.04249486e+00
  3.42419465e+01 3.68882796e+00 5.05209047e+00]
 [5.70253598e+00 1.78204249e-01 2.65524332e+01 4.63331049e-01
  1.52186429e+01 1.63947910e+00 2.24537354e+00]
 [1.03084304e+01 3.22138451e-01 4.79986292e+01 8.37559973e-01
  2.75106237e+01 2.96367375e+00 4.05894448e+00]
 [

- the results indicate a signiifcant relationship between the two variables 

In [45]:
# second contingency table 

In [46]:
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table1)

# print the results
print('Chi-square statistic:', chi2_stat)
print('P-value:', p_val)
print('Degrees of freedom:', dof)
print('Expected frequencies:', expected)

Chi-square statistic: 776.9974114690623
P-value: 2.8609492282874874e-125
Degrees of freedom: 60
Expected frequencies: [[5.57731959e+00 7.11340206e-01 2.06185567e-02 6.18556701e-02
  1.47422680e+00 2.84536082e+00 6.90721649e-01 7.21649485e-02
  6.18556701e-01 2.88659794e-01 5.87628866e-01 9.79381443e-01
  1.03092784e-02 6.70103093e-01 7.21649485e-02 3.19587629e-01]
 [2.75147766e+01 3.50927835e+00 1.01718213e-01 3.05154639e-01
  7.27285223e+00 1.40371134e+01 3.40756014e+00 3.56013746e-01
  3.05154639e+00 1.42405498e+00 2.89896907e+00 4.83161512e+00
  5.08591065e-02 3.30584192e+00 3.56013746e-01 1.57663230e+00]
 [3.71821306e+00 4.74226804e-01 1.37457045e-02 4.12371134e-02
  9.82817869e-01 1.89690722e+00 4.60481100e-01 4.81099656e-02
  4.12371134e-01 1.92439863e-01 3.91752577e-01 6.52920962e-01
  6.87285223e-03 4.46735395e-01 4.81099656e-02 2.13058419e-01]
 [4.14208935e+02 5.28288660e+01 1.53127148e+00 4.59381443e+00
  1.09485911e+02 2.11315464e+02 5.12975945e+01 5.35945017e+00
  4.5938144

- the results indicate a significant relationship between the two variables since the p-value is < than 0.05

In [47]:
# third contingency table 

In [48]:
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table2)

# print the results
print('Chi-square statistic:', chi2_stat)
print('P-value:', p_val)
print('Degrees of freedom:', dof)
print('Expected frequencies:', expected)

Chi-square statistic: 156.26104680755472
P-value: 9.261660470996078e-33
Degrees of freedom: 4
Expected frequencies: [[5.20575737e+01 2.97669637e+00 1.61295408e+01 1.38450994e-01
  2.96977382e+01]
 [6.99942426e+02 4.00233036e+01 2.16870459e+02 1.86154901e+00
  3.99302262e+02]]


- the results show a significant relationship between the two variables since the p-value < 0.05

In [49]:
# fourth contingency table 

In [50]:
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table3)

# print the results
print('Chi-square statistic:', chi2_stat)
print('P-value:', p_val)
print('Degrees of freedom:', dof)
print('Expected frequencies:', expected)

Chi-square statistic: 9.092101117705248
P-value: 0.6950442879792258
Degrees of freedom: 12
Expected frequencies: [[1.61069959e+01 2.71056241e+00 1.56378601e-01 2.60631001e-02]
 [3.47572016e+01 5.84910837e+00 3.37448560e-01 5.62414266e-02]
 [3.08576132e+02 5.19286694e+01 2.99588477e+00 4.99314129e-01]
 [2.20411523e+01 3.70919067e+00 2.13991770e-01 3.56652949e-02]
 [2.36518519e+02 3.98024691e+01 2.29629630e+00 3.82716049e-01]]


- the p-value is > 0.05 and so , there is no significant relationship between the two variables 

In [51]:
# fifth contingency table 

In [52]:
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table4)

# print the results
print('Chi-square statistic:', chi2_stat)
print('P-value:', p_val)
print('Degrees of freedom:', dof)
print('Expected frequencies:', expected)

Chi-square statistic: 1434.1203784563504
P-value: 7.54474301628423e-297
Degrees of freedom: 15
Expected frequencies: [[6.91843729e+00 5.75736806e-02 1.91912269e-02 4.79780672e-03]
 [1.15537903e+03 9.61480466e+00 3.20493489e+00 8.01233722e-01]
 [1.08718300e+01 9.04729267e-02 3.01576422e-02 7.53941056e-03]
 [2.61912269e+02 2.17957505e+00 7.26525017e-01 1.81631254e-01]
 [3.95339273e+00 3.28992461e-02 1.09664154e-02 2.74160384e-03]
 [2.96504455e+00 2.46744345e-02 8.22481151e-03 2.05620288e-03]]


- The results indicate that there exists a significant relationship between the two variables 

In [53]:
# sixth contingency table 

In [54]:
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table5)

# print the results
print('Chi-square statistic:', chi2_stat)
print('P-value:', p_val)
print('Degrees of freedom:', dof)
print('Expected frequencies:', expected)

Chi-square statistic: 205.47731645673883
P-value: 2.7643181719339983e-44
Degrees of freedom: 3
Expected frequencies: [[6.50719671e+00 8.74928033e+01]
 [1.59218643e+00 2.14078136e+01]
 [3.46127485e-01 4.65387252e+00]
 [9.25544894e+01 1.24444551e+03]]


- The results indicate that there exists a significant relationship between the two variables 

# 6. Corresponence analysis 

In [55]:
import prince
from prince import MCA
import seaborn as sns
import matplotlib.pyplot as plt
from prince import CA

In [56]:
# sixth contingency table

In [57]:
ca=prince.CA(
    n_components=2,
    n_iter=3,
    copy=True,
    check_input=True,
    engine="sklearn",
    random_state=42)

In [58]:
contingency_table5=pd.crosstab(data_clean['Electrical'],data_clean['CentralAir'])

In [59]:
ca = prince.CA(n_components=2)
ca=ca.fit(contingency_table5)

In [60]:
summary = {
    'Total inertia': ca.total_inertia_,
}
summary

{'Total inertia': 0.14083434986753862}

In [61]:
sv = np.sqrt(ca.total_inertia_)

In [62]:
# explained variance for each dimension 
explained_var = sv**2 / np.sum(sv**2)
print('Explained variance:', explained_var)

Explained variance: 1.0


In [63]:
u, s, vh = np.linalg.svd(contingency_table5)
eigenvalues = (s ** 2) / contingency_table5.values.sum()

In [64]:
print("Eigen Values:",eigenvalues)

Eigen Values: [1.12837448e+03 5.29563346e-01]


- since the eigen value for the first dimension is greater than 1 , there exists a relationship between the twoo variables

In [65]:
# fifth contingency table

In [66]:
ca2=ca.fit(contingency_table4)

In [67]:
summary = {
    'Total inertia': ca2.total_inertia_,
}
summary

{'Total inertia': 0.9829474835204596}

In [68]:
sv2 = np.sqrt(ca.total_inertia_)

In [69]:
# explained variance for each dimension 
explained_var2 = sv2**2 / np.sum(sv2**2)
print('Explained variance:', explained_var2)

Explained variance: 1.0


In [70]:
u, s, vh = np.linalg.svd(contingency_table4)
eigenvalues1 = (s ** 2) / contingency_table4.values.sum()

In [71]:
print("Eigen Values:",eigenvalues1)

Eigen Values: [9.75804133e+02 3.45862263e-02 6.37996009e-03 6.85397881e-04]


- since the first dimension has an eigen value of greater than 1 , there exists a relationship but it is not quite significant 

In [72]:
# fourth contingency table 

In [73]:
ca3=ca.fit(contingency_table3)

In [74]:
summary = {
    'Total inertia': ca3.total_inertia_,
}
summary

{'Total inertia': 0.012472017994108705}

In [75]:
sv3 = np.sqrt(ca.total_inertia_)

In [76]:
# explained variance for each dimension 
explained_var3 = sv3**2 / np.sum(sv3**2)
print('Explained variance:', explained_var3)

Explained variance: 1.0


In [77]:
u, s, vh = np.linalg.svd(contingency_table3)
eigenvalues2 = (s ** 2) / contingency_table3.values.sum()

In [78]:
print("Eigen values:",eigenvalues2)

Eigen values: [2.12099270e+02 6.82129796e-02 1.18512647e-03 5.63886109e-05]


- since the first dimension has an eigen value of greater than 1 , there exists a relationship but it is not quite significant 

# 7 . machine learning methods 

#1. Decision trees 

In [79]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics 

In [80]:
import dtreeviz

In [81]:
# fitting the model with all the variables 

In [82]:
X=data_clean1.drop('Price',axis=1)
y=data_clean1['Price']

In [83]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

#1.  the decision tree classifier 

In [84]:
clf=DecisionTreeRegressor()

In [85]:
clf.fit(X_train,y_train)
tr_ee=dtreeviz.model(clf, X_train,y_train, target_name='Price', feature_names=X_train.columns)

In [86]:
# making predictions 

In [87]:
y_pred = clf.predict(X_test)

In [88]:
# model evaluation 

In [89]:
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.14840182648401826


In [90]:
# important variables 


In [91]:
tr_ee

<dtreeviz.trees.DTreeVizAPI at 0x26a849e52d0>

In [93]:
feat_dict= {}
for col, val in sorted(zip(X_train.columns, clf.feature_importances_),key=lambda x:x[1],reverse=True):
  feat_dict[col]=val

In [94]:
for feature, importance in feat_dict.items():
    print(f"Feature: {feature}, Importance: {importance}")

Feature: Id, Importance: 0.36155033393791686
Feature: TotalBsmtSF, Importance: 0.06403576228340778
Feature: BsmtFinSF1, Importance: 0.0484183567004436
Feature: Neighborhood, Importance: 0.03311880389671682
Feature: BsmtUnfSF, Importance: 0.03305727965574485
Feature: GrLivArea, Importance: 0.031999537395042243
Feature: LotArea, Importance: 0.03137769570722922
Feature: MoSold, Importance: 0.030308303223316645
Feature: 1stFlrSF, Importance: 0.02655454891151274
Feature: OpenPorchSF, Importance: 0.026486378524528252
Feature: WoodDeckSF, Importance: 0.022204003355062688
Feature: GarageYrBlt, Importance: 0.022087804464763515
Feature: LotFrontage, Importance: 0.018843777161812417
Feature: MasVnrArea, Importance: 0.016510242049175432
Feature: YearBuilt, Importance: 0.015794360424561456
Feature: GarageArea, Importance: 0.015434088589853985
Feature: LotConfig, Importance: 0.014767024194123821
Feature: TotRmsAbvGrd, Importance: 0.014250989631945427
Feature: GarageFinish, Importance: 0.013207445646

In [None]:
# Based on the results some of the most important categorical variables include : Neighborhood , LotShape , MSZoning , MSSubClass and OverallCond to mention a few  

In [95]:
# 2. Naive Bayes Classifier 

In [96]:
from sklearn.naive_bayes import GaussianNB

In [97]:
gnb = GaussianNB()

In [98]:
gnb.fit(X_train,y_train)

In [99]:
# make predictions 

In [101]:
y_pred1 = gnb.predict(X_test)

In [102]:
accuracy = metrics.accuracy_score(y_test, y_pred1)
print("Accuracy:", accuracy)

Accuracy: 0.0547945205479452


# 8 . Apriori algorithm 

In [None]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [None]:
# overall quality and overall condition

In [None]:
df_apri=data[['OverallCond','OverallQual']]

In [None]:
df_apri

In [None]:
# convert to categorical 

In [None]:
bins = [0, 5, 7, np.inf]
labels = ['Low', 'Medium', 'High']
df_apri['OverallCond_cat'] = pd.cut(df_apri['OverallCond'], bins=bins, labels=labels)
df_apri['OverallQual_cat'] = pd.cut(df_apri['OverallQual'], bins=bins, labels=labels)

In [None]:
df=df_apri.drop(['OverallCond','OverallQual'],axis=1)

In [None]:
te = TransactionEncoder()
te_ary = te.fit(df.values).transform(df.values)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

#1. finding frequent terms in overall quality and condition of a house 

In [None]:
frequent_itemsets = apriori(df_encoded, min_support=0.2, use_colnames=True)


In [None]:
frequent_itemsets

In [None]:
# association rules 

In [None]:
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.6)


In [None]:
# Display frequent itemsets

In [None]:
print("Frequent Itemsets:")
print(frequent_itemsets)

In [None]:
# Display association rules

In [None]:
print("\nAssociation Rules:")
print(rules)

-The results show that most of the houses had a low rating in regard to both overall quality and condition 
-since the itemset low had has a support of 0.822481, indicating that the category "Low" for either OverallQual or OverallCond 
appears in approximately 82.25% of the data