## CHI SQUARED & CRAMER'S V TEST

### FOR CATEGORICAL/QUANTITATIVE FEATURES

In [1]:
import pandas as pd

*just cleaned data, not standardized and not encoded.*

In [2]:
df_train = pd.read_csv(r"C:\Users\leila\Downloads\pfaproject\house-price-prediction\data\raw data\train.csv")


In [3]:
### 1. Traitement des valeurs manquantes selon la description ###

# Variables catégorielles avec "NA" dans la description
na_categorical = {
    "Alley": "NA",  # No alley access
    "BsmtQual": "NA",  # No Basement
    "BsmtCond": "NA",  # No Basement
    "BsmtExposure": "NA",  # No Basement
    "BsmtFinType1": "NA",  # No Basement
    "BsmtFinType2": "NA",  # No Basement
    "FireplaceQu": "NA",  # No Fireplace
    "GarageType": "NA",  # No Garage
    "GarageFinish": "NA",  # No Garage
    "GarageQual": "NA",  # No Garage
    "GarageCond": "NA",  # No Garage
    "PoolQC": "NA",  # No Pool
    "Fence": "NA",  # No Fence
    "MiscFeature": "NA"  # None
}

for col, na_value in na_categorical.items():
    df_train[col] = df_train[col].fillna(na_value)

# Variables avec "None" comme option valide
none_categorical = {
    "MasVnrType": "None"  # Masonry veneer type
}
for col, none_value in none_categorical.items():
    df_train[col] = df_train[col].fillna(none_value)

# Variables numériques
df_train["LotFrontage"] = df_train.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
df_train["MasVnrArea"] = df_train["MasVnrArea"].fillna(0)  # Si pas de maçonnerie
df_train["GarageYrBlt"] = df_train["GarageYrBlt"].fillna(0)  # Si pas de garage

# Electrical - seul NA, utiliser le mode
df_train["Electrical"] = df_train["Electrical"].fillna(df_train["Electrical"].mode()[0])

### 2. Vérification des relations entre variables ###

# Si pas de sous-sol, surfaces = 0
bsmt_cols = ["BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF"]
df_train.loc[df_train["BsmtQual"] == "NA", bsmt_cols] = 0

# Si pas de garage, surfaces = 0
df_train.loc[df_train["GarageType"] == "NA", ["GarageCars", "GarageArea"]] = 0

# Si pas de cheminée
df_train.loc[df_train["Fireplaces"] == 0, "FireplaceQu"] = "NA"

# Si pas de piscine
df_train.loc[df_train["PoolArea"] == 0, "PoolQC"] = "NA"

### 3. Vérification finale ###
print("Valeurs manquantes restantes:")
print(df_train.isnull().sum().sum()) 

Valeurs manquantes restantes:
0


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1460 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          1460 non-null   object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

*select only categorical*

In [5]:
categorical_df = df_train.select_dtypes(include=['object', 'bool', 'category'])

In [6]:
for col in categorical_df.columns:
    categorical_df[col] = categorical_df[col].astype('category')


In [23]:
from scipy.stats import chi2_contingency
import numpy as np
from itertools import combinations

# Iterate through all pairs of categorical variables
for var1, var2 in combinations(categorical_df.columns, 2):
    contingency_table = pd.crosstab(categorical_df[var1], categorical_df[var2])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    n = contingency_table.sum().sum()
    min_dim = min(contingency_table.shape) - 1
    cramers_v = np.sqrt(chi2 / (n * min_dim))
    print(f"Cramér's V between {var1} and {var2}: {cramers_v:.4f}")


Cramér's V between MSZoning and Street: 0.2547
Cramér's V between MSZoning and Alley: 0.3921
Cramér's V between MSZoning and LotShape: 0.1608
Cramér's V between MSZoning and LandContour: 0.1147
Cramér's V between MSZoning and Utilities: 0.0136
Cramér's V between MSZoning and LotConfig: 0.0825
Cramér's V between MSZoning and LandSlope: 0.0888
Cramér's V between MSZoning and Neighborhood: 0.6525
Cramér's V between MSZoning and Condition1: 0.1027
Cramér's V between MSZoning and Condition2: 0.0908
Cramér's V between MSZoning and BldgType: 0.1954
Cramér's V between MSZoning and HouseStyle: 0.1968
Cramér's V between MSZoning and RoofStyle: 0.0934
Cramér's V between MSZoning and RoofMatl: 0.0317
Cramér's V between MSZoning and Exterior1st: 0.2029
Cramér's V between MSZoning and Exterior2nd: 0.2119
Cramér's V between MSZoning and MasVnrType: 0.1112
Cramér's V between MSZoning and ExterQual: 0.2449
Cramér's V between MSZoning and ExterCond: 0.0947
Cramér's V between MSZoning and Foundation: 0.2

In [20]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from itertools import combinations

# Define a function to compute Cramér's V
def cramers_v(x, y):
    contingency_table = pd.crosstab(x, y)
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    n = contingency_table.sum().sum()
    min_dim = min(contingency_table.shape) - 1
    if min_dim == 0:
        return np.nan  # Avoid division by zero
    return np.sqrt(chi2 / (n * min_dim))

# Assuming 'categorical_df' is your DataFrame with categorical variables
# If not already defined, you can create it as follows:
# categorical_df = df.select_dtypes(include=['object', 'category', 'bool'])

# Compute Cramér's V for all pairs of categorical variables
results = []
for var1, var2 in combinations(categorical_df.columns, 2):
    v = cramers_v(categorical_df[var1], categorical_df[var2])
    if not np.isnan(v):
        results.append({'Variable 1': var1, 'Variable 2': var2, "Cramér's V": v})

# Create a DataFrame with the results
cramers_df = pd.DataFrame(results)

# Sort the DataFrame by Cramér's V in descending order and select the top 20 pairs
top_20 = cramers_df.sort_values(by="Cramér's V", ascending=False).head(20)

# Display the top 20 pairs
print(top_20)


       Variable 1     Variable 2  Cramér's V
525   Exterior1st    Exterior2nd    0.762010
875    GarageQual     GarageCond    0.705464
858    GarageType   GarageFinish    0.687219
7        MSZoning   Neighborhood    0.652480
867  GarageFinish     GarageQual    0.591226
868  GarageFinish     GarageCond    0.588688
674      BsmtQual   BsmtFinType1    0.579381
614     ExterQual    KitchenQual    0.547358
650    Foundation       BsmtQual    0.533869
672      BsmtQual       BsmtCond    0.528628
713  BsmtExposure   BsmtFinType1    0.524791
673      BsmtQual   BsmtExposure    0.522787
694      BsmtCond   BsmtFinType1    0.510317
675      BsmtQual   BsmtFinType2    0.503431
317  Neighborhood      ExterQual    0.501422
695      BsmtCond   BsmtFinType2    0.497917
693      BsmtCond   BsmtExposure    0.497112
714  BsmtExposure   BsmtFinType2    0.494108
320  Neighborhood       BsmtQual    0.482304
902      SaleType  SaleCondition    0.475809


In [25]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder
import pandas as pd

categorical_df['SalePrice'] = df_train['SalePrice']

X = categorical_df
y = df_train['SalePrice']

# Assuming X is your feature DataFrame and y is the target variable
# Encode categorical variables
X_encoded = X.apply(LabelEncoder().fit_transform)
chi_scores, p_values = chi2(X_encoded, y)

# Create a DataFrame with scores
chi2_df = pd.DataFrame({'Feature': X.columns, 'Chi2 Score': chi_scores, 'p-value': p_values})
chi2_df.sort_values('Chi2 Score', ascending=False, inplace=True)


In [27]:
chi2_df

Unnamed: 0,Feature,Chi2 Score,p-value
43,SalePrice,143517.249363,0.0
8,Neighborhood,2194.267106,3.242308e-163
11,BldgType,1951.478245,2.81882e-127
27,HeatingQC,1483.144925,7.831578999999999e-65
33,GarageType,1264.929866,3.419318e-40
24,BsmtFinType1,1038.132199,3.8000219999999995e-19
7,LandSlope,889.793781,6.859676e-09
12,HouseStyle,812.88772,5.096798e-05
16,Exterior2nd,808.619925,7.736118e-05
3,LotShape,802.972897,0.0001326098


In [28]:
from sklearn.feature_selection import mutual_info_classif

mi_scores = mutual_info_classif(X_encoded, y)
mi_df = pd.DataFrame({'Feature': X.columns, 'MI Score': mi_scores})
mi_df.sort_values('MI Score', ascending=False, inplace=True)


In [29]:
mi_df

Unnamed: 0,Feature,MI Score
43,SalePrice,5.857637
1,Street,3.150541
10,Condition2,3.051202
14,RoofMatl,3.024798
38,PoolQC,3.023505
40,MiscFeature,2.993597
26,Heating,2.991995
37,PavedDrive,2.775116
31,Functional,2.750548
28,CentralAir,2.737914


In [30]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_encoded, y)
importances = model.feature_importances_

rf_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
rf_df.sort_values('Importance', ascending=False, inplace=True)


In [31]:
rf_df

Unnamed: 0,Feature,Importance
43,SalePrice,0.146519
8,Neighborhood,0.059989
24,BsmtFinType1,0.050269
16,Exterior2nd,0.044091
15,Exterior1st,0.042208
32,FireplaceQu,0.041531
12,HouseStyle,0.038143
23,BsmtExposure,0.035475
34,GarageFinish,0.033494
6,LotConfig,0.032116
