<a href="https://colab.research.google.com/github/jahnavimidde/VsemML/blob/main/House_print.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
df = pd.read_csv("/content/house_price_train (1).csv",encoding="utf-8",delimiter=",")
print(df.head(2))

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD         Normal     208500  
1   2007        WD         Normal     181500  

[2 rows x 81 columns]


In [2]:
#Handling null values
num_fea = df.select_dtypes(include=["number"]).columns.tolist()
cat_fea = df.select_dtypes(include=["object","category"]).columns.tolist()
df[num_fea]=df[num_fea].fillna(df[num_fea].median())
df[cat_fea]=df[cat_fea].fillna("Unknown")

In [3]:
#Encode categorical features
pd.get_dummies(df,columns=cat_fea,drop_first=True)
print(df.head(3))

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street    Alley LotShape  \
0   1          60       RL         65.0     8450   Pave  Unknown      Reg   
1   2          20       RL         80.0     9600   Pave  Unknown      Reg   
2   3          60       RL         68.0    11250   Pave  Unknown      IR1   

  LandContour Utilities  ... PoolArea   PoolQC    Fence MiscFeature MiscVal  \
0         Lvl    AllPub  ...        0  Unknown  Unknown     Unknown       0   
1         Lvl    AllPub  ...        0  Unknown  Unknown     Unknown       0   
2         Lvl    AllPub  ...        0  Unknown  Unknown     Unknown       0   

  MoSold YrSold  SaleType  SaleCondition  SalePrice  
0      2   2008        WD         Normal     208500  
1      5   2007        WD         Normal     181500  
2      9   2008        WD         Normal     223500  

[3 rows x 81 columns]


In [4]:
#Data Splitting
from sklearn.model_selection import train_test_split
X=df.drop("SalePrice",axis=1)
y=df["SalePrice"]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [5]:
# Basic Filter methods
cat_fea = X_train.select_dtypes(include=["object"]).columns.tolist()
num_fea = X_train.select_dtypes(include=["number"]).columns.tolist()

X_train = pd.get_dummies(X_train, columns=cat_fea, drop_first=True)
X_train = X_train.astype(int)
X_test = pd.get_dummies(X_test, columns=cat_fea, drop_first=True)
X_test = X_test.astype(int)
for col in X_train.columns:
    if col not in X_test.columns:
        X_test[col] = 0
# Reorder test columns to match train
X_test = X_test[X_train.columns]


#  Removing constant features
const = []
for features in X_train:
    if X_train[features].std() == 0:
        const.append(features)
print("Number of constant features:", len(const))
X_train.drop(labels=const, axis=1, inplace=True)
X_test.drop(labels=const, axis=1, inplace=True)  # Apply same removal to X_test

#  Removing quasi-constant features
quasi_constant = []
for feature in X_train.columns:
    predominant = (X_train[feature].value_counts() / float(len(X_train))).sort_values(ascending=False).values[0]
    if predominant > 0.999:
        quasi_constant.append(feature)
print("Number of quasi constant features:", len(quasi_constant))
X_train.drop(labels=quasi_constant, axis=1, inplace=True)
X_test.drop(labels=quasi_constant, axis=1, inplace=True)  # Apply same removal to X_test

#  Removing duplicated features
duplicates = []
for i in range(len(X_train.columns)):
    col1 = X_train.columns[i]
    for col2 in X_train.columns[i+1:]:
        if X_train[col1].equals(X_train[col2]):  # Checks content equality
            duplicates.append(col2)
print("Number of duplicate features:", len(duplicates))
X_train.drop(labels=duplicates, axis=1, inplace=True)
X_test.drop(labels=duplicates, axis=1, inplace=True, errors='ignore')  # Ignore missing columns to avoid KeyError



Number of constant features: 0
Number of quasi constant features: 23
Number of duplicate features: 16


In [6]:
# Statistical Filter Methods
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from scipy.stats import pearsonr, spearmanr
import pandas as pd
import numpy as np

# Separate numerical and categorical features based on unique values in X_train
num_features = [col for col in X_train.columns if not set(X_train[col].unique()).issubset({0, 1})]
cat_features = [col for col in X_train.columns if set(X_train[col].unique()).issubset({0, 1})]
pearson_corr = []
spearman_corr = []

for col in num_features:
    p_corr, _ = pearsonr(X_train[col], y_train)
    s_corr, _ = spearmanr(X_train[col], y_train)
    pearson_corr.append((col, p_corr))
    spearman_corr.append((col, s_corr))

pearson_df = pd.DataFrame(pearson_corr, columns=["Feature", "PearsonCorr"]).set_index("Feature")
spearman_df = pd.DataFrame(spearman_corr, columns=["Feature", "SpearmanCorr"]).set_index("Feature")
print("Top 10 features by absolute Pearson correlation:")
print(pearson_df["PearsonCorr"].abs().sort_values(ascending=False).head(10))
print("\nTop 10 features by absolute Spearman correlation:")
print(spearman_df["SpearmanCorr"].abs().sort_values(ascending=False).head(10))

# Anova
f_values, p_values = f_regression(X_train[num_features], y_train)
anova_df = pd.DataFrame({
    "Numerical Features": num_features,
    "F_values": f_values,
    "P_values": p_values
})
anova_df.sort_values(by="P_values", inplace=True)
significant_numeric_features = anova_df[anova_df["P_values"] < 0.05]["Numerical Features"].tolist()
print("\nSignificant numerical features from ANOVA (p<0.05):")
print(significant_numeric_features)

# Mutual information
selector_mi = SelectKBest(score_func=mutual_info_regression, k="all")
selector_mi.fit(X_train[cat_features], y_train)
mi_scores = selector_mi.scores_

mi_df = pd.DataFrame({
    "Categorical Features": cat_features,
    "Mutual_Info_Scores": mi_scores
})
mi_df.sort_values(by="Mutual_Info_Scores", ascending=False, inplace=True)

# Select categorical features with MI > 0 (non-zero dependence)
significant_cat_features = mi_df[mi_df["Mutual_Info_Scores"] > 0]["Categorical Features"].tolist()
print("\nSignificant categorical features from Mutual Information (MI > 0):")
print(significant_cat_features)

final_selected_features = list(set(significant_numeric_features + significant_cat_features))

# Filter the datasets to keep only selected features
X_train_filtered = X_train[final_selected_features]
for col in final_selected_features:
    if col not in X_test.columns:
        X_test[col] = 0
X_test_filtered = X_test[final_selected_features]

Top 10 features by absolute Pearson correlation:
Feature
OverallQual     0.784720
GrLivArea       0.689238
GarageCars      0.642689
GarageArea      0.621937
TotalBsmtSF     0.590017
1stFlrSF        0.583132
FullBath        0.549164
TotRmsAbvGrd    0.519634
YearBuilt       0.512206
YearRemodAdd    0.512190
Name: PearsonCorr, dtype: float64

Top 10 features by absolute Spearman correlation:
Feature
OverallQual     0.804608
GrLivArea       0.722232
GarageCars      0.684316
YearBuilt       0.636620
GarageArea      0.636605
FullBath        0.619064
TotalBsmtSF     0.594487
YearRemodAdd    0.571357
1stFlrSF        0.565796
GarageYrBlt     0.549312
Name: SpearmanCorr, dtype: float64

Significant numerical features from ANOVA (p<0.05):
['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd', 'Fireplaces', 'GarageYrBlt', 'MasVnrArea', 'BsmtFinSF1', 'WoodDeckSF', 'LotFrontage', '2ndFlrSF', 'HalfBath', 'OpenPorchS

In [7]:
#Dataset after preprocessing and feature selection
print(X_train.head(3))

        Id  MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  \
135    136          20           80    10400            7            6   
1452  1453         180           35     3675            5            5   
762    763          60           72     8640            7            5   

      YearBuilt  YearRemodAdd  MasVnrArea  BsmtFinSF1  ...  SaleType_Con  \
135        1970          1970         288           0  ...             0   
1452       2005          2005          80         547  ...             0   
762        2009          2009           0          24  ...             1   

      SaleType_ConLD  SaleType_ConLw  SaleType_New  SaleType_WD  \
135                0               0             0            1   
1452               0               0             0            1   
762                0               0             0            0   

      SaleCondition_AdjLand  SaleCondition_Alloca  SaleCondition_Family  \
135                       0                     0     