In [100]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from prince import MCA
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [2]:
# Load data set
df = pd.read_csv('train.csv')
df = df.drop('Id', axis = 1)

# Remove columns that have too many missing values
df = df.drop(df.columns[df.isnull().sum() > 30], axis = 1)

# Remove missing values
df.dropna(inplace = True)

df

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,0,0,0,0,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,7917,Pave,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,...,0,0,0,0,0,8,2007,WD,Normal,175000
1456,20,RL,13175,Pave,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,...,0,0,0,0,0,2,2010,WD,Normal,210000
1457,70,RL,9042,Pave,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,...,0,0,0,0,2500,5,2010,WD,Normal,266500
1458,20,RL,9717,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,112,0,0,0,0,4,2010,WD,Normal,142125


In [3]:
print(df.isnull().sum().sum())

0


In [23]:
# # Step 1: Remove columns with too many missing values
# threshold_missing_values = 0.3  # Adjust this threshold as needed

# # Calculate the percentage of missing values for each column
# missing_percentage = df.isnull().mean()

# # Identify columns exceeding the threshold
# columns_to_drop = missing_percentage[missing_percentage > threshold_missing_values].index

# # Drop the identified columns
# df = df.drop(columns=columns_to_drop)

# # Step 2: Remove remaining missing values
# # For simplicity, you can drop rows with any missing values
# df = df.dropna()

In [30]:
# Check for NaN or infinite values in the DataFrame
print(df.isnull().sum())  # Display NaN counts per column
print(df.isin([np.inf, -np.inf]).sum())  # Display counts of infinite values per column

# Remove rows with NaN or infinite values
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 76, dtype: int64
Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 76, dtype: int64


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1451 entries, 0 to 1459
Data columns (total 64 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1451 non-null   int64  
 1   MSZoning       1451 non-null   object 
 2   LotArea        1451 non-null   int64  
 3   Street         1451 non-null   object 
 4   LotShape       1451 non-null   object 
 5   LandContour    1451 non-null   object 
 6   Utilities      1451 non-null   object 
 7   LotConfig      1451 non-null   object 
 8   LandSlope      1451 non-null   object 
 9   Neighborhood   1451 non-null   object 
 10  Condition1     1451 non-null   object 
 11  Condition2     1451 non-null   object 
 12  BldgType       1451 non-null   object 
 13  HouseStyle     1451 non-null   object 
 14  OverallQual    1451 non-null   int64  
 15  OverallCond    1451 non-null   int64  
 16  YearBuilt      1451 non-null   int64  
 17  YearRemodAdd   1451 non-null   int64  
 18  RoofStyl

1. Apply MCA onto the data set:

In [5]:
# Step 1: Split the data into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=100)

In [6]:
# Step 2: Separate numerical and categorical features
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df.select_dtypes(include=['object']).columns

In [15]:
'''
This is the script you can use to check the categorical features between training and testing sets.
You can apply this code after you can done the train/test split, as well as separation of numerical
and categorical features.
'''
X_train_cat = train_data[categorical_features]
X_test_cat = test_data[categorical_features]

# Suppose your categorical features in training set is called X_train_cat.
# Suppose your categorical features in testing set is called X_test_set.
# Make sure the training feature and testing feature has same number of levels
keep = X_train_cat.nunique() == X_test_cat.nunique()
X_train_cat = X_train_cat[X_train_cat.columns[keep]]
X_test_cat = X_test_cat[X_test_cat.columns[keep]]

# For categorical features that have same levels, make sure the classes are the same
keep = []
for i in range(X_train_cat.shape[1]):
    keep.append(all(np.sort(X_train_cat.iloc[:,i].unique()) == np.sort(X_test_cat.iloc[:,i].unique())))
X_train_cat = X_train_cat[X_train_cat.columns[keep]]
X_test_cat = X_test_cat[X_test_cat.columns[keep]]


In [7]:
# Step 3: Ensure that both train and test datasets have the same set of levels for each categorical variable
for cat_col in categorical_features:
    train_levels = set(train_data[cat_col].unique())
    test_levels = set(test_data[cat_col].unique())
    all_levels = train_levels.union(test_levels)
    train_data[cat_col] = train_data[cat_col].astype('category').cat.set_categories(all_levels)
    test_data[cat_col] = test_data[cat_col].astype('category').cat.set_categories(all_levels)


In [16]:
# Step 4: Apply PCA on numerical features
pca = PCA(n_components=2)
train_pca_result = pca.fit_transform(train_data[numerical_features])
test_pca_result = pca.transform(test_data[numerical_features])

In [34]:
train_pca_result.shape

(1160, 2)

In [21]:
X_train_cat

Unnamed: 0,MSZoning,LotShape,LandContour,LotConfig,LandSlope,BldgType,MasVnrType,ExterQual,CentralAir,KitchenQual,PavedDrive
624,RL,Reg,Lvl,Inside,Gtl,1Fam,,TA,Y,TA,Y
592,RL,Reg,Lvl,Inside,Gtl,1Fam,,TA,Y,TA,Y
48,RM,Reg,Lvl,Inside,Gtl,2fmCon,,TA,Y,TA,N
1070,RL,Reg,Lvl,Inside,Gtl,1Fam,BrkFace,TA,Y,TA,Y
1234,RH,Reg,Bnk,Inside,Gtl,1Fam,,TA,N,TA,N
...,...,...,...,...,...,...,...,...,...,...,...
805,RL,IR1,Lvl,FR2,Gtl,1Fam,Stone,Gd,Y,Ex,Y
53,RL,IR1,Low,Inside,Gtl,1Fam,,Gd,Y,Gd,Y
351,RL,IR1,Low,Inside,Mod,1Fam,,TA,Y,Gd,Y
79,RM,Reg,Lvl,Corner,Gtl,1Fam,,TA,Y,TA,Y


In [49]:
one_hot = pd.get_dummies(train_data[categorical_features])
one_hot

Unnamed: 0,MSZoning_FV,MSZoning_C (all),MSZoning_RH,MSZoning_RM,MSZoning_RL,Street_Grvl,Street_Pave,LotShape_IR3,LotShape_IR2,LotShape_IR1,...,SaleType_ConLD,SaleType_Oth,SaleType_Con,SaleType_ConLw,SaleCondition_Abnorml,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_Family,SaleCondition_AdjLand,SaleCondition_Alloca
449,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
901,0,0,0,0,1,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
550,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1115,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
226,0,0,0,0,1,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1071,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
67,0,0,0,0,1,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
466,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
103,0,0,0,0,1,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [27]:
# Step 5: Apply MCA on categorical features
mca = MCA(n_components=2)
train_mca_result = mca.fit_transform(X_train_cat)
test_mca_result = mca.transform(X_test_cat)

In [52]:
train_pca_df = pd.DataFrame(train_pca_result, columns=['pca_1', 'pca_2'])
test_pca_df = pd.DataFrame(test_pca_result, columns=['pca_1', 'pca_2'])

In [90]:
train_pca_df

Unnamed: 0,pca_1,pca_2
0,-15953.486168,392.959408
1,-43223.749866,-2459.400938
2,-68280.915342,-3739.495588
3,-46096.773033,1195.063397
4,-51145.742408,-260.485529
...,...,...
1155,46602.917621,93.784026
1156,205150.038968,32618.919571
1157,8704.845484,-5585.607817
1158,-71073.465811,2343.578482


In [93]:
test_mca_result

Unnamed: 0,0,1
357,0.296526,-0.507883
824,-0.375315,-0.421014
369,-0.145580,0.159588
291,2.200111,0.146006
134,0.060756,0.090473
...,...,...
1424,0.190777,-0.102245
1130,-0.012936,-0.108776
899,0.190777,-0.102245
932,-0.790089,0.193782


In [94]:
# Resetting indices
train_mca_result = train_mca_result.reset_index(drop=True)
test_mca_result = test_mca_result.reset_index(drop=True)

In [97]:
# Concatenate the PCA and MCA results
X_train = pd.concat([train_pca_df, train_mca_result], axis=1)
X_test = pd.concat([test_pca_df, test_mca_result], axis= 1, ignore_index=True)

In [103]:
# Perform regression using the reduced features
model = Ridge()
model.fit(X_train, train_data['SalePrice'])

# Predict and evaluate on the testing set
y_pred = model.predict(X_test)
mse = mean_squared_error(test_data['SalePrice'], y_pred)
print(f'Mean Squared Error (PCA + MCA): {mse}')


<IPython.core.display.Javascript object>

Mean Squared Error (PCA + MCA): 6.8229257761117


