## Exercise 6: Choosing the best performing model on a dataset

Instructions:

- Use the Dataset File to train your model
- Use the Test File to generate your results
- Use the Sample Submission file to generate the same format
- Use all Regression models

Submit your results to:
https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview



In [7]:
import pandas as pd
import seaborn as sns
import numpy as np

from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

## Dataset File

In [8]:
train_data = 'https://github.com/robitussin/CCMACLRL_EXERCISES/blob/3fd7d51ffd17863598ac3f44eeefc558171a5b73/dataset/house-prices-advanced-regression-techniques/train.csv?raw=true'
df = pd.read_csv(train_data)

## Test File

In [9]:
test_url = 'https://github.com/robitussin/CCMACLRL_EXERCISES/blob/3fd7d51ffd17863598ac3f44eeefc558171a5b73/dataset/house-prices-advanced-regression-techniques/test.csv?raw=true'
dt=pd.read_csv(test_url)

In [10]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

## Sample Submission File

In [11]:
sample_submission_url ='https://github.com/robitussin/CCMACLRL_EXERCISES/blob/3fd7d51ffd17863598ac3f44eeefc558171a5b73/dataset/house-prices-advanced-regression-techniques/sample_submission.csv?raw=true'

sf=pd.read_csv(sample_submission_url)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [13]:
df.isnull().sum

In [18]:
# Concatenate the training and test dataframes for feature engineering
# Exclude the 'SalePrice' column from df_encoded before concatenating
combined_df = pd.concat([df_encoded.drop(columns=['SalePrice']), dt_encoded], ignore_index=True)

# Impute missing numerical values with the mean
numerical_cols_with_null_combined = combined_df.select_dtypes(include=np.number).columns[combined_df.select_dtypes(include=np.number).isnull().any()]
for col in numerical_cols_with_null_combined:
    combined_df[col] = combined_df[col].fillna(combined_df[col].mean())

# Impute missing categorical values with the mode
categorical_cols_with_null_combined = combined_df.select_dtypes(include=['object']).columns[combined_df.select_dtypes(include=['object']).isnull().any()]
for col in categorical_cols_with_null_combined:
    combined_df[col] = combined_df[col].fillna(combined_df[col].mode()[0])


# Total square footage
combined_df['TotalSF'] = combined_df['TotalBsmtSF'] + combined_df['1stFlrSF'] + combined_df['2ndFlrSF']

# Total bathrooms
combined_df['TotalBath'] = combined_df['FullBath'] + 0.5 * combined_df['HalfBath'] + \
                           combined_df['BsmtFullBath'] + 0.5 * combined_df['BsmtHalfBath']

# Age of house
combined_df['HouseAge'] = combined_df['YrSold'] - combined_df['YearBuilt']
combined_df['RemodAge'] = combined_df['YrSold'] - combined_df['YearRemodAdd']

# Total porch area
combined_df['TotalPorchSF'] = combined_df['OpenPorchSF'] + combined_df['EnclosedPorch'] + \
                               combined_df['3SsnPorch'] + combined_df['ScreenPorch']

# Has pool, garage, etc.
combined_df['HasPool'] = (combined_df['PoolArea'] > 0).astype(int)
combined_df['HasGarage'] = (combined_df['GarageArea'] > 0).astype(int)
combined_df['HasBsmt'] = (combined_df['TotalBsmtSF'] > 0).astype(int)
combined_df['HasFireplace'] = (combined_df['Fireplaces'] > 0).astype(int)

print(f"Shape after feature engineering: {combined_df.shape}")

Shape after feature engineering: (2919, 298)


In [17]:
# Identify categorical columns
categorical_cols_df = df.select_dtypes(include=['object']).columns
categorical_cols_dt = dt.select_dtypes(include=['object']).columns

# Perform one-hot encoding
df_encoded = pd.get_dummies(df, columns=categorical_cols_df, dummy_na=False)
dt_encoded = pd.get_dummies(dt, columns=categorical_cols_dt, dummy_na=False)

# Align columns - crucial for consistent features between train and test sets
train_cols = df_encoded.columns
test_cols = dt_encoded.columns

missing_in_test = set(train_cols) - set(test_cols)
for c in missing_in_test:
    dt_encoded[c] = 0

missing_in_train = set(test_cols) - set(train_cols)
for c in missing_in_train:
    df_encoded[c] = 0

# Ensure the order of columns is the same
dt_encoded = dt_encoded[train_cols]

# Attempt to convert to int64 - Note: This may fail if there are non-integer float values
df_encoded = df_encoded.astype('int64', errors='ignore')
dt_encoded = dt_encoded.astype('int64', errors='ignore')

print("Training data after encoding, aligning, and attempted int64 conversion:")
display(df_encoded.head())
print("\nTest data after encoding, aligning, and attempted int64 conversion:")
display(dt_encoded.head())

Training data after encoding, aligning, and attempted int64 conversion:


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0



Test data after encoding, aligning, and attempted int64 conversion:


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,0,0,0,1,0,0,0,0,1,0
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,0,0,0,1,0,0,0,0,1,0
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,0,0,0,1,0,0,0,0,1,0
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,0,0,0,1,0,0,0,0,1,0
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,0,0,0,1,0,0,0,0,1,0


In [19]:
# Identify numerical columns with missing values in df_encoded
numerical_cols_with_null = df_encoded.select_dtypes(include=np.number).columns[df_encoded.select_dtypes(include=np.number).isnull().any()]

# Impute missing numerical values with the mean in df_encoded
for col in numerical_cols_with_null:
    if col != 'SalePrice': # Don't impute 'SalePrice' in the training data
        df_encoded[col] = df_encoded[col].fillna(df_encoded[col].mean())

# Identify numerical columns with missing values in dt_encoded
numerical_cols_with_null_dt = dt_encoded.select_dtypes(include=np.number).columns[dt_encoded.select_dtypes(include=np.number).isnull().any()]

# Impute missing numerical values with the mean in dt_encoded
for col in numerical_cols_with_null_dt:
     dt_encoded[col] = dt_encoded[col].fillna(dt_encoded[col].mean())


print("Missing values in df_encoded after imputation:")
display(df_encoded.isnull().sum()[df_encoded.isnull().sum() > 0])

print("\nMissing values in dt_encoded after imputation:")
display(dt_encoded.isnull().sum()[dt_encoded.isnull().sum() > 0])

Missing values in df_encoded after imputation:


Unnamed: 0,0



Missing values in dt_encoded after imputation:


Unnamed: 0,0


In [20]:
# Drop 'Id' as it's not a feature
X = df_encoded.drop(columns=['Id','SalePrice'])
# Apply log transformation to the target variable
y = np.log1p(df_encoded['SalePrice'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

## 1. Train a KNN Regressor

In [23]:
from sklearn.neighbors import KNeighborsRegressor
KNN = KNeighborsRegressor(n_neighbors=20)

KNN.fit(X_train,y_train)
knn_score = KNN.score(X_test,y_test)

score_list = {} # Initialize score_list
score_list["KNN Regressor"] = knn_score

print(f"Score is {knn_score}")

Score is 0.661759117342827


- Perform cross validation

In [24]:
from sklearn.model_selection import cross_val_score

scores_knn = cross_val_score(KNN, X, y, cv=5) # Use the KNN model and the full data (X, y) for cross-validation
print(f"Cross-validation scores for KNN Regressor: {scores_knn}")
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_knn.mean(), scores_knn.std()))

Cross-validation scores for KNN Regressor: [0.71530089 0.70320831 0.66768378 0.67204828 0.65856334]
0.68 accuracy with a standard deviation of 0.02


## 2. Train a SVM Regression

In [25]:
from sklearn.svm import SVR

svr = SVR()
svr.fit(X_train,y_train)
svr_score = svr.score(X_test,y_test)
score_list["SVR"] = svr_score
print(f"Score is {svr_score}")

Score is 0.7527629076777608


- Perform cross validation

In [26]:
from sklearn.model_selection import cross_val_score



scores_svr = cross_val_score(svr, X, y, cv=5)
print(f"Cross-validation scores for SVM Regressor: {scores_svr}")
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_svr.mean(), scores_svr.std()))

Cross-validation scores for SVM Regressor: [0.77416239 0.77071248 0.73289358 0.71420733 0.7033633 ]
0.74 accuracy with a standard deviation of 0.03


## 3. Train a Decision Tree Regression

In [28]:
from sklearn.tree import DecisionTreeRegressor # Changed from DecisionTreeClassifier
dtr = DecisionTreeRegressor(random_state=1) # Changed variable name to dtr
dtr.fit(X_train,y_train)

dtr_score = dtr.score(X_test,y_test) # Changed variable name to dtr_score
score_list["DTR"] = dtr_score # Changed key to DTR
print(f"Score is {dtr_score}")

Score is 0.7682978559951217


- Perform cross validation

In [30]:
from sklearn.model_selection import cross_val_score



scores_dtr = cross_val_score(dtr, X, y, cv=5) # Changed from dtc to dtr
print(f"Cross-validation scores for Decision Tree Regressor: {scores_dtr}") # Updated print statement
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_dtr.mean(), scores_dtr.std()))

Cross-validation scores for Decision Tree Regressor: [0.74735245 0.68975534 0.79098026 0.69707376 0.70759775]
0.73 accuracy with a standard deviation of 0.04


## 4. Train a Random Forest Regression

In [32]:
from sklearn.ensemble import RandomForestRegressor # Changed from RandomForestClassifier

rfc = RandomForestRegressor(n_estimators=50,random_state=1) # Changed from RandomForestClassifier
rfc.fit(X_train,y_train)
rfc_score = rfc.score(X_test,y_test)
score_list["RFC"]=rfc_score

print(f"Score is {rfc_score}")

Score is 0.8803575796123833


## 5. Compare all the performance of all regression models

In [33]:
score_list = list(score_list.items())

for alg,score in score_list:
    print(f"{alg} Score is {str(score)[:4]} ")

KNN Regressor Score is 0.66 
SVR Score is 0.75 
DTR Score is 0.76 
RFC Score is 0.88 


## 6. Generate Submission File

Choose the model that has the best performance to generate a submission file.

In [38]:
# Ensure columns in X_test_submission match X_train
missing_cols = set(X_train.columns) - set(X_test_submission.columns)
for c in missing_cols:
    X_test_submission[c] = 0
# Ensure the order of columns is the same
X_test_submission = X_test_submission[X_train.columns]

y_pred = rfc.predict(X_test_submission)

# Apply inverse transformation to the predictions
y_pred = np.expm1(y_pred)

# Create a submission DataFrame
submission_df = pd.DataFrame({
    'Id': id,
    'SalePrice': y_pred # Changed column name to 'SalePrice' to match sample submission
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission_file.csv', index=False)
print("Submission file created: submission_file.csv")

Submission file created: submission_file.csv
