<a href="https://www.kaggle.com/code/ibkya12/0-9042-house-price-pred-w-catboost?scriptVersionId=179768424" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

In [None]:
# Load the competition files: labelled training rows, unlabelled test rows and
# the sample submission file.
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
subsample = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')

In [None]:
# Preview the first three rows of the training frame.
train.head(3)

In [None]:
# Preview the first three rows of the test frame.
test.head(3)

In [None]:
# Distribution of the target: a 100-bin histogram of SalePrice followed by its
# summary statistics.
sale_prices = train['SalePrice']
title_font = {'fontsize': 11, 'fontweight': 'bold'}

plt.figure(figsize=(18, 6))
sns.histplot(sale_prices, bins=100)
plt.title('Saleprice Rate', fontdict=title_font)
plt.xlabel('Selling price ($)', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.show()

print(sale_prices.describe())

## Grouping columns with NaN values using a fixed threshold rule

In [None]:
def _split_columns_by_nulls(frame, drop_threshold=500):
    """Partition the columns of *frame* by how many null values they hold.

    Returns ``(to_impute, to_remove)``:

    * ``to_remove`` - columns with at least ``drop_threshold`` nulls.
    * ``to_impute`` - columns with at least one null but fewer than
      ``drop_threshold``.

    Columns without nulls appear in neither list. Both lists keep the column
    order of ``frame``.
    """
    to_impute = []
    to_remove = []
    for name, n_missing in frame.isnull().sum().items():
        if n_missing >= drop_threshold:
            to_remove.append(name)
        elif n_missing >= 1:
            to_impute.append(name)
    return to_impute, to_remove


# The same rule is applied to each frame independently.
columns_to_impute_train, columns_to_remove_train = _split_columns_by_nulls(train)
columns_to_impute_test, columns_to_remove_test = _split_columns_by_nulls(test)

print("Columns to impute in train ", columns_to_impute_train, '\n')
print("Columns to remove in train: ", columns_to_remove_train, '\n')

print("Columns to impute in test: ", columns_to_impute_test, '\n')
print("Columns to remove in test: ", columns_to_remove_test)

In [None]:
# Drop the columns flagged for removal; each frame uses its own list.
train_clean = train.drop(columns=columns_to_remove_train)
test_clean = test.drop(columns=columns_to_remove_test)

n_cols_after = train_clean.shape[1]
n_cols_before = train.shape[1]
print("No. of columns:", n_cols_after, " - vs - ", n_cols_before)

In [None]:
# Sub-frames holding only the object-dtype (categorical) columns of each
# cleaned frame.
cat_columns_test = test_clean.select_dtypes(include=['object'])
cat_columns_train = train_clean.select_dtypes(include=['object'])

print(cat_columns_train.columns)

In [None]:
# Turn every categorical (object-dtype) column into integer codes.
#
# For a column that is categorical in both frames, the encoder is fitted ONCE
# on the train and test values together and that single mapping is applied to
# both frames. Fitting each frame separately, as this cell did before, gives
# the same category different integers in train and test whenever the two
# frames do not contain exactly the same set of categories, so a model trained
# on the train codes would misread the test codes.
label_encoder = LabelEncoder()

train_cat_names = list(cat_columns_train.columns)
test_cat_names = list(cat_columns_test.columns)

for columna in train_cat_names:
    if columna in test_cat_names:
        # Learn the categories from both frames, then encode each frame with
        # the shared mapping.
        label_encoder.fit(
            pd.concat([train_clean[columna], test_clean[columna]], ignore_index=True)
        )
        train_clean[columna] = label_encoder.transform(train_clean[columna])
        test_clean[columna] = label_encoder.transform(test_clean[columna])
    else:
        # Categorical only in train: there is nothing to align with.
        train_clean[columna] = label_encoder.fit_transform(train_clean[columna])

# Columns that are categorical only in test are encoded on their own.
for columna in test_cat_names:
    if columna not in train_cat_names:
        test_clean[columna] = label_encoder.fit_transform(test_clean[columna])

train_clean.info()

In [None]:
# Fill the remaining gaps with k-nearest-neighbour imputation (k=5, NaN-aware
# Euclidean distance). Train and test each get their own imputer, fitted on
# that frame's own columns.
train_impute_view = train_clean[columns_to_impute_train]
knn_imputer_train = KNNImputer(n_neighbors=5, metric='nan_euclidean')
train_clean[columns_to_impute_train] = knn_imputer_train.fit_transform(train_impute_view)

test_impute_view = test_clean[columns_to_impute_test]
knn_imputer_test = KNNImputer(n_neighbors=5, metric='nan_euclidean')
test_clean[columns_to_impute_test] = knn_imputer_test.fit_transform(test_impute_view)

remaining_nulls = train_clean.isnull().sum().sum()
print("No. of nulls in the dataset: ", remaining_nulls)

In [None]:
# Pairwise correlation matrix of every column in the cleaned training frame.
correlations = train_clean.corr()

In [None]:
# Horizontal bar chart of each column's correlation with SalePrice, strongest
# positive correlation at the top.
corr = train_clean.corr()
corr_sale = corr['SalePrice'].sort_values(ascending=False)

bar_labels = corr_sale.index
bar_lengths = corr_sale.values

plt.figure(figsize=(18, 14))
plt.barh(bar_labels, bar_lengths)
plt.title(
    "Relationship of the variables with  SalePrice",
    fontdict={'fontsize': 11, 'fontweight': 'bold'},
)
plt.xlabel("Correlation", size=12)
plt.ylabel("")
# barh draws the first entry at the bottom; flip the axis so it ends up on top.
plt.gca().invert_yaxis()

plt.show()

In [None]:
# Print every column whose absolute correlation with SalePrice is above the
# threshold.
threshold = 0.50

corr_matrix = train_clean.corr()
saleprice_corr = corr_matrix['SalePrice']

is_strong = saleprice_corr.abs() > threshold
high_corr_vars = saleprice_corr[is_strong]

for var, corr_value in high_corr_vars.items():
    print(f"{var} and SalePrice Correlation value: {corr_value:.2f}")

In [None]:
# Frequency of each distinct value of the raw (un-encoded) Street column.
street_count = train['Street'].value_counts()
print(street_count)

In [None]:
# OverallQual: distribution of the quality grades (left) and SalePrice per
# grade as a bar plot (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
dist_ax, price_ax = axes

sns.histplot(data=train_clean, x='OverallQual', bins=range(1, 11), kde=True, ax=dist_ax)
dist_ax.set_title('Distribution of qualities', size=11, weight='bold')
dist_ax.set_xlabel('Quality level', size=12)
dist_ax.set_ylabel('Frecuency', size=12)

sns.barplot(data=train_clean, x='OverallQual', y='SalePrice', ax=price_ax)
price_ax.set_title('Relationship between quality and sales price', size=11, weight='bold')
price_ax.set_xlabel('Quality level', size=12)
price_ax.set_ylabel('Selling price', size=12)

plt.tight_layout()
plt.show()

In [None]:
# GrLivArea: distribution of the living area (left) and living area against
# SalePrice (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
dist_ax, price_ax = axes

sns.histplot(data=train_clean, x='GrLivArea', bins=30, kde=True, ax=dist_ax)
dist_ax.set_title('Square footage distribution ', size=11, weight='bold')
dist_ax.set_xlabel('Ft2', size=12)
dist_ax.set_ylabel('Frecuency', size=12)

sns.scatterplot(data=train_clean, x='GrLivArea', y='SalePrice', ax=price_ax)
price_ax.set_title('Relationship between square footage and sales price', size=11, weight='bold')
price_ax.set_xlabel('Ft2', size=12)
price_ax.set_ylabel('Selling price', size=12)

plt.tight_layout()
plt.show()

## GarageCars with Ft^2 Analysis

In [None]:
# Garage features: distribution of GarageCars (left), distribution of
# GarageArea (middle) and the two against each other coloured by SalePrice
# (right).
fig, axes = plt.subplots(1, 3, figsize=(16, 6))
cars_ax, area_ax, joint_ax = axes

sns.histplot(data=train_clean, x='GarageCars', bins=range(6), kde=True, ax=cars_ax)
cars_ax.set_title('distribution of the number of parking spaces ', size=11, weight='bold')
cars_ax.set_xlabel('Parking spaces', size=12)
cars_ax.set_ylabel('Frecuency', size=12)

sns.histplot(data=train_clean, x='GarageArea', bins=30, kde=True, ax=area_ax)
area_ax.set_title('Square footage distribution', size=11, weight='bold')
area_ax.set_xlabel('Ft2', size=12)
area_ax.set_ylabel('Frecuency', size=12)

sns.scatterplot(data=train_clean, x='GarageCars', y='GarageArea', hue='SalePrice', ax=joint_ax)
joint_ax.set_title('Relationship between number of seats, size and selling price', size=11, weight='bold')
joint_ax.set_xlabel('Parking spaces', size=12)
joint_ax.set_ylabel('Ft2', size=12)

plt.tight_layout()
plt.show()

## TotalBsmtSF with Ft^2 Analysis

In [None]:
# TotalBsmtSF: distribution of the basement area (left) and basement area
# against SalePrice (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
dist_ax, price_ax = axes

sns.histplot(data=train_clean, x='TotalBsmtSF', bins=30, kde=True, ax=dist_ax)
dist_ax.set_title('Square footage distribution', size=11, weight='bold')
dist_ax.set_xlabel('Ft2', size=12)
dist_ax.set_ylabel('Frecuency', size=12)

sns.scatterplot(data=train_clean, x='TotalBsmtSF', y='SalePrice', ax=price_ax)
price_ax.set_title('Selling Price and TotalBstmF', size=11, weight='bold')
price_ax.set_xlabel('Ft2', size=12)
price_ax.set_ylabel('Selling price', size=12)

plt.tight_layout()
plt.show()

## Linear Regression Model Predict

In [None]:
# Baseline: linear regression of SalePrice on OverallQual alone.
# The model is fitted on the whole training frame, so the metrics below are
# in-sample (computed on the same rows the model was fitted on).
y_train = train_clean['SalePrice']
X_train = train_clean[['OverallQual']]
X_test = test_clean[['OverallQual']]

lr_model = LinearRegression().fit(X_train, y_train)

simple_predictions = lr_model.predict(X_test)

in_sample_fit = lr_model.predict(X_train)
rmse_simple = np.sqrt(mean_squared_error(y_train, in_sample_fit))
r2_simple = r2_score(y_train, in_sample_fit)

## Simple Linear Regression Predict Visualization

In [None]:
# Actual SalePrice (x) against the simple model's in-sample prediction (y);
# the dashed red diagonal marks a perfect fit.
price_span = [y_train.min(), y_train.max()]

plt.figure(figsize=(19, 6))
plt.scatter(y_train, lr_model.predict(X_train), alpha=0.6)
plt.plot(price_span, price_span, 'r--', lw=2)
plt.title('Simple Linear Regression')
plt.xlabel('')
plt.ylabel('')
plt.show()

print(f"Simple Linear Regression RMSE: {rmse_simple}")
print(f"Simple Linear Regression  R-squared: {r2_simple}")

## Multiple Linear Regression Model Predict

In [None]:
# Multiple linear regression on every column of train_clean except the target.
#
# This cell also defines X_train / X_test / y_train / y_test, the fitted
# `scaler` and `test_clean_scaled`, which the later model cells reuse.
X = train_clean.drop(columns=['SalePrice'])
y = train_clean['SalePrice']

# Split BEFORE scaling. Fitting the scaler on all rows first (as this cell did
# before) lets the mean/variance of the hold-out rows leak into the training
# features, which biases every validation metric computed from X_test.
# The split depends only on the row count and random_state, so y_train and
# y_test contain the same rows as before.
X_train_raw, X_holdout_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)   # statistics learned from training rows only
X_test = scaler.transform(X_holdout_raw)      # hold-out rows reuse the training statistics
X_scaled = scaler.transform(X)                # name kept in case another cell reads X_scaled

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Competition test set: same column order as X, scaled with the same scaler.
test_clean_aligned = test_clean[X.columns]
test_clean_scaled = scaler.transform(test_clean_aligned)
test_predictions = model.predict(test_clean_scaled)

# RMSE computed on log prices, i.e. it measures relative rather than absolute error.
# NOTE(review): np.log is undefined for non-positive values and a linear model
# is not guaranteed to predict positive prices - confirm y_pred > 0.
rmse_multiple = np.sqrt(mean_squared_error(np.log(y_test), np.log(y_pred)))
r2_multiple = r2_score(y_test, y_pred)

## Multiple Linear Regression Predict Visualization

In [None]:
# Hold-out SalePrice (x) against the multiple-regression prediction (y); the
# dashed red diagonal marks a perfect fit.
holdout_span = [y_test.min(), y_test.max()]

plt.figure(figsize=(19, 6))
plt.scatter(y_test, y_pred, alpha=0.6)
plt.plot(holdout_span, holdout_span, 'r--', lw=2)
plt.title('Multiple Linear Regression')
plt.xlabel('')
plt.ylabel('')
plt.show()

print(f"Multiple Linear Regression RMSE: {rmse_multiple}")
print(f"Multiple Linear Regression R-squared: {r2_multiple}")

## Ridge Regressor Predict

In [None]:
# Ridge regression; alpha is picked by a 10-fold grid search that minimises
# mean squared error.
ridge = Ridge()
ridge_params = {'alpha': [0.01, 0.1, 1, 10, 100]}
ridge_grid = GridSearchCV(
    estimator=ridge,
    param_grid=ridge_params,
    scoring='neg_mean_squared_error',
    cv=10,
)
ridge_grid.fit(X_train, y_train)
best_ridge = ridge_grid.best_estimator_

# Predictions for the hold-out split and for the competition test set.
y_pred_ridge = best_ridge.predict(X_test)
test_predictions_ridge = best_ridge.predict(test_clean_scaled)

# RMSE is taken on log prices; R-squared on the raw prices.
ridge_log_mse = mean_squared_error(np.log(y_test), np.log(y_pred_ridge))
rmse_ridge = np.sqrt(ridge_log_mse)
r2_ridge = r2_score(y_test, y_pred_ridge)

print(f"Ridge Regression RMSE: {rmse_ridge}")
print(f"Ridge Regression R-squared: {r2_ridge}")

## Lasso Regressor Predict

In [None]:
# Lasso regression; alpha is picked by a 5-fold grid search that minimises
# mean squared error. max_iter is raised to 20000 for the solver.
lasso = Lasso(max_iter=20000)
lasso_params = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
lasso_grid = GridSearchCV(
    estimator=lasso,
    param_grid=lasso_params,
    scoring='neg_mean_squared_error',
    cv=5,
)
lasso_grid.fit(X_train, y_train)
best_lasso = lasso_grid.best_estimator_

# Predictions for the hold-out split and for the competition test set.
y_pred_lasso = best_lasso.predict(X_test)
test_predictions_lasso = best_lasso.predict(test_clean_scaled)

# RMSE is taken on log prices; R-squared on the raw prices.
lasso_log_mse = mean_squared_error(np.log(y_test), np.log(y_pred_lasso))
rmse_lasso = np.sqrt(lasso_log_mse)
r2_lasso = r2_score(y_test, y_pred_lasso)

print(f"Lasso Regression RMSE: {rmse_lasso}")
print(f"Lasso Regression R-squarede: {r2_lasso}")

## k-NN Regressor Predict

In [None]:
# k-nearest-neighbours regression with k=5, scored on the hold-out split.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# RMSE is taken on log prices; R-squared on the raw prices.
knn_log_mse = mean_squared_error(np.log(y_test), np.log(y_pred_knn))
knn_rmse = np.sqrt(knn_log_mse)
knn_r2 = r2_score(y_test, y_pred_knn)

print(f"KNN Regression RMSE: {knn_rmse}")
print(f"KNN Regression R-squared: {knn_r2}")

## ElasticNet Regressor Model Predict

In [None]:
# ElasticNet with fixed hyper-parameters (alpha=0.1, equal L1/L2 mix), scored
# on the hold-out split.
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
elastic_net.fit(X_train, y_train)
y_pred_en = elastic_net.predict(X_test)

# RMSE is taken on log prices; R-squared on the raw prices.
en_log_mse = mean_squared_error(np.log(y_test), np.log(y_pred_en))
en_rmse = np.sqrt(en_log_mse)
en_r2 = r2_score(y_test, y_pred_en)

print(f"ElasticNet RMSE: {en_rmse}")
print(f"ElasticNet R-squared: {en_r2}")

## Gradient Boosting Regressor Model Predict

In [None]:
# Gradient boosting with 1000 estimators and learning rate 0.1, scored on the
# hold-out split.
gb = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1, random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

# RMSE is taken on log prices; R-squared on the raw prices.
gb_log_mse = mean_squared_error(np.log(y_test), np.log(y_pred_gb))
gb_rmse = np.sqrt(gb_log_mse)
gb_r2 = r2_score(y_test, y_pred_gb)

print(f"Gradient Boosting RMSE: {gb_rmse}")
print(f"Gradient Boosting R-squared: {gb_r2}")

## Category Boosting Regressor Model Predict [BEST SCORE]

In [None]:
# CatBoost regressor with default hyper-parameters, trained silently
# (verbose=0) and scored on the hold-out split.
catb = CatBoostRegressor()
catb_model = catb.fit(X_train, y_train, verbose=0)
y_pred_catb = catb_model.predict(X_test)

# RMSE is taken on log prices; R-squared on the raw prices.
catb_log_mse = mean_squared_error(np.log(y_test), np.log(y_pred_catb))
catb_rmse_calculator = np.sqrt(catb_log_mse)
catboost_r2_metric = r2_score(y_test, y_pred_catb)

print(f"Category Boosting RMSE Metric: {catb_rmse_calculator}")
print(f"Category Boosting R-squared Metric: {catboost_r2_metric}")

## Submission Best Model [CatBoost Regressor] Predicts

In [None]:
# Predict the competition test set with the fitted CatBoost model and write the
# two-column (Id, SalePrice) submission file.
test_pred_catboost = catb.predict(test_clean_scaled)

submission_columns = {
    'Id': test['Id'],
    'SalePrice': test_pred_catboost,
}
submission = pd.DataFrame(submission_columns)

print(submission.head())
submission.to_csv('/kaggle/working/submission.csv', index=False)