In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the house-price data set from its CSV file and preview the frame.
DATA_PATH = '/content/House Prices.csv'

df = pd.read_csv(DATA_PATH)
df

In [None]:
# Print column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Columns that a later cell removes from `df`.
drop_cols = [
    'Alley',
    'PoolQC',
    'Fence',
    'MiscFeature',
    'FireplaceQu',
]

In [None]:
from scipy import stats

# Fit a normal distribution to the raw target to see how far it is from Gaussian.
(mu, sigma) = stats.norm.fit(df['SalePrice'])

# `sns.distplot` is deprecated; `histplot` draws the same density histogram + KDE,
# and the fitted normal pdf is drawn explicitly so the legend label is attached
# to the correct curve.
fig, ax = plt.subplots(figsize = (12, 9))
sns.histplot(df['SalePrice'], stat = 'density', kde = True, ax = ax)

x_grid = np.linspace(df['SalePrice'].min(), df['SalePrice'].max(), 200)
ax.plot(
    x_grid,
    stats.norm.pdf(x_grid, mu, sigma),
    color = 'black',
    # Raw string: '\m' and '\s' are invalid escape sequences in a normal literal.
    label = r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma),
)

ax.legend(loc = 'best')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Log-transform the target in place: log1p(x) = log(1 + x).
# NOTE: this overwrites df['SalePrice']; re-running this cell without reloading
# the data applies the transform a second time.
df['SalePrice'] = np.log1p(df['SalePrice'])

# Re-fit the normal distribution on the transformed target.
(mu, sigma) = stats.norm.fit(df['SalePrice'])

# `sns.distplot` is deprecated; `histplot` draws the same density histogram + KDE,
# and the fitted normal pdf is drawn explicitly so the legend label is attached
# to the correct curve.
fig, ax = plt.subplots(figsize = (12, 9))
sns.histplot(df['SalePrice'], stat = 'density', kde = True, ax = ax)

x_grid = np.linspace(df['SalePrice'].min(), df['SalePrice'].max(), 200)
ax.plot(
    x_grid,
    stats.norm.pdf(x_grid, mu, sigma),
    color = 'black',
    # Raw string: '\m' and '\s' are invalid escape sequences in a normal literal.
    label = r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma),
)

ax.legend(loc = 'best')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Percentage of missing values per column; only columns with at least one
# missing value are kept, ordered from most to least incomplete.
missing_share = df.isnull().sum() / len(df) * 100
Isnull = missing_share[missing_share > 0].sort_values(ascending = False)
Isnull

In [None]:
# Turn the missing-percentage Series into a tidy frame with columns
# 'Name' (column label) and 'count' (percentage of missing values).
# A separate variable is used instead of rebinding `Isnull`: the original
# rebinding made the cell fail on a second run, because a DataFrame has no
# `to_frame` method.
missing_frame = Isnull.rename_axis('Name').reset_index(name = 'count')

sns.set(style = 'whitegrid')
fig, ax = plt.subplots(figsize = (13, 5))
sns.barplot(x = 'Name', y = 'count', data = missing_frame, ax = ax)

# The plotted values are percentages (see how `Isnull` is computed), not counts.
ax.set_ylabel('Missing values (%)')
ax.tick_params(axis = 'x', rotation = 90)
plt.show()

In [None]:
# Remove the columns listed in `drop_cols`.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns = drop_cols, errors = 'ignore')
df

In [None]:
from sklearn.impute import SimpleImputer

# Column groups by dtype: float64 columns are treated as numeric,
# object columns as categorical.
numeric_columns = df.select_dtypes(include=['float64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Mean for numeric columns, most frequent value (mode) for categorical ones.
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Fit each imputer on its column group and write the filled values back.
for columns, imputer in (
    (numeric_columns, numeric_imputer),
    (categorical_columns, categorical_imputer),
):
    df[columns] = imputer.fit_transform(df[columns])

df

In [None]:
# Re-check dtypes and non-null counts after the column drop and imputation cells.
df.info()

In [None]:
# Numeric columns only, without the 'Id' row identifier.
# The original called `df_corr.drop(columns='Id')` without assigning the result,
# so 'Id' silently stayed in the frame used for the correlation matrix.
df_corr = df.select_dtypes(include = [np.number]).drop(columns = 'Id')

df_corr

In [None]:
# Pairwise correlation of the numeric columns, shown as an annotated heatmap.
corr = df_corr.corr()

fig, ax = plt.subplots(figsize = (20, 9))
sns.heatmap(corr, annot = True, ax = ax)

In [None]:
# Boolean mask of features whose correlation with SalePrice is stronger than
# 0.5 in absolute value (either sign).
thres = corr['SalePrice'].abs() > 0.5
top_feature = corr.index[thres]

# Correlation among just those features, as an annotated heatmap.
fig, ax = plt.subplots(figsize = (20, 8))
top_corr = df[top_feature].corr()
sns.heatmap(top_corr, annot = True, ax = ax)
plt.show()

In [None]:
print('Find most important features relative to target')

# `df` still contains object (string) columns at this point, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0, so restrict to numeric columns.
# The sort returns a new frame instead of mutating in place.
corr = (
    df.select_dtypes(include = [np.number])
    .corr()
    .sort_values(['SalePrice'], ascending = False)
)
corr.SalePrice

In [None]:
# List the column labels currently present in `df`.
df.columns

In [None]:
# Replace missing values with the literal category 'None' in these columns.
# All four names are listed in `drop_cols` and are removed from `df` by an earlier
# cell, so unguarded access raises KeyError on a fresh Restart & Run All;
# only columns that are still present are touched.
for col in ['MiscFeature', 'Alley', 'Fence', 'FireplaceQu']:
    if col in df.columns:
        df[col] = df[col].fillna('None')

In [None]:
# NOTE(review): the earlier SimpleImputer cell already fills float64 and object
# columns, so most of these fills are expected to be no-ops when the notebook is
# run top to bottom - confirm which imputation strategy is actually intended.

# GarageType, GarageFinish, GarageQual and GarageCond: missing -> 'None'
for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    df[col] = df[col].fillna('None')

# GarageYrBlt, GarageArea and GarageCars: missing -> 0
for col in ['GarageYrBlt', 'GarageArea', 'GarageCars']:
    df[col] = df[col].fillna(0)

# BsmtFinType2, BsmtExposure, BsmtFinType1, BsmtCond and BsmtQual: missing -> 'None'
for col in ['BsmtFinType2', 'BsmtExposure', 'BsmtFinType1', 'BsmtCond', 'BsmtQual']:
    df[col] = df[col].fillna('None')

# Electrical: missing -> most frequent value.
# The original had the parenthesis misplaced (`.fillna(df['Electrical']).mode()[0]`),
# which assigned the scalar mode to the WHOLE column and turned it into a constant.
df['Electrical'] = df['Electrical'].fillna(df['Electrical'].mode()[0])

df['MasVnrArea'] = df['MasVnrArea'].fillna(0)
df['MasVnrType'] = df['MasVnrType'].fillna('None')
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())

In [None]:
# Names of all object-dtype columns, in their order of appearance in `df`.
catFeatures = list(df.select_dtypes(include=['object']).columns)

# Replace each categorical column with integer codes. The single encoder is
# re-fitted per column, so afterwards it only holds the classes of the last one.
le = LabelEncoder()

for feature in catFeatures:
    df[feature] = le.fit_transform(df[feature])

In [None]:
# Target vector: SalePrice as transformed by the earlier cells.
y = df['SalePrice'].values

# Feature matrix: everything except the target and the 'Id' row identifier.
# 'Id' is excluded from the correlation analysis earlier in the notebook and
# carries no information about the house, so it should not be a model input.
X = df.drop(columns = ['SalePrice', 'Id']).values

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 7,
)

## LINEAR REGRESSOR: Accuracy (R² score × 100) --> 88.99


In [None]:
from sklearn import linear_model

# Ordinary least-squares baseline.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

# Spot-check a single held-out row against its true (transformed) target.
print("Predict value: " + str(model.predict([X_test[142]]).round(2)))
print("Real value: " + str(y_test[142].round(2)))

# For regressors `score` is the coefficient of determination R^2, not a
# classification accuracy. Built-in round() is used because, depending on the
# scikit-learn version, `score` may return a plain Python float, which has no
# `.round()` method.
print('Accuracy: ', round(model.score(X_test, y_test) * 100, 2))

## RANDOM FOREST REGRESSOR: Accuracy (R² score × 100) --> 89.74

In [None]:
from sklearn.ensemble import RandomForestRegressor

# random_state makes the forest reproducible across runs (the original was
# unseeded, so the reported score changed on every execution); n_jobs=-1 builds
# the 1000 trees in parallel without affecting the result.
model = RandomForestRegressor(n_estimators = 1000, random_state = 7, n_jobs = -1)
model.fit(X_train, y_train)

# Spot-check a single held-out row against its true (transformed) target.
print("Predict value: " + str(model.predict([X_test[142]]).round(2)))
print("Real value: " + str(y_test[142].round(2)))

# For regressors `score` is the coefficient of determination R^2, not a
# classification accuracy. Built-in round() is used because, depending on the
# scikit-learn version, `score` may return a plain Python float, which has no
# `.round()` method.
print('Accuracy: ', round(model.score(X_test, y_test) * 100, 2))

## GRADIENT BOOSTING REGRESSOR: (TOP) Accuracy (R² score × 100) --> 92.11

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# random_state makes the boosted ensemble reproducible across runs
# (the original was unseeded).
GBR = GradientBoostingRegressor(n_estimators = 100, max_depth = 4, random_state = 7)
GBR.fit(X_train, y_train)

# Spot-check a single held-out row against its true (transformed) target.
print("Predict value: " + str(GBR.predict([X_test[142]]).round(2)))
print("Real value: " + str(y_test[142].round(2)))

# For regressors `score` is the coefficient of determination R^2, not a
# classification accuracy. Built-in round() is used because, depending on the
# scikit-learn version, `score` may return a plain Python float, which has no
# `.round()` method.
print('Accuracy: ', round(GBR.score(X_test, y_test) * 100, 2))

## Summary Report

We have used 3 regression models:

**1)** `LINEAR REGRESSOR`

**2)** `RANDOM FOREST REGRESSOR`

**3)** `GRADIENT BOOSTING REGRESSOR`

Among the three, the `GradientBoostingRegressor` achieved the highest accuracy (R² score on the held-out test set).