## House Prices - Advanced Regression Techniques
This notebook presents an exploratory data analysis of the House Prices dataset, followed by model training and prediction.

In [None]:
#!pip install pandas
#!pip install seaborn
#!pip install autogluon

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Load the raw training data (path is relative to the notebook's directory).
house_train_df = pd.read_csv(filepath_or_buffer="../data/raw_data/train.csv")

# Leave the preview as the cell's last expression so Jupyter renders it as a
# table; print() falls back to wrapped plain text, which is hard to read for
# a frame with many columns.
house_train_df.head()



In [None]:
# Dimensions of the training frame as a (rows, columns) tuple.
house_train_df.shape

In [None]:
## columns with na values
# Build a boolean flag per column (True when the column holds at least one
# missing value), then print only the flagged columns.
nacolumns = house_train_df.isnull().any(axis=0)
print(nacolumns.loc[nacolumns])

In [None]:
# remove id column
# errors="ignore" keeps this cell re-runnable: once "Id" has been dropped,
# executing the cell again would otherwise raise a KeyError.
house_train_df = house_train_df.drop(columns="Id", errors="ignore")

In [None]:
# Histogram of the target column (SalePrice) using 50 bins.
sale_price = house_train_df['SalePrice']
sale_price.hist(
    bins=50,
    figsize=(4, 4),
    xlabelsize=8,
    ylabelsize=8,
)

In [None]:
# Numerical data: keep only the float64 / int64 columns of the training frame.
numeric_dtypes = ['float64', 'int64']
df_num = house_train_df.select_dtypes(include=numeric_dtypes)


In [None]:
# Preview the first rows of the numeric-only frame.
df_num.head()

In [None]:
# One histogram per numeric column. The trailing semicolon suppresses the
# textual repr of the returned axes so only the figure is shown.
hist_options = dict(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8)
df_num.hist(**hist_options);

Columns where NA means the feature is absent (the NA is itself informative, not missing data):
- Alley NA =>> no alley
- BsmtQual NA =>> no basement
- BsmtCond NA =>> no basement (Shall we check that BsmtCond is consistent with BsmtQual?)
- BsmtFinType1
- BsmtFinType2
- BsmtExposure NA =>> no basement (the separate category "No" means no exposure)
- FireplaceQu
- GarageType
- GarageFinish
- GarageQual
- GarageCond
- PoolQC
- Fence
- MiscFeature

Columns where NA means the value is genuinely missing (no information available):
- LotFrontage
- Electrical
- MasVnrArea
- GarageYrBlt

In [None]:
## replace na with mean
# Assign the filled frame back instead of calling
# `house_train_df[col].fillna(..., inplace=True)`: that chained in-place form
# acts on a temporary column object, emits a FutureWarning in pandas 2.x and
# leaves the frame unchanged once Copy-on-Write is enabled.
mean_imputed_columns = ['LotFrontage', 'MasVnrArea']
column_means = house_train_df[mean_imputed_columns].mean().to_dict()
house_train_df = house_train_df.fillna(value=column_means)

# Show which columns still contain missing values after the imputation.
nacolumns = house_train_df.isna().any()
print (nacolumns[nacolumns])



In [None]:
## The columns that still contain NA are treated as real NAs.
## One-hot encoding of those columns (with a dedicated NA indicator column via
## dummy_na=True) is sketched below but currently DISABLED, so the frame is
## left unchanged by this cell.

#print(house_train_df.shape)
#for col in house_train_df.columns[nacolumns]:
#    house_train_df = pd.concat([house_train_df, pd.get_dummies(house_train_df[col], dummy_na=True, prefix=col)],axis=1)
#
# Print the (rows, columns) shape of the training frame.
print(house_train_df.shape)


In [None]:

# Jupyter only renders the LAST expression of a cell, so the summary
# statistics must be displayed explicitly; as a bare statement the result of
# describe() was computed and silently discarded.
# `display` is the built-in provided by the IPython kernel.
display(house_train_df.describe())
house_train_df.head()

# Let's move to the model training


In [None]:
# split of data into training and test set
from sklearn.model_selection import train_test_split

# Re-select the numeric columns from the CURRENT `house_train_df` rather than
# reusing `df_num`: `df_num` was created before the mean imputation, so
# splitting it would train on un-imputed LotFrontage / MasVnrArea while the
# Kaggle test frame is imputed before prediction.
model_df = house_train_df.select_dtypes(include = ['float64', 'int64'])

# 75% / 25% split with a fixed seed for reproducibility.
train, test = train_test_split(model_df, test_size = 0.25, random_state = 42)


### We will try XGBoost
No feature scaling is needed with XGBoost: tree-based models are insensitive to the scale of the input features.

In [None]:
import xgboost as xgb

# Separate the target (SalePrice) from the predictors for both halves of the
# split; these four names are reused by later cells.
y_train = train['SalePrice']
y_test = test['SalePrice']
X_train = train.drop(columns=['SalePrice'])
X_test = test.drop(columns=['SalePrice'])

# Gradient-boosted regressor configuration.
xgb_params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.1,
    'max_depth': 30,
    'n_estimators': 100,
}
model = xgb.XGBRegressor(**xgb_params)

# Fit on the training predictors/target built above.
model.fit(X_train, y_train)

In [None]:
# For a regressor, `score` returns the coefficient of determination (R^2),
# not a classification accuracy, so the message is labelled accordingly.
result = model.score(X_test, y_test)
print("R^2 on the hold-out split : {}".format(result))

### Prepare the test set for prediction

In [None]:
test_df = pd.read_csv(filepath_or_buffer="../data/raw_data/test.csv")

# apply same transformations
# Impute with the TRAINING means so the test frame receives exactly the
# transformation applied to the training data (no statistics are taken from
# the test set). The filled frame is assigned back because the chained
# `test_df[col].fillna(..., inplace=True)` form acts on a temporary object
# and does not update the frame under pandas Copy-on-Write.
train_means = house_train_df[['LotFrontage', 'MasVnrArea']].mean().to_dict()
test_df = test_df.fillna(value=train_means)

# Keep the ids aside (used later as the index of the output file) and drop
# the column from the features.
test_id_df = test_df['Id']
test_df = test_df.drop(columns="Id")
print(test_id_df)


## Use AutoGluon

In [None]:
# Train with multiple algorithms
from autogluon.tabular import TabularDataset, TabularPredictor
predictor = TabularPredictor(label="SalePrice", problem_type='regression' , eval_metric = 'root_mean_squared_error').fit(train_data = train, time_limit = 200, presets = "good_quality", num_gpus=0)


In [None]:
# Show the leaderboard of the models fitted by the AutoGluon predictor.
predictor.leaderboard()

In [None]:
import matplotlib.pyplot as plt

# Predict on the hold-out predictors and plot predictions against the true
# sale prices as red triangle markers.
y_predict = predictor.predict(X_test)

fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(y_test, y_predict, "^", color='r')
ax.set_ylabel('Model Predictions')
ax.set_xlabel('True Values')

In [None]:
# Predict with the AutoGluon predictor on the prepared test frame.
results = predictor.predict(test_df)

In [None]:
# output results
import datetime
import os

# Label every prediction with the id of the corresponding test row.
results.index = test_id_df

# Get the current date in ISO format
current_date = datetime.date.today().isoformat()

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (no-op when it is already there).
output_dir = "../outputs"
os.makedirs(output_dir, exist_ok=True)

filename = f"{output_dir}/outputs_{current_date}.csv"
print(filename)

# Save results as a CSV file with the current date in the filename, plus an
# undated copy that always holds the latest run.
results.to_csv(filename)
results.to_csv(f"{output_dir}/outputs.csv")