# Regression Model Steps (Preprocessing & Modeling):

### 1. Evaluation & Cleaning of Data
### 2. Feature Transformation
### 3. Encoding
### 4. Scaling
### 5. Target Transformation
### 6. Model Selection & Evaluation
### 7. Submission

In [None]:
!pip install -q pycaret #for model selection library

In [None]:
# import useful packages
import numpy as np # linear algebra
import pandas as pd # data processing 

# Show every column and up to 81 rows when a DataFrame is displayed.
# The fully qualified "display." keys are used on purpose: the short forms
# ('max_columns' / 'max_rows') are resolved by pattern matching and become
# ambiguous on pandas versions that also define styler.render.max_columns /
# styler.render.max_rows, which makes set_option raise an OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 81)

from sklearn.neighbors import KNeighborsRegressor #K&N for numeric missing values
import scipy.stats #Feature Transformation - will only apply for numeric features
from sklearn.preprocessing import StandardScaler 

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from pycaret.regression import setup, compare_models #setup function will perform essential inferences about the data

#import models after comparison - can run different ones if needed 
from catboost import CatBoostRegressor

from sklearn.model_selection import KFold, cross_val_score #cross validation for models

In [None]:
# Load the training set, the test set and the sample submission, in that order.
train0, test0, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
        '../input/house-prices-advanced-regression-techniques/sample_submission.csv',
    )
)

# Data Evaluation

In [None]:
# Display the raw training DataFrame.
train0

In [None]:
# Correlation heatmap to help see which features correlate most strongly with
# SalePrice (e.g. OverallQual, GrLivArea).
# Only the numeric columns are passed to corr(): recent pandas versions raise
# on non-numeric (object) columns instead of silently leaving them out.

plt.figure(figsize=[40, 30])
sns.heatmap(train0.select_dtypes(np.number).corr(), annot=True, cmap='coolwarm')
plt.xticks(fontsize=20, weight='bold')
plt.yticks(fontsize=20, weight='bold')

In [None]:
# Display the raw test DataFrame.
test0

In [None]:
# Display the sample submission; it shows the format the final submission must follow.
sample_submission

# Combine Train & Test Set
#### preprocess all data together then split before modeling

In [None]:
# List the column names of the training set.
train0.columns

In [None]:
# List the column names of the test set (for comparison with the training set).
test0.columns

In [None]:
# Set the target and the test ids aside, drop the columns that are not
# features, then stack train and test so that the preprocessing below
# (including NaN filling) is applied to both in the same way.
test_ids = test0['Id']
target = train0['SalePrice']

train1 = train0.drop(columns=['Id', 'SalePrice'])
test1 = test0.drop(columns=['Id'])

# ignore_index gives the stacked frame a fresh 0..n-1 index
data1 = pd.concat([train1, test1], axis=0, ignore_index=True)
data1  # combined data

In [None]:
# Work on a copy so the combined frame data1 stays available unchanged.
data2 = data1.copy()

# Cleaning

* Remove duplicate or irrelevant observations. Remove unwanted observations from your dataset, including duplicate observations or irrelevant observations
* **Fix structural errors**
* Filter unwanted outliers
* **Handle missing data**
* Validate and QA

## Ensure proper data types

In [None]:
# Show the numeric columns of the combined data. The aim is to spot numeric
# columns that are really categorical, so they can be converted to strings.
data1.select_dtypes(np.number)

In [None]:
# MSSubClass is stored as a number but treated as a category here, so convert it to string.
data2['MSSubClass'] = data2['MSSubClass'].astype(str)

## Fill Categorical Missing Values

In [None]:
# List the object columns to verify that MSSubClass is now among them.
data2.select_dtypes('object').columns

In [None]:
# Find the categorical (object) columns that contain missing values
object_na_flags = data2.select_dtypes('object').isna().any()
object_na_flags[object_na_flags].index

In [None]:
# Use the data descriptions to decide, per column, whether a missing value
# 1: is a genuine gap that should be filled with the column mode, or
# 2: is meaningful in itself and should get an explicit "None" label.

# 1: impute using the column mode
mode_filled_columns = (
    'MSZoning',
    'Utilities',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    'Electrical',
    'KitchenQual',
    'Functional',
    'SaleType',
)
for name in mode_filled_columns:
    most_frequent = data2[name].mode()[0]
    data2[name] = data2[name].fillna(most_frequent)

# 2: impute using a constant value
constant_filled_columns = (
    'Alley',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence',
    'MiscFeature',
)
for name in constant_filled_columns:
    data2[name] = data2[name].fillna("None")

In [None]:
# Object columns that still have at least one missing value after the fills
# above; compare against the data descriptions to validate the imputation.
remaining_na_flags = data2.select_dtypes('object').isna().any()
remaining_na_flags[remaining_na_flags].index

In [None]:
# Total number of missing values across the numeric columns only
# (select_dtypes(np.number) leaves the categorical columns out).
data2.select_dtypes(np.number).isna().sum().sum()

In [None]:
# Copy data2 so the numeric imputation below works on its own frame.
data3=data2.copy()

## Deal with numeric missing values

In [None]:
# Plan: use a KNeighborsRegressor, which estimates a missing value from the
# surrounding (most similar) data points.
# First look at the number of missing values in each numeric column:
data3.select_dtypes(np.number).isna().sum()

In [None]:
#Create KNN impute function that will take in the dataframe and the column and return the same dataframe with the columns missing value filled in
def knn_impute(df, na_target):
    """Fill the missing values of one numeric column using k-nearest neighbours.

    A ``KNeighborsRegressor`` with default settings is fitted on the rows
    where ``na_target`` is present, using as predictors every numeric column
    that has no missing values at all. Its predictions are then written into
    the rows where ``na_target`` is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``na_target``. It is not modified.
    na_target : str
        Name of the numeric column whose missing values should be filled.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the missing ``na_target`` values replaced.
    """
    df = df.copy()  # work on a copy so the caller's frame is not modified

    numeric_df = df.select_dtypes(np.number)

    # The row mask comes from the frame that was passed in, not from a
    # notebook-level variable, so the function works for any DataFrame.
    missing_rows = numeric_df[na_target].isna()
    if not missing_rows.any():
        # Nothing to impute: return the copy without asking the regressor
        # to predict on zero rows.
        return df

    # Predictors: numeric columns that are complete for every row.
    complete_columns = numeric_df.loc[:, numeric_df.notna().all()].columns

    x_train = numeric_df.loc[~missing_rows, complete_columns]
    y_train = numeric_df.loc[~missing_rows, na_target]
    x_test = numeric_df.loc[missing_rows, complete_columns]

    knn = KNeighborsRegressor()
    knn.fit(x_train, y_train)

    # The predictions for the incomplete rows become the imputed values.
    df.loc[missing_rows, na_target] = knn.predict(x_test)

    return df

In [None]:
# Sanity check that the imputation works: the returned copy should report no
# missing LotFrontage values. The result is not assigned, so data3 is unchanged.
knn_impute(data3, 'LotFrontage').isna().sum()

In [None]:
# Names of all columns in data3 that still contain missing values.
data3.columns[data3.isna().sum() >0]

In [None]:
# Impute the numeric columns one at a time; each call returns a new frame
# that becomes the input for the next column.
numeric_columns_to_impute = (
    'LotFrontage',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageYrBlt',
    'GarageCars',
    'GarageArea',
)
for numeric_column in numeric_columns_to_impute:
    data3 = knn_impute(data3, numeric_column)

In [None]:
# Total number of missing cell values left in data3; it should be 0 if every
# column with gaps was imputed. (data3.columns.isna() would only test the
# column labels for NaN, not the values in the frame.)
data3.isna().sum().sum()

In [None]:
# Copy the imputed frame; the feature transformations below modify data4 only.
data4 = data3.copy()

# Feature Transformation
### (modifying data but keeping the information)

## Log Transformation To Normalize Data distributions

#### Log transformations work well for normalizing data distributions because they are easy to invert, so results can be converted back and conclusions drawn in the original units.
#### This does not work as conveniently with square-root or reciprocal transformations.

In [None]:
# Certain models perform better when the data they receive is roughly
# normally distributed, which is not always the case for raw features.
# The skew of a column shows which way its distribution is leaning.
# A feature transformation can help correct that skew; scipy.stats is used to measure it.

scipy.stats.skew(data4['LotFrontage']) # skew of a single column; an absolute value of 0.5 or more is treated as skewed and in need of transformation
    # 0        - no skew, the data is symmetric
    # positive - right skewed
    # negative - left skewed

In [None]:
# Names of the numeric columns, i.e. the candidates for a skew check.
data4.select_dtypes(np.number).columns

In [None]:
# Compute the skew of every numeric column to identify which ones need a transformation
numeric_features = data4.select_dtypes(np.number).columns
skew_df = pd.DataFrame({'Feature': numeric_features})
skew_df['Skew'] = [scipy.stats.skew(data4[feature_name]) for feature_name in numeric_features]
skew_df['Abs Skew'] = skew_df['Skew'].abs()  # the direction of the skew does not matter here
skew_df['Skewed'] = skew_df['Abs Skew'] >= 0.5  # flag the columns considered skewed
skew_df

In [None]:
# Names of all columns flagged as skewed in the table above
skew_df.query("Skewed == True")['Feature'].values

In [None]:
# Summary statistics for the skewed columns
data4[skew_df.query("Skewed == True")['Feature'].values].describe()

# Issue: some minimum values are 0, and the logarithm is undefined at 0. A useful trick is to take log(x + 1) where the minimum is 0.

In [None]:
# numpy provides log1p, which computes log(1 + x) and is therefore defined at 0
skewed_columns = skew_df.loc[skew_df['Skewed'], 'Feature']
for skewed_name in skewed_columns:
    # apply the log transformation to each skewed column
    data4[skewed_name] = np.log1p(data4[skewed_name])

In [None]:
# Recompute the skew of every numeric column to see the effect of the log transformation
numeric_features = data4.select_dtypes(np.number).columns
skew_df = pd.DataFrame({'Feature': numeric_features})
skew_df['Skew'] = [scipy.stats.skew(data4[feature_name]) for feature_name in numeric_features]
skew_df['Absolute Skew'] = skew_df['Skew'].abs()  # the direction of the skew does not matter here
skew_df['Skewed'] = skew_df['Absolute Skew'] >= 0.5  # flag the columns still considered skewed
skew_df

In [None]:
# Distributions from data3 (before the log transformation), each drawn with a
# density estimate and a fitted normal curve, side by side.
# NOTE(review): distplot is deprecated in newer seaborn releases - confirm the installed version still provides it.
plt.figure(figsize=(20, 10))

for panel, feature_name in enumerate(['LotFrontage', 'GrLivArea'], start=1):
    plt.subplot(1, 2, panel)
    sns.distplot(data3[feature_name], kde=True, fit=scipy.stats.norm)
    plt.title("B4 Log Transformation")

In [None]:
# The same two distributions from data4 (after the log transformation), each
# drawn with a density estimate and a fitted normal curve, side by side.
# NOTE(review): distplot is deprecated in newer seaborn releases - confirm the installed version still provides it.
plt.figure(figsize=(20, 10))

for panel, feature_name in enumerate(['LotFrontage', 'GrLivArea'], start=1):
    plt.subplot(1, 2, panel)
    sns.distplot(data4[feature_name], kde=True, fit=scipy.stats.norm)
    plt.title("With Log Transformation")

In [None]:
# Copy the transformed frame; encoding below replaces data5 only.
data5 = data4.copy()

# Encoding
#### Used to add categorical columns to model

In [None]:
# pd.get_dummies converts the categorical columns into dummy/indicator columns.

data5 = pd.get_dummies(data5)

In [None]:
# Number of columns before encoding: count() yields one entry per column,
# and counting those entries gives the number of columns.
data4.count().count()

In [None]:
# Number of columns after encoding, for comparison with data4.
data5.count().count()

In [None]:
# Copy the encoded frame; scaling below replaces data6 only.
data6 = data5.copy()

# Feature Scaling

### ML algorithms and regression models typically calculate the distance between data points (this is separate from transformation, which deals with the distribution of a single feature)
### All features should be normalized so that each individual feature contributes an equal proportion to the model (fruit blender analogy)


In [None]:
# Standard scaling centers every column at 0 with a variance of 1.
# Prediction models tend to work more effectively when all features share one scale.

scaler = StandardScaler()
scaled_values = scaler.fit_transform(data6)  # learn mean/variance and apply them in one step
data6 = pd.DataFrame(scaled_values, index=data6.index, columns=data6.columns)

In [None]:
# Display the frame before scaling.
data5

In [None]:
# Display the frame after scaling.
data6

In [None]:
# Copy the scaled frame; data7 is what gets split back into train and test.
data7 = data6.copy()

# Target Transformation 
#### Same as feature transformation but for our target variable

In [None]:
# Check that the minimum target value is above 0 before applying a log transformation
np.min(target)

In [None]:
# The target is transformed separately from the features: transforming it
# changes the unit the model predicts in, so the transformation must be
# undone before the results are analysed.

# Compare the target distribution, together with a fitted normal curve,
# without and with a log transform.
plt.figure(figsize=(20, 10))

raw_axis = plt.subplot(1, 2, 1)
sns.distplot(target, kde=True, fit=scipy.stats.norm, ax=raw_axis)
raw_axis.set_title("Without Log Transform")

log_axis = plt.subplot(1, 2, 2)
sns.distplot(np.log(target), kde=True, fit=scipy.stats.norm, ax=log_axis)
log_axis.set_xlabel("Log Sale Price")
log_axis.set_title("With Log Transform")
plt.show()

In [None]:
# The plots above show the log transform gives a better-behaved target
# distribution, so apply it. Predictions made on this scale must be passed
# through np.exp to get back to sale prices.
log_target = np.log(target)
log_target

# Split Data

In [None]:
# Split the preprocessed frame back into its two parts: the first len(train0)
# rows are the training rows, everything after them is the test set.
# The model is fitted on the train part and used to predict the test part.
n_train_rows = len(train0)
train_final = data7.iloc[:n_train_rows, :].copy()
test_final = data7.iloc[n_train_rows:, :].reset_index(drop=True).copy()

# Model Selection

In [None]:
# Display the log-transformed target.
log_target

In [None]:
# Preview the data passed to model selection: the training features with the
# log-transformed target appended as the last column.
pd.concat([train_final, log_target], axis=1)

In [None]:
# Initialise pycaret's model selection with the training features plus the
# log-transformed target column.
selection_frame = pd.concat([train_final, log_target], axis=1)
_ = setup(data=selection_frame, target='SalePrice')

In [None]:
# Compare the candidate models. Each takes a different approach to the
# prediction; look for the top recommendation and the lowest RMSE.
compare_models()

# Baseline Model

In [None]:
# Baseline model: a CatBoost regressor, created with verbose=0.
baseline_model = CatBoostRegressor(verbose=0)

In [None]:
# Fit on the training features against the log-transformed target.
baseline_model.fit(train_final, log_target)

# Evaluate

In [None]:
# Cross-validate the baseline model on train_final.
# KFold makes 10 splits and cross_val_score returns one score per split, so
# `results` holds the negative mean squared error (on the log target) of each fold.
Kf = KFold(n_splits=10)

results = cross_val_score(baseline_model, train_final, log_target, scoring='neg_mean_squared_error', cv=Kf)

In [None]:
# Display the per-fold negative mean squared errors.
results

In [None]:
# Summarise the cross-validation error: -results are the per-fold MSEs on the
# log target, sqrt(mean(...)) is the RMSE in log units, and exp() converts that
# into a multiplicative error factor on the sale price (it is not a price).
np.exp(np.sqrt(np.mean(-results)))

In [None]:
# Compare the model's predictions with the training actuals.
# NOTE: the model was fitted on these same rows, so this is an in-sample
# check and not an estimate of accuracy on unseen data.

Train_Prediction = np.exp(baseline_model.predict(train_final))  # undo the log transform of the target

# Label the rows with the training Ids. test_ids holds the Ids of the test
# rows, which do not belong to the training rows being predicted here.
tp = pd.concat(
    [train0['Id'], target, pd.Series(Train_Prediction, name='SalePricePredict')],
    axis=1,
)
# NOTE(review): the cross-validation figure computed earlier is exp(RMSE in
# log units), i.e. a multiplicative error factor (a value around 1.12 would
# mean roughly 12% typical error), not an error expressed in sale-price units.

print(tp)

plt.figure(figsize=(40, 15))

x = tp.Id
y = tp.SalePrice

x2 = tp.Id
y2 = tp.SalePricePredict

plt.plot(x, y, label='Actual')
plt.plot(x2, y2, label='Prediction')
plt.legend(loc='upper center')

# Submission

In [None]:
# Display the sample submission again as a reference for the output format.
sample_submission

In [None]:
# Predict on the test features and undo the log transform of the target.
log_scale_predictions = baseline_model.predict(test_final)
final_predictions = np.exp(log_scale_predictions)

# Pair every test Id with its predicted sale price.
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': final_predictions})

In [None]:
# Display the submission frame before writing it out.
submission

In [None]:
# Write the submission to ./submission.csv with a header row and without the DataFrame index.
submission.to_csv('./submission.csv',index=False, header=True)