### Load Packages:

In [1]:
# Numpy will be used for Linear Algebra
import numpy as np

# Pandas will be used for DataFrames
import pandas as pd
# Display all Columns 
pd.set_option('display.max_columns', None)

# Matplotlib for Visualization
import matplotlib.pyplot as plt
# Display plots in notebook
%matplotlib inline

# Seaborn for easier Visualization
import seaborn as sns
# Change color scheme
sns.set_style("whitegrid")

# Sys for size of Dataset
import sys

### Sklearn Packages:

In [2]:
# Import r2_score and mean_absolute_error functions
from sklearn.metrics import r2_score 
from sklearn.metrics import mean_absolute_error

# Preprocessing
from sklearn.preprocessing import StandardScaler,Imputer,PolynomialFeatures

# Model Selection
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,StratifiedKFold,RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge,RidgeCV,Lasso,LassoCV,ElasticNet,ElasticNetCV

# Tree
from sklearn.tree import DecisionTreeRegressor

# Ensemble Methods
from sklearn.ensemble import BaggingRegressor,RandomForestRegressor,AdaBoostRegressor,ExtraTreesRegressor,GradientBoostingRegressor

# Pipelines
from sklearn.pipeline import Pipeline,make_pipeline

# Feature Selection
from sklearn.feature_selection import VarianceThreshold,SelectKBest

## 1. Load the Cleaned Data

In [3]:
# Load training set into a pandas DataFrame
train = pd.read_csv('../Datasets/cleaned_train.csv')

# Load the testing set into a pandata DataFrame
test = pd.read_csv("../Datasets/cleaned_test.csv")

## 2. Identify Features to use

In [4]:
# Create a list that contains all the train columns names
total_train_columns = train.columns.tolist() 
# Create a list that contains all the test columns names
total_test_columns = test.columns.tolist() 
# Create a list that contains all the train and test columns names
train_test_columns = train.columns.tolist() + test.columns.tolist()

In [5]:
# Print the length of the total train columns
print("Length of total train columns:",len(total_train_columns))
# Print the length of the total test columns
print("Length of total train columns:",len(total_test_columns))
# Print the length of the total train and test columns
print("Length of total train columns:",len(train_test_columns))

Length of total train columns: 190
Length of total train columns: 174
Length of total train columns: 364


In [6]:
# Only use columns that are contained in both training and testing set
features_to_use = set.intersection(set(total_train_columns), set(total_test_columns))

In [7]:
# My features will not contain SalePrice or ID
features_to_use = [col for col in features_to_use if (col != 'SalePrice') | (col != 'Id')]

In [8]:
# DataFrame of Train and Test Features to use
train_features = train.loc[:,features_to_use]
test_features = test.loc[:,features_to_use]

In [9]:
# Print the top 25 absolute correlations
train.corr()["SalePrice"].apply(lambda x: abs(x)).sort_values(ascending = False).head(10)

SalePrice      1.000000
OverallQual    0.806139
ExterQual      0.717360
GrLivArea      0.707484
KitchenQual    0.693939
GarageArea     0.653927
TotalBsmtSF    0.650375
Bath           0.646339
1stFlrSF       0.631378
BsmtQual       0.623871
Name: SalePrice, dtype: float64

## 3. Identify the Target Variable

In [10]:
# Name the target variable
target = train['SalePrice']

# Create the log transformation of SalePrice
target_log = np.log1p(train['SalePrice'])

## 4. Split and Scale the data

#### A. Split into Train and Holdout set

In [11]:
# X will be all features in the training set excluding the target variable

features = ['OverallQual', 'ExterQual','GrLivArea', 'KitchenQual', 'GarageArea','TotalBsmtSF','Bath','1stFlrSF']

X = train_features[features]


# Y will represent the target variable
y = target_log

# Train test split and use holdout set because we have an actual testing set
X_train,X_holdout,y_train,y_holdout = train_test_split(X,y,random_state = 42)

#### B. Scale the data

In [12]:
# Instantiate a StandardScaler object
ss= StandardScaler()

# Fit the Scaler on the training data
ss.fit(X_train)

# Transform training scaled and holdout scaled set
X_train_scaled = ss.transform(X_train)
X_holdout_scaled = ss.transform(X_holdout)

## 5. Modeling

In [13]:
# Function to calculate root mean square error for each model
def rmse_cv(model):
    rmse= np.sqrt(-cross_val_score(model, X_train_scaled, y_train, scoring="neg_mean_squared_error", cv = 10))
    return(rmse)

### Phase 1: Regularized Regression Models

#### A. Simple Linear Regression

In [14]:
lr = LinearRegression()
lr.fit(X_train_scaled,y_train)
print("Linear Regression Scores:",cross_val_score(lr,X_train_scaled,y_train,cv=10))
print("Mean Score for Linear Regression:",np.mean(cross_val_score(lr,X_train_scaled,y_train,cv=10)))
print("The root mean square error for Linear Regression is:",rmse_cv(lr).mean().round(4))

Linear Regression Scores: [0.77153486 0.87156474 0.85406164 0.85594019 0.79585892 0.88793453
 0.83520211 0.63518659 0.79844283 0.87463172]
Mean Score for Linear Regression: 0.8180358129807287
The root mean square error for Linear Regression is: 0.1696




##### Using the above features we can accurately guess the SalePrice 81% of the time based on the Linear Regression model

#### Note: Since I will be using the same process I will write a function

In [15]:
def the_models_selection(model_name):
    """This function prints out the scores for the selected model"""
    
    # Instantiate the model
    mn = model_name
    
    # Fit the model
    mn_model = mn.fit(X_train_scaled,y_train)
    
    # Print out the name of Model being used
    print(mn)
    
    # Score the model
    mn_score_train = mn_model.score(X_train_scaled,y_train)
    mn_score_test = mn_model.score(X_holdout_scaled,y_holdout)
    
    # Root Mean Squared Error
    rmse= np.sqrt(-cross_val_score(mn, X_train_scaled, y_train, scoring="neg_mean_squared_error", cv = 10))
    
    # Print out the scores mean
    print("Train Score:",mn_score_train)
    print("Holdout Score:",mn_score_train)
    print("RMSE:",rmse.mean().round(4))
    

#### B. Ridge Regression

In [16]:
the_models_selection(RidgeCV(alphas=np.logspace(0,5,200),cv=10))

RidgeCV(alphas=array([1.00000e+00, 1.05956e+00, ..., 9.43788e+04, 1.00000e+05]),
    cv=10, fit_intercept=True, gcv_mode=None, normalize=False,
    scoring=None, store_cv_values=False)
Train Score: 0.8250176341308525
Holdout Score: 0.8250176341308525
RMSE: 0.1694


#### C. Lasso Regression

In [17]:
the_models_selection(LassoCV(n_alphas=524,cv=10))

LassoCV(alphas=None, copy_X=True, cv=10, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=524, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)
Train Score: 0.8253221037290417
Holdout Score: 0.8253221037290417
RMSE: 0.1697


#### D. ElasticNet Regression

In [18]:
the_models_selection(ElasticNetCV(l1_ratio=np.linspace(.01, 1.0, 29),n_alphas = 25,cv = 10))

ElasticNetCV(alphas=None, copy_X=True, cv=10, eps=0.001, fit_intercept=True,
       l1_ratio=array([0.01   , 0.04536, 0.08071, 0.11607, 0.15143, 0.18679, 0.22214,
       0.2575 , 0.29286, 0.32821, 0.36357, 0.39893, 0.43429, 0.46964,
       0.505  , 0.54036, 0.57571, 0.61107, 0.64643, 0.68179, 0.71714,
       0.7525 , 0.78786, 0.82321, 0.85857, 0.89393, 0.92929, 0.96464,
       1.     ]),
       max_iter=1000, n_alphas=25, n_jobs=1, normalize=False,
       positive=False, precompute='auto', random_state=None,
       selection='cyclic', tol=0.0001, verbose=0)
Train Score: 0.8251078427361072
Holdout Score: 0.8251078427361072
RMSE: 0.1695


### Phase 2: Pipelines


In [19]:
# X will be all features in the training set excluding the target variable

features = ['OverallQual', 'ExterQual','GrLivArea', 'KitchenQual', 'GarageArea','TotalBsmtSF','Bath','1stFlrSF']

X = train_features[features]


# Y will represent the target variable
y = target_log

# Train test split and use holdout set because we have an actual testing set
X_train,X_holdout,y_train,y_holdout = train_test_split(X,y,random_state = 42)

#### A. Lasso

In [20]:
# Name variables
imp = Imputer()
ss = StandardScaler()
lasso = Lasso(max_iter=5000)

# Build Pipeline
pipe = Pipeline([ 
    ('imp',imp),
    ('ss', ss),
    ('lasso', lasso)
])

# Hyperparameters
params = {
    'imp__strategy': ['mean', 'median','most_frequent'], # Tune hyperparameters for imputation strategy
    'lasso__alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10] # Tune hyperparameters for lasso alpha
}

# GridSearchCV model
gs = GridSearchCV(pipe, param_grid=params, cv=10)
gs.fit(X_train, y_train) # Fit and tune model

print("Best Score:",gs.best_score_) # Print Best Score
print("Best Params:",gs.best_params_) # Print Best parameters

Best Score: 0.8181212566485693
Best Params: {'imp__strategy': 'mean', 'lasso__alpha': 0.005}


#### B. Ridge

In [None]:
# Name variables
imp = Imputer()
ss = StandardScaler()
ridge = Ridge(max_iter=5000)

# Build Pipeline
pipe = Pipeline([ 
    ('imp',imp),
    ('ss', ss),
    ('ridge', ridge)
])

# Hyperparameters
params = {
    'imp__strategy': ['mean', 'median','most_frequent'], # Tune hyperparameters for imputation strategy
    'ridge__alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10] # Tune hyperparameters for ridge alpha
}

# GridSearchCV model
gs = GridSearchCV(pipe, param_grid=params, cv=10)
gs.fit(X_train, y_train) # Fit and tune model

print("Best Score:",gs.best_score_) # Print Best Score
print("Best Params:",gs.best_params_) # Print Best parameters

Best Score: 0.8181659330479772
Best Params: {'imp__strategy': 'mean', 'ridge__alpha': 10}


#### C. ElasticNet

In [None]:
# Name variables
imp = Imputer()
ss = StandardScaler()
elasticnet = ElasticNet(max_iter=5000)

# Build Pipeline
pipe = Pipeline([ 
    ('imp',imp),
    ('ss', ss),
    ('elasticnet', elasticnet)
])

# Hyperparameters
params = {
    'imp__strategy': ['mean', 'median','most_frequent'], # Tune hyperparameters for imputation strategy
    'elasticnet__alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10], # Tune hyperparameters for elasticnet alpha
    'elasticnet__l1_ratio': [0.1, 0.5, 0.09] # Tune hyperparameters for l1 ratio
  
}

# GridSearchCV model
gs = GridSearchCV(pipe, param_grid=params, cv=10)
gs.fit(X_train, y_train) # Fit and tune model

print("Best Score:",gs.best_score_) # Print Best Score
print("Best Params:",gs.best_params_) # Print Best parameters

#### D. RandomForest

In [None]:
# Name variables
imp = Imputer()
ss = StandardScaler()
randomforest = RandomForestRegressor(verbose = 1, n_jobs=-1)

# Build Pipeline
pipe = Pipeline([ 
    ('imp',imp),
    ('ss', ss),
    ('randomforest', randomforest)
])

# Hyperparameters
params = {
    'imp__strategy': ['mean', 'median','most_frequent'], # Tune hyperparameters for imputation strategy
    'randomforest__n_estimators': [25,50,200], # Tune hyperparameters for n_estimators
    'randomforest__max_depth': [80,50,200], # Tune hyperparameters for max_depth
    'randomforest__max_features': ['auto','sqrt'] # Tune hyperparameters for max_features

}

# GridSearchCV model
gs = GridSearchCV(pipe, param_grid=params, cv=10)
gs.fit(X_train, y_train) # Fit and tune model

print("Best Score:",gs.best_score_) # Print Best Score
print("Best Params:",gs.best_params_) # Print Best parameters

#### E. GradientBoost

In [None]:
# Name variables
imp = Imputer()
ss = StandardScaler()
gradientboost = GradientBoostingRegressor()

# Build Pipeline
pipe = Pipeline([ 
    ('imp',imp),
    ('ss', ss),
    ('gradientboost', gradientboost)
])

# Hyperparameters
params = {
    'imp__strategy': ['mean', 'median','most_frequent'], # Tune hyperparameters for imputation strategy
    'gradientboost__n_estimators': [25,50,200], # Tune hyperparameters for n_estimators
    'gradientboost__learning_rate': [.05,.2,.5], # Tune hyperparameters for learning_rate
    'gradientboost__max_depth': [1,5], # Tune hyperparameters for max_depth
}

# GridSearchCV model
gs = GridSearchCV(pipe, param_grid=params, cv=10)
gs.fit(X_train, y_train) # Fit and tune model

print("Best Score:",gs.best_score_) # Print Best Score
print("Best Params:",gs.best_params_) # Print Best parameters

### Phase 3: Use all features with best model


In [None]:
# X will be all features in the training set excluding the target variable
X = train_features

# Y will represent the target variable
y = target_log

# Train test split and use holdout set because we have an actual testing set
X_train,X_holdout,y_train,y_holdout = train_test_split(X,y,random_state = 42)

In [None]:
# Name variables
imp = Imputer()
ss = StandardScaler()
randomforest = RandomForestRegressor(verbose = 1, n_jobs=-1)

# Build Pipeline
pipe = Pipeline([ 
    ('imp',imp),
    ('ss', ss),
    ('randomforest', randomforest)
])

# Hyperparameters
params = {
    'imp__strategy': ['mean', 'median','most_frequent'], # Tune hyperparameters for imputation strategy
    'randomforest__n_estimators': [25,50,200], # Tune hyperparameters for n_estimators
    'randomforest__max_depth': [80,50,200], # Tune hyperparameters for max_depth
    'randomforest__max_features': ['auto','sqrt'] # Tune hyperparameters for max_features

}

# GridSearchCV model
gs = GridSearchCV(pipe, param_grid=params, cv=10)
gs.fit(X_train, y_train) # Fit and tune model

print("Best Score:",gs.best_score_) # Print Best Score
print("Best Params:",gs.best_params_) # Print Best parameters

In [None]:
# Print Holdout Accuracy
print("Holdout Accuracy:",gs.score(X_holdout,y_holdout))

In [None]:
randomforest.fit(X_train,y_train)
randomforest.feature_importances_

In [None]:
# Feature importance of this model 

feature_importance = pd.DataFrame({
        'Feature':X.columns,
        'Importance':randomforest.feature_importances_
    })

feature_importance.sort_values('Importance', ascending=False, inplace=True)
feature_importance.head(10)

In [None]:
# Plot Feature importance of this model 
fig = plt.figure(figsize=(10,4))

fig = sns.barplot(x='Feature', y='Importance', data=feature_importance.head(11))
fig.set_title('Random Forest Feature Importance');

## 6. Predicting on holdout and real set

In [None]:
# Prediction on holdout
pred = gs.predict(X_holdout)
# Calculate and print R^2 and MAE for holdout
print('R^2: ', r2_score(y_holdout, pred))
print('MAE: ', mean_absolute_error(y_holdout, pred))

In [None]:
# Plot Training and Holdout set predictions 

plt.figure(figsize=(12, 8))  # Figure size
plt.subplot(1,2,1)  # Subplot in row one column one
plt.scatter(x=y_train, y = gs.predict(X_train))
plt.ylabel('Predicted Prices', fontsize=14) # Y-label
plt.xlabel('True Prices', fontsize=14) # X-label
plt.tick_params(labelsize=15) # Label size
plt.title('Training set', fontsize=14) # Title of first subplot


plt.subplot(1,2,2) # Subplot in row one column two
plt.scatter(x=y_holdout, y = gs.predict(X_holdout))
plt.ylabel('Predicted Prices', fontsize=14) # Y-label
plt.xlabel('True Prices', fontsize=14) # X-label
plt.tick_params(labelsize=15) # Label size
plt.title('Holdout set', fontsize=14) # Title of first subplot
plt.savefig("../Images/Train_Holdout_predictions.jpg",bbox_inches='tight',pad_inches=.5); # Save image

plt.show() # Remove text

In [None]:
print(f'R^2 Score for Training: {gs.score(X_train, y_train)}')
print(f'R^2 Score fro Holdout: {gs.score(X_holdout, y_holdout)}')

In [None]:
# X_test
X_test = test_features

# Predictions
predictions_on_real_test = gs.best_estimator_.predict(X_test)
predictions_on_real_test = np.expm1(predictions_on_real_test)
result_on_real_test = X_test.copy()

# Show results on df for predictions
for_csv = pd.concat([test['Id'], pd.DataFrame(predictions_on_real_test, columns=['SalePrice'])], axis=1)
for_csv.head()

In [None]:
# for_csv.to_csv('submission.csv', index=False)