In [1]:
# deployment libraries
import pickle

# Importing required framework libraries
import pandas as pd
import numpy as np

# Import required pipeline and transformation libraries
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Import processing and evaluation libraries
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.model_selection import GridSearchCV

# Regressors for Data
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# Regressors for Imputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor

# Show all the columns of a DataFrame when it is displayed (no truncation).
# Uses the public pd.set_option entry point rather than the incidental
# pd.pandas attribute.
pd.set_option('display.max_columns', None)

# Import features

In [2]:
# Directory holding the pickled train/test splits (same folder for both files)
import pathlib
path_to_read_model = 'C:\\Users\\koriv\\Desktop\\MachineLearning_DataScience\\Hands_On_Machine_Learning\\my_env_codebasics\\My_CODE\\ML-projects\\advanced_House_Price_Prediction-Regression\\probleam_study' #Path of current working Directory

# Training split: the pickle holds a (features, target) pair
train_features_file = path_to_read_model + '\\trainFeatures_list.pkl'
with open(train_features_file, 'rb') as f:
    X, y = pickle.load(f)

# Testing split: same layout as the training pickle
test_features_file = path_to_read_model + '\\testFeatures_list.pkl'
with open(test_features_file, 'rb') as f:
    X_test, y_test = pickle.load(f)

# Save Column Names of feature Training DataSet

In [3]:
# Persist the column names of the training features in the working directory.
feature_ColumnNames = X.columns
import pathlib
path_to_write_output=str(pathlib.Path.cwd()) #Path of current working Directory
# Join with pathlib instead of a hard-coded '\\' so the file is created inside
# the working directory on every OS (the resulting path is unchanged on Windows).
with open(pathlib.Path(path_to_write_output) / 'feature_ColumnNames.pkl', 'wb') as handle:
    pickle.dump(feature_ColumnNames, handle)

In [4]:
# Display the saved column names of the training features
feature_ColumnNames

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony'],
      dtype='object')

# Data PreProcessing, Transformation and Outlier Elimination

In [4]:
#Transformation function
def Transformations(featureDF, targetDF, location_stats_greater_than_10=None):
    """Clean the raw feature frame and keep the target aligned with it.

    Steps, in order:
      1. drop the 'society' column (feature selection by business knowledge:
         society is dependent on location);
      2. drop duplicated rows;
      3. convert 'total_sqft' to float (a "low - high" range becomes its mean,
         anything unparsable becomes NaN);
      4. normalise 'availability' ('ready to move' / 'immediate possession'
         become 'available_currently', "<day>-<month>" becomes the lower-cased
         month, everything else becomes NaN);
      5. convert 'size' to the float in front of its first space (NaN when
         that fails);
      6. replace every 'location' that is not in
         `location_stats_greater_than_10` with 'other'.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    featureDF : pandas.DataFrame
        Must contain the columns 'society', 'total_sqft', 'availability',
        'size' and 'location'.
    targetDF : pandas.Series (or DataFrame)
        Target values sharing the index labels of `featureDF`.
    location_stats_greater_than_10 : container supporting ``in``, optional
        Locations to keep as they are. When omitted, the first element of the
        pickled pre-processing object list is loaded from disk.
        (Presumably the locations with more than 10 listings -- confirm
        against the notebook that produced the pickle.)

    Returns
    -------
    (featureDF, targetDF) : the cleaned features and the target rows whose
        index labels survived the duplicate elimination.
    """
    if location_stats_greater_than_10 is None:
        # Load the PreProcessing Objects needed for Transformations.
        # NOTE(review): pickle.load can execute arbitrary code -- only load
        # files produced by a trusted notebook.
        path_to_read_model = 'C:\\Users\\koriv\\Desktop\\MachineLearning_DataScience\\Hands_On_Machine_Learning\\my_env_codebasics\\My_CODE\\ML-projects\\advanced_House_Price_Prediction-Regression\\preProcessingObjects_list' #Path of current working Directory
        with open(path_to_read_model + '\\preProcessingObjects_list.pkl', 'rb') as f:
            preProcessingObjects_list = pickle.load(f)
        location_stats_greater_than_10 = preProcessingObjects_list[0]

    # Feature selection + duplicate elimination on a fresh copy, so the
    # caller's frame is left untouched and the cell can be re-run safely.
    featureDF = featureDF.drop(columns=['society']).drop_duplicates().copy()

    # convert cat-col of total_sqft to float with null values also
    def convert_sqft_to_num(curr_tuple):
        try:
            tokens = curr_tuple.split('-')
            if len(tokens) == 2:
                return (float(tokens[0])+float(tokens[1]))/2
            return float(curr_tuple)
        except (AttributeError, TypeError, ValueError):
            # non-string input (e.g. NaN) or text that is not a number
            return np.nan

    featureDF['total_sqft'] = featureDF['total_sqft'].apply(convert_sqft_to_num)

    # convert cat-col of availability to make both Immediate Possession and
    # Ready To Move same and take only month, include null also.
    def convert_availability(curr_tuple):
        try:
            curr_tuple = curr_tuple.lower()
        except AttributeError:
            # non-string input (e.g. NaN)
            return np.nan
        if curr_tuple == 'ready to move' or curr_tuple == 'immediate possession':
            return 'available_currently'
        tokens = curr_tuple.split('-')
        if len(tokens) == 2:
            return tokens[1].strip()
        # Unrecognised format: return NaN explicitly instead of falling
        # through and implicitly returning None.
        return np.nan

    featureDF['availability'] = featureDF['availability'].apply(convert_availability)

    # convert cat-col of size to number by the first value with null value also
    def convert_size_to_num(curr_tuple):
        try:
            tokens = curr_tuple.split(' ')
            return float(tokens[0])
        except (AttributeError, TypeError, ValueError):
            return np.nan

    featureDF['size'] = featureDF['size'].apply(convert_size_to_num)

    # Convert location to reduce the unique values in the column.
    # Operates on the frame passed in, not on the notebook-level X.
    featureDF['location'] = featureDF['location'].apply(
        lambda x: x if (x in location_stats_greater_than_10) else 'other'
    )

    # Update y matrix based on X: rows were removed from the features, so keep
    # only the target rows whose index labels are still present.
    targetDF = targetDF.loc[featureDF.index]

    return featureDF, targetDF

In [5]:
# Outlier Removal function
def outlierRemoval(featureDF, targetDF, outlier_Dict=None):
    """Drop rows whose values fall outside the configured per-column limits.

    For every column in `outlier_Dict`, a row is kept when its value lies
    strictly between 'lower_limit' and 'upper_limit', or when the value is
    missing (NaN rows are kept). The limits of each column are printed as
    they are applied.

    Parameters
    ----------
    featureDF : pandas.DataFrame
        Features; must contain every column named in `outlier_Dict`.
    targetDF : pandas.Series (or DataFrame)
        Target values sharing the index labels of `featureDF`.
    outlier_Dict : dict, optional
        ``{column: {'lower_limit': ..., 'upper_limit': ...}}``. When omitted,
        the limits are read from the project's ``outlier_Dict.json`` file.

    Returns
    -------
    (featureDF, targetDF) : the filtered features and the matching target rows.
    """
    if outlier_Dict is None:
        import json
        outlier_Dict_DirPath = 'C:\\Users\\koriv\\Desktop\\MachineLearning_DataScience\\Hands_On_Machine_Learning\\my_env_codebasics\\My_CODE\\ML-projects\\advanced_House_Price_Prediction-Regression\\preProcessingObjects_list'
        with open(outlier_Dict_DirPath+'\\outlier_Dict.json') as json_file:
            outlier_Dict = json.load(json_file)

    for column, limits in outlier_Dict.items():
        lower_limit = limits['lower_limit']
        upper_limit = limits['upper_limit']
        print("column: %s, lower_limit: %s, upper_limit: %s"%(column, lower_limit, upper_limit))
        values = featureDF[column]
        # Missing values are not outliers here; they are left in the frame.
        keep = values.isna() | ((values > lower_limit) & (values < upper_limit))
        featureDF = featureDF[keep]

    # Update y matrix based on X: keep only the target rows whose index labels
    # are still present in the filtered features.
    targetDF = targetDF.loc[featureDF.index]

    return featureDF, targetDF

In [6]:
# Chain a list of preprocessing steps over a (features, target) pair
def preProcessing(featureDF, targetDF, function_list):
    """Apply each callable of `function_list` in order.

    Every callable receives the current (features, target) pair and must
    return the updated pair, which is handed to the next callable. The pair
    produced by the last callable is returned; with an empty list the inputs
    come back unchanged.
    """
    current = (featureDF, targetDF)
    for step in function_list:
        features, target = step(*current)
        current = (features, target)
    return current

# Run the transformations and then the outlier filter on the training split; X and y are rebound to the results.
X, y = preProcessing(X, y, [Transformations, outlierRemoval])

column: size, lower_limit: 0.5, upper_limit: 4.5
column: total_sqft, lower_limit: 215.0, upper_limit: 2575.0
column: bath, lower_limit: 0.5, upper_limit: 4.5
column: balcony, lower_limit: -0.5, upper_limit: 3.5


# Check the feature columns

In [7]:
# Feature groups, one list per kind of preprocessing applied later on
## Numerical columns in the DataFrame
num_cols = [
    'size',
    'total_sqft',
    'bath',
    'balcony',
]
## Columns to be ordinal encoded
col_catO = [
    'area_type',
]
## Columns to be one-hot encoded
col_catN = [
    'availability',
    'location',
]

# Ordinal Encoding

In [8]:
# Ordinal Encoder
## Ordinal Encoder values for area_type
area_type_unique =['Plot  Area', 'Built-up  Area', 'Super built-up  Area', 'Carpet  Area'] #In ascending order
## Pipeline for imputer and Ordinal Encoder
## add_indicator must stay False: with True the imputer appends a
## missing-indicator column whenever area_type has missing values at fit time,
## and the OrdinalEncoder below (one category list == one input column) then
## rejects the extra column.
pp_catO = Pipeline([
    ('col_catO', SimpleImputer(strategy='constant', add_indicator=False, fill_value='Plot  Area')),
    ('catO', OrdinalEncoder(categories=[area_type_unique]))
])

# ColumnTransformer for Encoding: encode the ordinal column, pass the rest through untouched
ct_O = ColumnTransformer([
    ('pp_catO', pp_catO, col_catO),
], remainder='passthrough')

In [9]:
ct_O.fit(X)
# ColumnTransformer puts the transformed columns first and appends the
# passthrough remainder (in its original order) afterwards. Write the result
# back by column name so the assignment stays correct even if the ordinal
# column is not the first column of X.
ordinal_output_cols = col_catO + [col for col in X.columns if col not in col_catO]
X[ordinal_output_cols] = ct_O.transform(X)

# Pipelines for Nominal Encoding using OHE

In [10]:
# One Hot Encoder
## Nominal columns: fill missing entries with the literal 'missing', then one-hot encode.
## Categories unseen during fit are ignored at transform time (handle_unknown='ignore').
nominal_imputer = SimpleImputer(strategy='constant', fill_value='missing', add_indicator=False)
nominal_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
pp_catN = Pipeline([
    ('col_catN', nominal_imputer),
    ('catN', nominal_encoder),
])

# Imputing and Scaling for Numerical columns [including Ordinal Columns]

In [11]:
# Treat the ordinal-encoded columns as numeric from here on.
# Only the columns not already present are appended, so re-running this cell
# does not add 'area_type' a second time.
num_cols = num_cols + [col for col in col_catO if col not in num_cols]
num_cols

['size', 'total_sqft', 'bath', 'balcony', 'area_type']

In [12]:
# Pipeline that rescales its input columns to the [0, 1] range
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
pp_num = Pipeline(steps=[('scaler', min_max_scaler)])

# Column Transformer for Nominal Encoding and Imputing & Scaling for Numerical Columns[including Ordinal Columns]

In [13]:
# Nominal columns go through the one-hot pipeline, numeric columns through the
# scaler; any column in neither list is dropped.
ct_ohe_sca = ColumnTransformer(
    transformers=[
        ('pp_catN', pp_catN, col_catN),
        ('pp_num', pp_num, num_cols),
    ],
    remainder='drop',
)

# Model Estimations

In [14]:
#'imputer__estimator':[BayesianRidge(),ExtraTreesRegressor(n_estimators=10, random_state=0),KNeighborsRegressor(n_neighbors=15)]
# Search space per candidate model. Keys prefixed with 'model__' / 'imputer__'
# address the pipeline steps of the same name.
random_forest_params = {
    'model__n_estimators': [100, 300],
    'model__max_depth': [5, 9, 13],
    'model__min_samples_split': [2, 4, 8],
    'imputer__estimator': [BayesianRidge()],
}
lasso_params = {
    'model__alpha': [0.1, 1, 10],
    'model__max_iter': [100, 1000],
    'imputer__estimator': [BayesianRidge()],
}

# Python dictionary mapping each model instance to its parameter grid
grid = {
    RandomForestRegressor(random_state=0): random_forest_params,
    Lasso(tol=0.0001, random_state=0): lasso_params,
}

In [15]:
# Per-model cross-validation summaries; they are concatenated once after the
# loop instead of growing a DataFrame inside it.
cv_frames = []
best_algos = {}

# Iterate over the model/parameter dictionary and run a grid search for each model
for curr_model, model_params in grid.items():
    print(">>Model: ",curr_model)
    print("parameters: ",model_params)
    print('\n')

    # Pipeline with the data transformations, the imputer and the model
    pipe = Pipeline([
        ('ct_ohe_sca', ct_ohe_sca),
        ('imputer', IterativeImputer()),
        ('model', curr_model)
    ])

    ## 5-fold grid search over the current model's parameters, scored by
    ## negative RMSE (higher is better)
    cv_curr =  GridSearchCV(pipe, model_params, cv=5, return_train_score=False, scoring='neg_root_mean_squared_error', verbose=3, n_jobs=-1)
    ## Fit the data to the defined grid search
    cv_curr.fit(X, y)

    ## Name of the model in use, taken from its repr (text before the first '(')
    algo_name = str(curr_model).split('(')[0]

    ## Keep only 'params' and 'mean_test_score' and tag the rows with the model
    ## name. .assign returns a new frame, so no column is set on a slice.
    cv_summary = (
        pd.DataFrame(cv_curr.cv_results_)
        .loc[:, ['params', 'mean_test_score']]
        .assign(algo=algo_name)
    )
    cv_frames.append(cv_summary)

    ## Remember the refitted best pipeline and its mean cross-validated score
    best_algos[algo_name] = {
        'best_estimator': cv_curr.best_estimator_,
        'best_mean_test_score': cv_curr.best_score_,
    }
    print('-'*50)

# One concatenation for all models; each model's rows keep their own index.
full_df = pd.concat(cv_frames) if cv_frames else pd.DataFrame()

>>Model:  RandomForestRegressor(random_state=0)
parameters:  {'model__n_estimators': [100, 300], 'model__max_depth': [5, 9, 13], 'model__min_samples_split': [2, 4, 8], 'imputer__estimator': [BayesianRidge()]}


Fitting 5 folds for each of 18 candidates, totalling 90 fits
--------------------------------------------------
>>Model:  Lasso(random_state=0)
parameters:  {'model__alpha': [0.1, 1, 10], 'model__max_iter': [100, 1000], 'imputer__estimator': [BayesianRidge()]}


Fitting 5 folds for each of 6 candidates, totalling 30 fits
--------------------------------------------------


In [69]:
# pipelne with the Data Transformations and model
#pipe = Pipeline([
#    ('ct_ohe_sca', ct_ohe_sca),
#    ('imputer', IterativeImputer()),
#    ('model', RandomForestRegressor(random_state = 0))
#])
#
## parameters of the model
#params = {
#    'model__n_estimators':[18, 36, 216],
#    'model__max_depth':[5, 9, 13],
#    'model__min_samples_split':[2,4,6,8],
#    'imputer__estimator':[KNeighborsRegressor(n_neighbors=15)]
#}
#
## CrossValidation
#gs = GridSearchCV(pipe, param_grid=params, cv=5, return_train_score=False, scoring='neg_root_mean_squared_error', verbose=3,n_jobs=-1)

In [70]:
# fit the data
#gs.fit(X, y)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ct_ohe_sca',
                                        ColumnTransformer(transformers=[('pp_catN',
                                                                         Pipeline(steps=[('col_catN',
                                                                                          SimpleImputer(fill_value='missing',
                                                                                                        strategy='constant')),
                                                                                         ('catN',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         ['availability',
                                                 

In [71]:
#pd.pandas.set_option('display.max_columns', None)
#pd.DataFrame(gs.cv_results_).loc[:, ['params', 'mean_test_score']].sort_values('mean_test_score', ascending=False)

Unnamed: 0,params,mean_test_score
15,{'imputer__estimator': KNeighborsRegressor(n_n...,-44.468456
23,{'imputer__estimator': KNeighborsRegressor(n_n...,-44.493658
14,{'imputer__estimator': KNeighborsRegressor(n_n...,-44.652725
1,{'imputer__estimator': KNeighborsRegressor(n_n...,-44.674995
3,{'imputer__estimator': KNeighborsRegressor(n_n...,-44.678507
22,{'imputer__estimator': KNeighborsRegressor(n_n...,-44.704918
7,{'imputer__estimator': KNeighborsRegressor(n_n...,-44.716374
5,{'imputer__estimator': KNeighborsRegressor(n_n...,-44.720507
13,{'imputer__estimator': KNeighborsRegressor(n_n...,-44.750041
0,{'imputer__estimator': KNeighborsRegressor(n_n...,-44.762326


In [18]:
# Check for the best Regressor Model: candidates sorted by mean test score,
# best (highest, i.e. least negative) first.
# Uses the public pd.set_option entry point rather than the incidental
# pd.pandas attribute.
pd.set_option('display.max_columns', None)
full_df.sort_values('mean_test_score', ascending=False)

Unnamed: 0,params,mean_test_score,algo
17,"{'imputer__estimator': BayesianRidge(), 'model...",-44.480102,RandomForestRegressor
11,"{'imputer__estimator': BayesianRidge(), 'model...",-44.554485,RandomForestRegressor
16,"{'imputer__estimator': BayesianRidge(), 'model...",-44.578146,RandomForestRegressor
10,"{'imputer__estimator': BayesianRidge(), 'model...",-44.584909,RandomForestRegressor
2,"{'imputer__estimator': BayesianRidge(), 'model...",-44.738451,RandomForestRegressor
0,"{'imputer__estimator': BayesianRidge(), 'model...",-44.752784,RandomForestRegressor
4,"{'imputer__estimator': BayesianRidge(), 'model...",-44.78042,RandomForestRegressor
9,"{'imputer__estimator': BayesianRidge(), 'model...",-44.895597,RandomForestRegressor
5,"{'imputer__estimator': BayesianRidge(), 'model...",-44.899281,RandomForestRegressor
3,"{'imputer__estimator': BayesianRidge(), 'model...",-44.904252,RandomForestRegressor


In [17]:
# View the best Regressor Model for each algorithm and their cross-validated scores on the Training Data
best_algos

{'RandomForestRegressor': {'best_estimator': Pipeline(steps=[('ct_ohe_sca',
                   ColumnTransformer(transformers=[('pp_catN',
                                                    Pipeline(steps=[('col_catN',
                                                                     SimpleImputer(fill_value='missing',
                                                                                   strategy='constant')),
                                                                    ('catN',
                                                                     OneHotEncoder(handle_unknown='ignore',
                                                                                   sparse=False))]),
                                                    ['availability', 'location']),
                                                   ('pp_num',
                                                    Pipeline(steps=[('scaler',
                                                           

In [19]:
# Show the best RandomForestRegressor pipeline found by the search and its score
rf_best = best_algos['RandomForestRegressor']
print(">> best estimator: ",rf_best['best_estimator'])
print(">> best_score: ",rf_best['best_mean_test_score'])

>> best estimator:  Pipeline(steps=[('ct_ohe_sca',
                 ColumnTransformer(transformers=[('pp_catN',
                                                  Pipeline(steps=[('col_catN',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('catN',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['availability', 'location']),
                                                 ('pp_num',
                                                  Pipeline(steps=[('scaler',
                                                                   MinMaxScaler())]),
                  

# Saving the Ordinal Column Transformer and best estimators

In [20]:
# Bundle the fitted ordinal ColumnTransformer with the per-algorithm best estimators so they are pickled together
estimators = [ct_O, best_algos]

In [21]:
# Pickle the estimators bundle into the working directory.
import pathlib
path_to_write_output=str(pathlib.Path.cwd()) #Path of current working Directory
# Join with pathlib instead of a hard-coded '\\' so the file is created inside
# the working directory on every OS (the resulting path is unchanged on Windows).
with open(pathlib.Path(path_to_write_output) / 'estimators.pkl', 'wb') as handle:
    pickle.dump(estimators, handle)