In [8]:
import os
import json
import argparse

# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder
from collections import Counter

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

#for timestamp column
from datetime import date

# matplotlib and seaborn for plotting
%matplotlib inline
import matplotlib.pyplot as plt  # Matlab-style plotting
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')

# hide warnings
import warnings
def ignore_warn(*args, **kwargs): pass
warnings.warn = ignore_warn

# machine learning
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

# Statistics
from scipy import stats
from scipy.stats import norm, skew #for some statistics

# Mathematics
from math import log

# Pandas options
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_columns',500 )
pd.set_option('display.max_rows',100 )
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points
# pd.reset_option("display.max_rows")

# Save and Load Machine Learning Models
import pickle

In [2]:
# some auxiliary functions
def detect_outliers(df, n, features):
    """
    Return the index labels of the rows that are outliers in more than ``n``
    of the given columns, according to the Tukey method.

    A value is an outlier for a column when it lies below ``Q1 - 1.5 * IQR``
    or above ``Q3 + 1.5 * IQR``, where Q1/Q3 are the 25th/75th percentiles of
    that column and ``IQR = Q3 - Q1``.

    Args:
        df (dataframe): observations to inspect.
        n (int): a row is returned only when it is an outlier in strictly
            more than ``n`` of the inspected columns.
        features (iterable): names of the columns of ``df`` to inspect.

    Returns:
        list: index labels of ``df``, ordered by first appearance.
    """
    outlier_counts = Counter()

    # iterate over features(columns)
    for col in features:
        # nanpercentile skips missing values. With np.percentile a single NaN
        # makes both quartiles NaN, every comparison below evaluates to False
        # and the column silently contributes no outliers at all.
        q1, q3 = np.nanpercentile(df[col], [25, 75])

        # outlier step (1.5 times the interquartile range)
        outlier_step = 1.5 * (q3 - q1)

        # rows outside the Tukey fences for this column (NaN cells compare False)
        is_outlier = (df[col] < q1 - outlier_step) | (df[col] > q3 + outlier_step)

        # count one hit per flagged row
        outlier_counts.update(df[is_outlier].index)

    # keep the observations flagged in more than n columns
    return [idx for idx, hits in outlier_counts.items() if hits > n]
    
def missing_values_table(df):
    """
    Summarise the missing values of ``df`` column by column.

    Prints how many columns the frame has and how many of them contain
    missing values, then returns a table indexed by column name with the
    columns 'Missing Values' (count) and '% of Total Values' (share of rows,
    rounded to one decimal). Only columns whose share is non-zero are kept,
    sorted from the highest share to the lowest.
    """
    # Count of missing cells per column and the same figure as a percentage
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / len(df)

    # Put both series side by side and give the columns readable names
    summary = pd.concat([null_counts, null_share], axis=1)
    summary = summary.rename(columns = {0 : 'Missing Values', 1 : '% of Total Values'})

    # Keep the affected columns only, worst first
    affected = summary.iloc[:, 1] != 0
    summary = summary[affected].sort_values('% of Total Values', ascending=False).round(1)

    # Short textual report
    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
           "There are " + str(summary.shape[0]) +
           " columns that have missing values.")

    return summary
    
def plot_feature_importances(df):
    """
    Draw a horizontal bar chart of the 15 largest normalized importances.

    Works with any importance measure for which a higher value means a more
    important feature.

    Args:
        df (dataframe): one row per feature, with the feature names in a
            column called `feature` and the scores in a column called
            `importance`.

    Returns:
        df (dataframe): the rows sorted from highest to lowest importance,
        re-indexed with `reset_index()` (the previous index is kept as a
        column) and extended with an `importance_normalized` column that
        adds up to one.
    """
    # Rank the features, most important first
    df = df.sort_values('importance', ascending = False).reset_index()

    # Rescale the scores so that they add up to one
    total_importance = df['importance'].sum()
    df['importance_normalized'] = df['importance'] / total_importance

    # Bar positions run in reverse so the most important feature ends up on top
    bar_positions = list(reversed(list(df.index[:15])))
    top_rows = df.head(15)

    plt.figure(figsize = (10, 6))
    ax = plt.subplot()
    ax.barh(bar_positions,
            top_rows['importance_normalized'],
            align = 'center', edgecolor = 'k')

    # One tick per bar, labelled with the feature name
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_rows['feature'])

    # Plot labeling
    plt.xlabel('Normalized Importance')
    plt.title('Feature Importances')
    plt.show()
    return df


In [42]:
# def load_data(csv_name, load_from_disk = False):
# #     if load_from_disk:
# #         return pd.read_pickle(csv_name + '.pkl')
    
#     data = pd.read_csv(csv_name + '.csv')
# #     data.to_pickle(csv_name + ".pkl")
# #     return data

In [3]:
def load_data(csv_name, load_from_disk = False, data_dir = 'C:/Users/User/Kaggle/Final_Project/'):
    """
    Read ``<data_dir>/<csv_name>.csv`` into a DataFrame.

    Args:
        csv_name (str): file name without the ``.csv`` extension.
        load_from_disk (bool): currently unused (the pickle cache it used to
            control is disabled); kept so existing calls keep working.
        data_dir (str): directory that contains the CSV file. Defaults to the
            location that was previously hard-coded, so existing calls are
            unaffected; pass another directory to run the notebook elsewhere.

    Returns:
        dataframe: the parsed CSV.
    """
    csv_path = os.path.join(data_dir, csv_name + '.csv')
    return pd.read_csv(csv_path)

In [4]:
class descriptive_statistics:
    """
    Small helper around a DataFrame and the name of its label column.

    Attributes:
        df (dataframe): the data to describe.
        label (str): name of the target column used by
            `most_correlated_features`.
    """

    def __init__(self, df, label):
        self.df = df
        self.label = label

    def _numeric_frame(self):
        """Return only the numeric and boolean columns of ``self.df``."""
        # Selecting the columns explicitly keeps corr() working on frames that
        # also hold string columns: recent pandas releases raise on them
        # instead of silently dropping them.
        return self.df.select_dtypes(include=['number', 'bool'])

    def corrmat(self):
        """Return the pairwise correlation matrix of the numeric columns."""
        return self._numeric_frame().corr()

    def most_correlated_features(self, print_heatmap = True ,tol=0.5):
        """
        Return the names of the columns whose absolute correlation with the
        label is greater than ``tol`` (the label itself is included).

        Args:
            print_heatmap (bool): when True, also draw an annotated heatmap of
                the correlations between the selected columns.
            tol (float): absolute-correlation threshold (exclusive).

        Returns:
            ndarray: the selected column names.
        """
        corrmat = self.corrmat()
        top_corr_features = corrmat.index[abs(corrmat[self.label]) > tol]
        if print_heatmap:
            plt.figure(figsize=(10,10))
            sns.heatmap(self.df[top_corr_features].corr(),annot=True,cmap="RdYlGn")
        return top_corr_features.values

    def mean(self, feature):
        """Return ``np.mean`` of the column ``feature``."""
        return np.mean(self.df[feature])

    def median(self, feature):
        """Return ``np.median`` of the column ``feature``."""
        return np.median(self.df[feature])

    def standard_deviation(self, feature):
        """Return ``np.std`` of the column ``feature``."""
        return np.std(self.df[feature])

In [5]:
def data_preprocessing(data, is_train_dataset = False):
    """
    Clean and encode a raw house-prices frame and return the processed copy.

    Steps, in order:
      1. training data only: drop the rows that `detect_outliers` flags in
         more than 2 of the features most correlated with `SalePrice`;
      2. fill missing values ("None" for absent-feature categoricals, 0 for
         the matching numeric columns, the neighbourhood median for
         `LotFrontage`, the column mode for a few categoricals, "Typ" for
         `Functional`);
      3. drop the `Utilities` column;
      4. turn `MSSubClass`, `OverallCond`, `YrSold` and `MoSold` into strings;
      5. label-encode a fixed list of columns, one fresh `LabelEncoder` each;
      6. add `TotalSF` = basement + first floor + second floor area;
      7. one-hot encode what is still categorical with `pd.get_dummies`.

    Args:
        data (dataframe): raw data; it is not modified.
        is_train_dataset (bool): enables the outlier removal of step 1, which
            needs the `SalePrice` column.

    Returns:
        dataframe: the processed data.

    NOTE(review): the encoders and the dummy columns are derived from the
    frame passed in, so two different frames (e.g. train and test) can end up
    with different codes and different columns.
    """
    # Work on a copy: previously the caller's frame was modified in place when
    # is_train_dataset was False, but left untouched when it was True.
    data = data.copy()

    if is_train_dataset:
        label_stats = descriptive_statistics(df=data, label='SalePrice')
        correlated = label_stats.most_correlated_features(print_heatmap = False)
        outliers_to_drop = detect_outliers(data, 2, correlated)
        data = data.drop(outliers_to_drop, axis = 0).reset_index(drop=True)

    # Categorical columns where a missing value is filled with the literal "None"
    none_fill_cols = ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                      'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                      'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                      'MasVnrType')
    for col in none_fill_cols:
        data[col] = data[col].fillna('None')

    # Numeric columns where a missing value is filled with 0
    zero_fill_cols = ('GarageYrBlt', 'GarageArea', 'GarageCars',
                      'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                      'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea')
    for col in zero_fill_cols:
        data[col] = data[col].fillna(0)

    # Missing lot frontage is filled with the median of the same neighbourhood
    data["LotFrontage"] = data.groupby("Neighborhood")["LotFrontage"].transform(
        lambda x: x.fillna(x.median()))

    # Columns filled with their most frequent value
    mode_fill_cols = ('MSZoning', 'Electrical', 'KitchenQual',
                      'Exterior1st', 'Exterior2nd', 'SaleType')
    for col in mode_fill_cols:
        data[col] = data[col].fillna(data[col].mode()[0])

    data = data.drop(['Utilities'], axis=1)
    data["Functional"] = data["Functional"].fillna("Typ")
    data['MSSubClass'] = data['MSSubClass'].fillna("None")

    # MSSubClass (the building class), the overall condition and the sale
    # year/month are treated as categories rather than numbers
    data['MSSubClass'] = data['MSSubClass'].apply(str)
    for col in ('OverallCond', 'YrSold', 'MoSold'):
        data[col] = data[col].astype(str)

    cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
            'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
            'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
            'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
            'YrSold', 'MoSold')
    # apply a fresh LabelEncoder to each of these categorical features
    for c in cols:
        data[c] = LabelEncoder().fit_transform(list(data[c].values))

    # Adding total sqfootage feature
    data['TotalSF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']

    # One-hot encode the remaining categorical columns
    data = pd.get_dummies(data)
    return data

In [6]:
# for training data
def label_distribution(data, label):
    """
    Plot the distribution of ``data[label]`` against a fitted normal curve,
    print the fitted parameters and show a Q-Q (probability) plot.

    Args:
        data (dataframe): frame that contains the column ``label``.
        label (str): name of the column to inspect.
    """
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # it is kept here so the figure stays exactly as before.
    sns.distplot(data[label] , fit=norm)

    # Get the fitted parameters used by the function
    (mu, sigma) = norm.fit(data[label])
    print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

    # Raw string: "\m" and "\s" are not valid Python escape sequences, so the
    # plain literal triggers an invalid-escape warning on newer interpreters.
    # The resulting text (a LaTeX label for matplotlib) is unchanged.
    plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
                loc='best')
    plt.ylabel('Frequency')
    plt.title(label + ' distribution')

    # Q-Q plot on a new figure
    plt.figure()
    stats.probplot(data[label], plot=plt)
    plt.show()

In [7]:
def log_transformation(data, label, show_graph = False):
    """
    Replace ``data[label]`` in place with ``log(1 + x)`` of its values.

    Args:
        data (dataframe): frame to modify; the column is overwritten.
        label (str): name of the column to transform.
        show_graph (bool): when True, plot the transformed distribution with
            `label_distribution`.

    Returns:
        None. The result is written back into ``data``.
    """
    # np.log1p applies log(1+x) to all elements of the column
    data[label] = np.log1p(data[label])
    if show_graph:
        # Same plots as before: this branch used to hold a verbatim copy of
        # label_distribution's body.
        label_distribution(data, label)

In [48]:
#Validation function
n_folds = 5
def rmsle_cv(model, X, Y):
    """
    Cross-validate ``model`` and return the RMSE of every fold.

    Args:
        model: estimator accepted by `cross_val_score`.
        X (dataframe): features; passed on as ``X.values``.
        Y: target values.

    Returns:
        ndarray: one root-mean-squared-error value per fold (``n_folds`` folds).

    NOTE(review): the name suggests an RMSLE, which presumably relies on Y
    already being log-transformed by the caller.
    """
    # Pass the splitter itself. The previous code called .get_n_splits(), which
    # returns the integer n_folds, so cross_val_score received cv=5 and the
    # shuffle / random_state settings were silently discarded.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X.values, Y, scoring="neg_mean_squared_error", cv = kf))
    return rmse

In [49]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """
    Ensemble regressor that predicts the plain average of its base models.

    Args:
        models: iterable of unfitted estimators; they are cloned on `fit`, so
            the originals are never trained.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every base model, train each clone on (X, y), return self."""
        # Fresh copies keep the estimators passed to the constructor untouched
        self.models_ = list(map(clone, self.models))

        for fitted_model in self.models_:
            fitted_model.fit(X, y)

        return self

    def predict(self, X):
        """Return the mean, across the fitted clones, of their predictions."""
        # One column per model, one row per sample
        stacked = np.column_stack([fitted_model.predict(X) for fitted_model in self.models_])
        return stacked.mean(axis=1)

In [156]:
# def main(args, load_from_disk=False):
#     train_data = load_data(csv_name='train', load_from_disk=False)
#     print('********** Preprocessing Phase **********')
#     train_data = data_preprocessing(data=train_data,is_train_dataset=True)
#     print('********** Label Distribution **********')
#     label_distribution(train_data, label='SalePrice')
#     print('********** Log Transformation **********')
#     log_transformation(data=train_data, label='SalePrice')
    
#     X = train_data.drop(columns=["SalePrice"])
#     Y = train_data["SalePrice"]
    
#     test_size = 0.33
#     seed = 7
#     X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
#     # Fit the model on 33%
# #     ddxk = X_test.head(1)
# #     ddxk_value = Y_test.head(1)
    
#     lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
#     ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
#     KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
#     GBoost = GradientBoostingRegressor()
#     model_xgb = xgb.XGBRegressor()
#     model_lgb = lgb.LGBMRegressor(objective='regression')
    
#     print("\nLasso score: {:.4f} ({:.4f})\n".format(rmsle_cv(lasso, X_train, Y_train).mean(), rmsle_cv(lasso, X_train, Y_train).std()))
#     print("ElasticNet score: {:.4f} ({:.4f})\n".format(rmsle_cv(ENet, X_train, Y_train).mean(), rmsle_cv( ENet , X_train, Y_train).std()))
#     print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(rmsle_cv(KRR, X_train, Y_train).mean(), rmsle_cv( KRR , X_train, Y_train).std()))
#     print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(rmsle_cv(GBoost, X_train, Y_train).mean(), rmsle_cv( GBoost , X_train, Y_train).std()))
#     print("XGboost score: {:.4f} ({:.4f})\n".format(rmsle_cv(model_xgb, X_train, Y_train).mean(), rmsle_cv( model_xgb , X_train, Y_train).std()))
#     print("LGBM score: {:.4f} ({:.4f})\n" .format(rmsle_cv(model_lgb, X_train, Y_train).mean(), rmsle_cv( model_lgb , X_train, Y_train).std()))

#     averaged_models = AveragingModels(models = (lasso , ENet , GBoost, model_xgb, model_lgb))
#     print("Averaged base models score: {:.4f} ({:.4f})\n".format(rmsle_cv(averaged_models, X_train, Y_train).mean(), rmsle_cv(averaged_models, X_train, Y_train).std()))
    
    
# #     test_data = load_data(csv_name='test', load_from_disk=False)
# #     test_data = data_preprocessing(data=test_data,is_train_dataset=False)
# #     test_data.head()
# #     missing_cols = set(X.columns ) - set(test_data.columns )
# #     # Add a missing column in test set with default value equal to 0
# #     for c in missing_cols:
# #         test_data[c] = 0
# #     # Ensure the order of column in the test set is in the same order than in train set
# #     test_data = X[X.columns]
# # #     y_pred = boost.predict(test_X)[0]
# # #     print("predicted duration is %f days" % y_pred)
# # #     print("actual duration is %f days" % test_Y)
    
# #     print("--------------------------------")
# #     print("--------------------------------")
# #     print("--------------------------------")
# #     print(cross_val_score(averaged_models, X_train, Y_train, cv=5, verbose=True))
    
    
#     averaged_models.fit(X_train, Y_train)
#     averaged_train_pred = averaged_models.predict(X_train)
#     y_pred = np.expm1(averaged_models.predict(X_test))#[0]
#     print("\nMy final model is averaged model\n")
#     print("RMSE : {:.4f}\n".format(rmsle(Y_train, averaged_train_pred)))
    
#     X_test.loc[:,'Outcome'] = y_pred
#     print("training has been completed succesfully !!!!")
#     print("--------------------------------------------")
#     X_test["Actual_Values"] = np.expm1(Y_test)
# #     print("predicted sale price value is %f " % y_pred)
#     print("Prediction")
#     print(X_test[['Actual_Values','Outcome']])
    
# #     print("actual sale price is %f " % np.expm1(ddxk_value))
    
    
# #     filename = 'finalized_model.sav'
# #     pickle.dump(averaged_models, open(filename, 'wb'))
    
    
# #     # some time later...

# #     # load the model from disk
# #     loaded_model = pickle.load(open(filename, 'rb'))
# #     result = loaded_model.score(X_test, Y_test)
# #     print("Results on testing data set !!!\n")
# #     print("Score on test data set : {:.4f}".format(result))

# main(args=None, load_from_disk=True)

In [50]:
def init():
    """
    Train the averaged model on the training CSV and persist it.

    The function loads and preprocesses the 'train' data, log-transforms
    `SalePrice`, fits an `AveragingModels` ensemble (Lasso, ElasticNet,
    gradient boosting, XGBoost, LightGBM) and writes two files to the current
    working directory:

      * 'col.npy'             - names of the feature columns, in training order;
      * 'finalized_model.sav' - the pickled fitted ensemble.

    Returns:
        AveragingModels: the fitted ensemble.
    """
    lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
    ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
    GBoost = GradientBoostingRegressor()
    model_xgb = xgb.XGBRegressor()
    model_lgb = lgb.LGBMRegressor(objective='regression')
    averaged_models = AveragingModels(models = (lasso , ENet , GBoost, model_xgb, model_lgb))

    train_data = load_data(csv_name='train', load_from_disk=False)
    train_data = data_preprocessing(data=train_data,is_train_dataset=True)
    log_transformation(data=train_data, label='SalePrice', show_graph=False)

    X = train_data.drop(columns=["SalePrice"])
    Y = train_data["SalePrice"]
    averaged_models.fit(X, Y)

    # Save the names as a plain string array. Saving the pandas Index directly
    # produces a pickled object array, which np.load refuses to read unless
    # allow_pickle=True is given.
    np.save('col.npy', np.asarray(X.columns, dtype=str))
    print("training has been completed succesfully !!!!")
    print("--------------------------------------------")

    filename = 'finalized_model.sav'
    # The context manager closes the file handle even if pickling fails
    with open(filename, 'wb') as model_file:
        pickle.dump(averaged_models, model_file)
    return averaged_models

In [51]:
# Here supposed that test_X is a dataframe with 1 row, for which we must make a prediction
def prediction(model, test_X, columns_path = 'col.npy'):
    """
    Predict the sale price for the first row of ``test_X``.

    The frame is aligned to the training columns stored in ``columns_path``:
    columns missing from ``test_X`` are added with the value 0, extra columns
    are dropped, and the training column order is applied. ``test_X`` itself
    is not modified.

    Args:
        model: fitted estimator exposing ``predict``.
        test_X (dataframe): observation(s) to score; only the prediction for
            the first row is returned.
        columns_path (str): ``.npy`` file holding the training column names.

    Returns:
        float: ``expm1`` of the model output, i.e. the inverse of the
        ``log1p`` transformation applied to the label.
    """
    # allow_pickle=True is needed when the file holds an object array (what
    # np.save produces for a pandas Index). Only load files you created
    # yourself: unpickling untrusted data can execute arbitrary code.
    train_col = np.load(columns_path, allow_pickle=True)

    # reindex adds the missing columns (filled with 0) and applies the training
    # column order in one step, on a new frame
    aligned = test_X.reindex(columns=train_col, fill_value=0)
    y_pred = model.predict(aligned)[0]
    return np.expm1(y_pred)

In [78]:
# train_data = load_data(csv_name='train', load_from_disk=False)
# ddd = descriptive_statistics(df = train_data, label='SalePrice')
# mcf = ddd.most_correlated_features(print_heatmap=False)
# print(mcf)

In [58]:
# Example observation used to try out the model: one value for each of the
# ten input features. The variable is deliberately NOT called `dict`:
# that name shadows the builtin and breaks every later `dict(...)` call.
sample_features = {
    'OverallQual' : 5 ,
    'YearBuilt' : 1961,
    'YearRemodAdd' : 1961,
    'TotalBsmtSF' : 882.000 ,
    '1stFlrSF' : 896,
    'GrLivArea' : 896,
    'FullBath' : 1,
    'TotRmsAbvGrd' : 5,
    'GarageCars' : 1.000,
    'GarageArea' : 730.000
}

# /api/foo/?OverallQual=5&YearBuilt=1961&YearRemodAdd=1961&TotalBsmtSF=882.000&1stFlrSF=896&GrLivArea=896&FullBath=1&TotRmsAbvGrd=5&GarageCars=1.000&GarageArea=730.000
# http://localhost:5000/api/foo/?OverallQual=5&YearBuilt=1961&YearRemodAdd=1961&TotalBsmtSF=882&1stFlrSF=896&GrLivArea=896&FullBath=1&TotRmsAbvGrd=5&GarageCars=1&GarageArea=730
# Single-row frame (index 0) built from the values above
df = pd.DataFrame(sample_features, index=[0])

In [79]:
# test_data = load_data(csv_name='test', load_from_disk=False)
# # test_data.head(1)
# obs = test_data[['OverallQual', 'YearBuilt', 'YearRemodAdd' ,'TotalBsmtSF' ,'1stFlrSF', 'GrLivArea' ,'FullBath' ,'TotRmsAbvGrd' ,'GarageCars', 'GarageArea']].head(1)

In [60]:
# Train the averaged model (init() also writes 'col.npy' and
# 'finalized_model.sav'), then score the single-row sample frame `df`.
my_model = init()
prediction(my_model, df)

training has been completed succesfully !!!!
--------------------------------------------


65760.92965667119

In [81]:
# # train_col = np.load('col.npy')
# averaged_models = pickle.load(open('finalized_model.sav', 'rb'))
# prediction(averaged_models, df)

In [84]:
# def parse_args(*argument_array):
#     parser = argparse.ArgumentParser()
    
#     parser.add_argument('--load_model', '-l'
#                         ,help='load model from disk')
#     parser.add_argument('--load_from_disk', '-ld'
#                         , help='load data from disk')
#     args = parser.parse_args(*argument_array)
#     return args

In [22]:
# def main(args):
#     if args.load_model:
#         averaged_models = pickle.load(open('finalized_model.sav', 'rb'))
#     else:
#         averaged_models = init()
#         pickle.dump(averaged_models, open('finalized_model.sav', 'wb'))
#     while True:
#         predict(averaged_models,args)
# if __name__ == '__main__':
#     args = parse_args()
#     main(args)

In [86]:
# # from sklearn.externals import joblib
# # boost = joblib.load('finalized_model.sav')
# loaded_model = pickle.load(open('C:/Users/User/Kaggle/Final_Project/finalized_model.sav', 'rb'))

In [14]:
# train_col = np.load('col.npy')
# print(train_col)
# Training column names, pasted from the printed array above. The commas are
# required: adjacent string literals without them are concatenated by Python
# into ONE long string, giving a single-element list.
train_col = ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape',
 'LandSlope', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
 'MasVnrArea', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
 'TotalBsmtSF', 'HeatingQC', 'CentralAir', '1stFlrSF', '2ndFlrSF',
 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageYrBlt', 'GarageFinish',
 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
 'PoolArea', 'PoolQC', 'Fence', 'MiscVal', 'MoSold', 'YrSold', 'TotalSF',
 'MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL',
 'MSZoning_RM', 'LandContour_Bnk', 'LandContour_HLS', 'LandContour_Low',
 'LandContour_Lvl', 'LotConfig_Corner', 'LotConfig_CulDSac', 'LotConfig_FR2',
 'LotConfig_FR3', 'LotConfig_Inside', 'Neighborhood_Blmngtn',
 'Neighborhood_Blueste', 'Neighborhood_BrDale', 'Neighborhood_BrkSide',
 'Neighborhood_ClearCr', 'Neighborhood_CollgCr', 'Neighborhood_Crawfor',
 'Neighborhood_Edwards', 'Neighborhood_Gilbert', 'Neighborhood_IDOTRR',
 'Neighborhood_MeadowV', 'Neighborhood_Mitchel', 'Neighborhood_NAmes',
 'Neighborhood_NPkVill', 'Neighborhood_NWAmes', 'Neighborhood_NoRidge',
 'Neighborhood_NridgHt', 'Neighborhood_OldTown', 'Neighborhood_SWISU',
 'Neighborhood_Sawyer', 'Neighborhood_SawyerW', 'Neighborhood_Somerst',
 'Neighborhood_StoneBr', 'Neighborhood_Timber', 'Neighborhood_Veenker',
 'Condition1_Artery', 'Condition1_Feedr', 'Condition1_Norm',
 'Condition1_PosA', 'Condition1_PosN', 'Condition1_RRAe', 'Condition1_RRAn',
 'Condition1_RRNe', 'Condition1_RRNn', 'Condition2_Artery',
 'Condition2_Feedr', 'Condition2_Norm', 'Condition2_PosA', 'Condition2_RRAe',
 'Condition2_RRAn', 'Condition2_RRNn', 'BldgType_1Fam', 'BldgType_2fmCon',
 'BldgType_Duplex', 'BldgType_Twnhs', 'BldgType_TwnhsE', 'HouseStyle_1.5Fin',
 'HouseStyle_1.5Unf', 'HouseStyle_1Story', 'HouseStyle_2.5Fin',
 'HouseStyle_2.5Unf', 'HouseStyle_2Story', 'HouseStyle_SFoyer',
 'HouseStyle_SLvl', 'RoofStyle_Flat', 'RoofStyle_Gable', 'RoofStyle_Gambrel',
 'RoofStyle_Hip', 'RoofStyle_Mansard', 'RoofStyle_Shed', 'RoofMatl_CompShg',
 'RoofMatl_Membran', 'RoofMatl_Metal', 'RoofMatl_Roll', 'RoofMatl_Tar&Grv',
 'RoofMatl_WdShake', 'RoofMatl_WdShngl', 'Exterior1st_AsbShng',
 'Exterior1st_AsphShn', 'Exterior1st_BrkComm', 'Exterior1st_BrkFace',
 'Exterior1st_CBlock', 'Exterior1st_CemntBd', 'Exterior1st_HdBoard',
 'Exterior1st_ImStucc', 'Exterior1st_MetalSd', 'Exterior1st_Plywood',
 'Exterior1st_Stone', 'Exterior1st_Stucco', 'Exterior1st_VinylSd',
 'Exterior1st_Wd Sdng', 'Exterior1st_WdShing', 'Exterior2nd_AsbShng',
 'Exterior2nd_AsphShn', 'Exterior2nd_Brk Cmn', 'Exterior2nd_BrkFace',
 'Exterior2nd_CBlock', 'Exterior2nd_CmentBd', 'Exterior2nd_HdBoard',
 'Exterior2nd_ImStucc', 'Exterior2nd_MetalSd', 'Exterior2nd_Other',
 'Exterior2nd_Plywood', 'Exterior2nd_Stone', 'Exterior2nd_Stucco',
 'Exterior2nd_VinylSd', 'Exterior2nd_Wd Sdng', 'Exterior2nd_Wd Shng',
 'MasVnrType_BrkCmn', 'MasVnrType_BrkFace', 'MasVnrType_None',
 'MasVnrType_Stone', 'Foundation_BrkTil', 'Foundation_CBlock',
 'Foundation_PConc', 'Foundation_Slab', 'Foundation_Stone', 'Foundation_Wood',
 'Heating_Floor', 'Heating_GasA', 'Heating_GasW', 'Heating_Grav',
 'Heating_OthW', 'Heating_Wall', 'Electrical_FuseA', 'Electrical_FuseF',
 'Electrical_FuseP', 'Electrical_Mix', 'Electrical_SBrkr',
 'GarageType_2Types', 'GarageType_Attchd', 'GarageType_Basment',
 'GarageType_BuiltIn', 'GarageType_CarPort', 'GarageType_Detchd',
 'GarageType_None', 'MiscFeature_Gar2', 'MiscFeature_None',
 'MiscFeature_Othr', 'MiscFeature_Shed', 'MiscFeature_TenC', 'SaleType_COD',
 'SaleType_CWD', 'SaleType_Con', 'SaleType_ConLD', 'SaleType_ConLI',
 'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth', 'SaleType_WD',
 'SaleCondition_Abnorml', 'SaleCondition_AdjLand', 'SaleCondition_Alloca',
 'SaleCondition_Family', 'SaleCondition_Normal', 'SaleCondition_Partial']

In [15]:
## Some functions for Python Application

In [19]:
# Reload the raw training data (this rebinds `df`, which earlier held the
# single-row sample) and wrap it for descriptive statistics on SalePrice.
df = load_data(csv_name='train', load_from_disk=False)
Stat = descriptive_statistics(df=df, label='SalePrice')

In [23]:
# Stat.most_correlated_features(print_heatmap=False, tol=0.7)
# Stat.mean(feature='OverallQual')
# Stat.median(feature='OverallQual')
# df = pd.DataFrame(Stat.most_correlated_features(print_heatmap=False))
# df
# Features most correlated with the label, without the label itself. The label
# is removed by name: deleting a fixed position (10) removes the wrong entry or
# raises IndexError as soon as the number of selected features changes.
a = pd.DataFrame([name for name in Stat.most_correlated_features(print_heatmap=False)
                  if name != Stat.label])
# a

In [25]:
# Number of columns in the frame currently bound to `df`, shown as a 1-tuple
df.columns.shape

(81,)

In [62]:
# Same sample observation as before, but with every value given as a string
# (presumably the form in which values arrive from a URL query string - TODO confirm).
dict2 = {
    'OverallQual' : "5" ,
    'YearBuilt' : "1961",
    'YearRemodAdd' : "1961",
    'TotalBsmtSF' : "882.000" ,
    '1stFlrSF' : "896",
    'GrLivArea' : "896",
    'FullBath' : "1",
    'TotRmsAbvGrd' : "5",
    'GarageCars' : "1.000",
    'GarageArea' : "730.000"
}

In [68]:
# Convert every string value of dict2 to a float, keeping the keys as they are
newDict = {key: float(value) for key, value in dict2.items()}
# Single-row frame (index 0) scored with the model trained above
df2 = pd.DataFrame(newDict, index=[0])
prediction(my_model, df2)
# df2

65760.92965667119