## Steps to convert raw web scraped data into ML model ready data

1. Get intial features as obtained from RAW file
2. Append to other datasets
3. Impute missing data
4. Convert to numericals
3. Use encoder to encode values
4. Use pandas get dummies to do onehot encoding for columns 
5. Convert to Vectors X and y

In [1]:
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

In [2]:
import pandas as pd
import json
import requests
import numpy as np
from fuzzywuzzy import fuzz
requests.packages.urllib3.disable_warnings() 
pd.set_option('display.max_colwidth',0)
pd.set_option('display.float_format',lambda x: '%.2f' %x)
pd.set_option('display.max_columns',0)
from datetime import datetime
import pickle

In [3]:
class CarDataStandardizer():
    
    def __init__(self):        
        print("Initialized standardizer")  
        
    def fit(self):
        return self
    
    def transform(self, df):
        # merge the data
        df_input = df.copy()
        
        # impute columns which are optional
        df_input['owner'].fillna(1,inplace = True)   
        df_input['accidenthist'].fillna('N',inplace = True)       
        df_input['usage'].fillna('PERSONAL',inplace = True)          
        
        df_input['age'] = datetime.now().year - df_input['year'].astype('int')
        df_input['odo'] = df_input['odometer'].astype('int') / 1000
        df_input = df_input.drop(columns = ['odometer'])
        
        df_input['owner'] = pd.cut(x=df_input['owner'], bins=[-1,0,1,20], labels=[0,1,2])

        
        for col in ['make','model','trim','state','colorexterior','colorinterior','accidenthist','usage']:
            df_input[col] = df_input[col].astype(str).apply(lambda x:x.upper().strip())           


        return df_input

In [4]:
class CarDataAugmentor():
    
    def __init__(self):
        
        # prepare augment data from files
        print("Initialized CarDataAugmentor") 
        
        self.df_category = pd.read_csv('car_category.csv', index_col = False)

        self.df_reliability = pd.read_csv('car_reliability_rankings.csv', index_col = False)
        self.df_reliability = self.df_reliability[['Make','ReliabilityRank']]

        self.df_cost = pd.read_csv('statewise_economic_indicators.csv',index_col = False)
        self.df_cost = self.df_cost[['State','CostOfLivingRank']]

        self.df_sales = pd.read_csv('car_sales.csv',index_col = False)
        self.df_sales = self.df_sales.drop(columns = ['TotalSales'])

        self.df_turn = pd.read_csv("used_car_time_to_turn.csv")
        self.df_turn['AvgDaysToTurn'] = self.df_turn.mean(axis=1)
        self.df_turn['Make'] = self.df_turn['Make'].str.upper()
        self.df_turn = self.df_turn[['Make','AvgDaysToTurn']]


        self.df_ratings = pd.read_csv('car_ratings.csv', index_col = False)
        self.df_ratings.drop_duplicates(subset = ['MakeModel'], inplace = True)
        self.df_ratings['AvgMPG'] = (self.df_ratings['MpgCity']  + self.df_ratings['MpgHwy']) / 2
        self.df_ratings.loc[self.df_ratings['CarClass'].str.contains(r'LUXURY|SPORTS|HYBRID'),'LuxurySportsOrHybrid'] = 'Y'
        self.df_ratings['LuxurySportsOrHybrid']  = self.df_ratings['LuxurySportsOrHybrid'].fillna('N')
        self.df_ratings = self.df_ratings[['MakeModel','ReviewScore','AvgMPG','LuxurySportsOrHybrid']]
 
    def fit(self):
        return self

    def transform(self, df):
        # merge the data
        df_input = df.copy()
        
        df_input = df_input.merge(self.df_category,how = 'left', 
                                  left_on = ['year','make','model'], 
                                  right_on = ['Year','Make','Model'])
        df_input = df_input.drop(columns = ['Year','Make','Model']).rename({'Category' : 'bodytype'},axis = 1)

        df_input = df_input.merge(self.df_reliability,how = 'left',left_on='make', right_on='Make')
        df_input = df_input.drop(columns = ['Make'])

        df_input = df_input.merge(self.df_cost,how = 'left',left_on=['state'], right_on=['State'])
        df_input = df_input.drop(columns = ['State'])

        df_input = df_input.merge(self.df_sales,how = 'left',left_on=['make'], right_on=['Make'])
        df_input = df_input.drop(columns = ['Make'])

        df_input = df_input.merge(self.df_turn,how = 'left',left_on=['make'], right_on=['Make'])
        df_input = df_input.drop(columns = ['Make'])

        # Function to do fuzzy matching of make and model combination to get ratings
        def getclass(makemodel):

            try:
                matches = self.df_ratings['MakeModel'].apply(lambda x:fuzz.ratio(x,makemodel))
                if matches.max() > 80:
                    return matches.idxmax()
                else:
                    return -1
            except:
                return -1


        df_input['makemodel'] = df_input['make'] + ' ' + df_input['model'] 
        df_input['matchindex'] = df_input['makemodel'].apply(getclass)
        df_input = df_input.merge(self.df_ratings, how = 'left', left_on = 'matchindex', right_index = True)
        df_input = df_input.drop(columns = ['makemodel','matchindex','MakeModel'])

        def getdrivetrain(trim):

            try:
                drivetrain = [d for d in ['AWD','RWD','FWD','4WD','2WD'] if d in trim]
                drivetrain = drivetrain[0] if len(drivetrain) > 0 else 'FWD'
                drivetrain = 'FWD' if drivetrain in ['FWD','2WD'] else 'AWD'
                return drivetrain
            except:
                return 'FWD'

        df_input['drivetrain'] = df_input['trim'].apply(getdrivetrain)
        
        df_input = df_input.drop(columns = ['year', 'make', 'model', 'trim', 'state'])

        return df_input


In [19]:


class OneHotTransformer(SklearnOneHotEncoder):
    
    def __init__(self):
        super().__init__()
        print("Initialized OneHotTransformer") 
        self.fit_flag = False

    def fit(self, X):
        out = super().fit(X)
        self.fit_flag = True
        return out

    def transform(self, X):
        sparse_matrix = super().transform(X)
        new_columns = self.get_new_columns(X=X)
        print(new_columns)
        d_out = pd.DataFrame(sparse_matrix.toarray(), columns=new_columns, index=X.index)
        return d_out

    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)

    def get_new_columns(self, X):
        new_columns = []
        for i, column in enumerate(X.columns):
            j = 0
            while j < len(self.categories_[i]):
                new_columns.append(f'{column}_{self.categories_[i][j]}')
                j += 1
        return new_columns

In [20]:
in_data = [{
            "year" : 2014,
            "make" : "toyota",
            "model" : "corolla",
            "trim" : "le plus",
            "odometer" : 20700,
            "state" :  "AZ",
            "colorexterior" : "blue",
            "colorinterior" : "black",
            "accidenthist" : "n",
            "owner" : 5,
            "usage" : "personal"
            }]


df = pd.DataFrame.from_dict(in_data)

all_features = df.columns

categorical_features = ['colorexterior', 'colorinterior', 'accidenthist', 'owner', 'usage',
                       'bodytype','LuxurySportsOrHybrid', 'drivetrain']

In [21]:
df

Unnamed: 0,year,make,model,trim,odometer,state,colorexterior,colorinterior,accidenthist,owner,usage
0,2014,toyota,corolla,le plus,20700,AZ,blue,black,n,5,personal


In [22]:
a = CarDataStandardizer()
b = CarDataAugmentor()
c = OneHotTransformer()

df1 = a.transform(df)
df2 = b.transform(df1)
df3 = c.fit_transform(df2)
df3

Initialized standardizer
Initialized CarDataAugmentor
Initialized OneHotTransformer
['colorexterior_BLUE', 'colorinterior_BLACK', 'accidenthist_N', 'owner_2', 'usage_PERSONAL', 'age_6', 'odo_20.7', 'bodytype_SEDAN', 'ReliabilityRank_5', 'CostOfLivingRank_29', 'PercentSales_12.1872269830779', 'AvgDaysToTurn_43.23076923076923', 'ReviewScore_7.9', 'AvgMPG_32.0', 'LuxurySportsOrHybrid_N', 'drivetrain_FWD']


Unnamed: 0,colorexterior_BLUE,colorinterior_BLACK,accidenthist_N,owner_2,usage_PERSONAL,age_6,odo_20.7,bodytype_SEDAN,ReliabilityRank_5,CostOfLivingRank_29,PercentSales_12.1872269830779,AvgDaysToTurn_43.23076923076923,ReviewScore_7.9,AvgMPG_32.0,LuxurySportsOrHybrid_N,drivetrain_FWD
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
c.columns

In [None]:
colT = ColumnTransformer(
    [("oh", 
     SklearnOneHotEncoder(categories=[['WHITE', 'BLACK', 'SILVER', 'GRAY', 'BLUE', 'RED','OTHER'],
                                 ['BLACK', 'GRAY', 'BEIGE','OTHER'],
                                 ['Y', 'N'],
                                 ['0', '1','2'],
                                 ['PERSONAL', 'FLEET'],
                                 ['Y', 'N','U'],
                                 ['AWD', 'FWD']]
                                  ), 
    ['colorexterior', 'colorinterior', 'accidenthist', 'owner', 'usage','LuxurySportsOrHybrid', 'drivetrain']
    )]
    )

In [None]:
colT.fit_transform(c)

In [None]:
colT = ColumnTransformer(
    [("dummy", 
     SklearnOneHotEncoder(categories=[['WHITE', 'BLACK', 'SILVER', 'GRAY', 'BLUE', 'RED','OTHER'],
                                 ['BLACK', 'GRAY', 'BEIGE','OTHER'],
                                 ['Y', 'N'],
                                 ['0', '1','2'],
                                 ['PERSONAL', 'FLEET'],
                                 ['Y', 'N','U'],
                                 ['AWD', 'FWD']]
                                  ), 
    ['colorexterior', 'colorinterior', 'accidenthist', 'owner', 'usage','LuxurySportsOrHybrid', 'drivetrain']
    )]
    )

In [None]:
colT.fit_transform(c)

In [None]:
colT.get_feature_names() 

In [None]:
transform_pipeline = Pipeline(steps=[('standardize', CarDataStandardizer()), 
                                 ('augment',CarDataAugmentor())
                                 ])
df2 = transform_pipeline.transform(df)
df2

In [None]:
'colorexterior_BLACK', 'colorexterior_BLUE', 'colorexterior_GRAY',
                           'colorexterior_OTHER', 'colorexterior_RED', 'colorexterior_SILVER',
                           'colorexterior_WHITE', 'colorinterior_BEIGE', 'colorinterior_BLACK',
                           'colorinterior_GRAY', 'colorinterior_OTHER', 'bodytype_CONVERTIBLE',
                           'bodytype_COUPE', 'bodytype_HATCHBACK', 'bodytype_PICKUP',
                           'bodytype_SEDAN', 'bodytype_SUV', 'bodytype_TRUCK',
                           'bodytype_VAN/MINIVAN', 'bodytype_WAGON', 'accidenthist_N',
                           'accidenthist_Y', 'owner_0', 'owner_1', 'owner_2', 'usage_FLEET',
                           'usage_PERSONAL', 'LuxurySportsOrHybrid_N', 'LuxurySportsOrHybrid_U',
                           'LuxurySportsOrHybrid_Y', 'drivetrain_AWD', 'drivetrain_FWD'

In [None]:
oh = ColumnTransformer([
    (SklearnOneHotEncoder(categories=[['WHITE', 'BLACK', 'SILVER', 'GRAY', 'BLUE', 'RED','OTHER'],
                                      ['BLACK', 'GRAY', 'BEIGE','OTHER'],
                                      ['Y', 'N'],
                                      ['0', '1','2'],
                                      ['PERSONAL', 'FLEET'],
                                      ['Y', 'N','U'],
                                      ['AWD', 'FWD']]), 
                                        [0,1,2,3,4,10])
    )

In [None]:
preprocess = make_column_transformer(
                                    (CarDataStandardizer(),all_features),
                                    (CarDataAugmentor(),all_features),
                                    (OneHotTransformer(),categorical_features),
                                    remainder='passthrough'
                                    )

In [None]:
preprocess.fit_transform(df)

In [None]:
from sklearn.pipeline import Pipeline
model_pipeline = Pipeline(steps=[('standardize', CarDataStandardizer()), 
                                 ('augment',CarDataAugmentor())
                                 ])

In [None]:
in_data = [{
            "year" : 2014,
            "make" : "toyota",
            "model" : "corolla",
            "trim" : "le plus",
            "odometer" : 20700,
            "state" :  "AZ",
            "colorexterior" : "blue",
            "colorinterior" : "black",
            "accidenthist" : "n",
            "owner" : 5,
            "usage" : "personal"
            }]


df = pd.DataFrame.from_dict(in_data)
df4 = model_pipeline.transform(df)
df4

In [None]:
oh = OneHotTransformer()
oh.fit_transform(df4)

In [None]:
df4.columns

In [None]:
class OneHotTransformer(SklearnOneHotEncoder):
    
    def __init__(self):
        super(OneHotTransformer, self).__init__()
        print("Initialized OneHotTransformer") 
        self.fit_flag = False

    def fit(self, X):
        out = super().fit(X)
        self.fit_flag = True
        return out

    def transform(self, X):
        sparse_matrix = super(OneHotTransformer, self).transform(X)
        new_columns = self.get_new_columns(X=X)
        print(new_columns)
        d_out = pd.DataFrame(sparse_matrix.toarray(), columns=new_columns, index=X.index)
        return d_out

    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)

    def get_new_columns(self, X):
        new_columns = []
        for i, column in enumerate(X.columns):
            j = 0
            while j < len(self.categories_[i]):
                new_columns.append(f'{column}_{self.categories_[i][j]}')
                j += 1
        return new_columns

In [None]:
df_final = df4 [['ReliabilityRank', 'CostOfLivingRank', 'PercentSales', 'AvgDaysToTurn',
       'ReviewScore', 'AvgMPG', 'age', 'odo', 'owner_0', 'owner_1', 'owner_2',
       'usage_FLEET', 'usage_PERSONAL', 'LuxurySportsOrHybrid_N',
       'LuxurySportsOrHybrid_U', 'LuxurySportsOrHybrid_Y', 'drivetrain_AWD',
       'drivetrain_FWD', 'accidenthist_N', 'accidenthist_Y',
       'colorexterior_BLACK', 'colorexterior_BLUE', 'colorexterior_GRAY',
       'colorexterior_OTHER', 'colorexterior_RED', 'colorexterior_SILVER',
       'colorexterior_WHITE', 'colorinterior_BEIGE', 'colorinterior_BLACK',
       'colorinterior_GRAY', 'colorinterior_OTHER', 'bodytype_CONVERTIBLE',
       'bodytype_COUPE', 'bodytype_HATCHBACK', 'bodytype_PICKUP',
       'bodytype_SEDAN', 'bodytype_SUV', 'bodytype_TRUCK',
       'bodytype_VAN/MINIVAN', 'bodytype_WAGON']]

In [None]:
df_final = df4

In [None]:
model_pkl_file = "carprice_stack_model_v1.pkl"

with open(model_pkl_file, 'rb') as file:
    stack_model = pickle.load(file)
    


In [None]:
# Calculate the accuracy score and predict target values
y = stack_model.predict(df_final)
y