## Steps to convert raw web scraped data into ML model ready data

1. Get initial features as obtained from the RAW file
2. Append to other datasets
3. Impute missing data
4. Convert to numericals
5. Use encoder to encode values
6. Use pandas `get_dummies` to do one-hot encoding for columns
7. Convert to vectors X and y

In [1]:
import pandas as pd
import json
import requests
import numpy as np
from fuzzywuzzy import fuzz
requests.packages.urllib3.disable_warnings() 
pd.set_option('display.max_colwidth',0)
pd.set_option('display.float_format',lambda x: '%.2f' %x)
pd.set_option('display.max_columns',0)
from datetime import datetime
import pickle
from pandas.api.types import is_numeric_dtype



In [2]:
# Numeric model inputs; they lead the final feature frame.
numeric_features = [
    'ReliabilityRank',
    'CostOfLivingRank',
    'PercentSales',
    'AvgDaysToTurn',
    'ReviewScore',
    'AvgMPG',
    'age',
    'odo',
]

# Categorical inputs mapped to the closed set of levels that get a dummy column.
cat_features = {
    'owner': ['0', '1', '2'],
    'usage': ['FLEET', 'PERSONAL'],
    'LuxurySportsOrHybrid': ['N', 'U', 'Y'],
    'drivetrain': ['AWD', 'FWD'],
    'accidenthist': ['N', 'Y'],
    'colorexterior': ['BLACK', 'BLUE', 'GRAY', 'OTHER', 'RED', 'SILVER', 'WHITE'],
    'colorinterior': ['BEIGE', 'BLACK', 'GRAY', 'OTHER'],
    'bodytype': ['CONVERTIBLE', 'COUPE', 'HATCHBACK', 'PICKUP', 'SEDAN', 'SUV',
                 'TRUCK', 'VAN/MINIVAN', 'WAGON'],
}

# One dummy column per (feature, level) pair, named "<feature>_<level>",
# in dictionary order and then level order.
onehot_features = [
    feature + '_' + level
    for feature, levels in cat_features.items()
    for level in levels
]

# Full ordered column list: numeric columns first, then the dummy columns.
final_column_set = numeric_features + onehot_features

mandatory_input_features = [
    'year',
    'make',
    'model',
    'trim',
    'odometer',
    'state',
    'colorexterior',
    'colorinterior',
]

In [3]:
class CarDataStandardizer():
    """Normalize raw car records into a consistent shape.

    ``transform`` upper-cases and strips the text columns, collapses rare
    colours into ``'OTHER'``, derives ``age`` and ``odo`` and buckets
    ``owner`` into three categories.
    """

    def __init__(self):
        print("Initialized standardizer")

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``.

        ``X`` and ``y`` are accepted (and ignored) so that scikit-learn style
        callers, which invoke ``fit(X, y)``, can use this step. Calling
        ``fit()`` with no arguments keeps working.
        """
        return self

    def transform(self, df):
        """Return a standardized copy of ``df``; the input is not modified.

        Required columns: ``make``, ``model``, ``trim``, ``state``,
        ``colorexterior``, ``colorinterior``, ``accidenthist``, ``usage``,
        ``year``, ``odometer`` and ``owner``. A missing column raises
        ``KeyError``.

        The result drops ``odometer`` and gains ``age`` and ``odo``.
        """
        df_input = df.copy()

        text_columns = ['make', 'model', 'trim', 'state', 'colorexterior',
                        'colorinterior', 'accidenthist', 'usage']
        for col in text_columns:
            df_input[col] = df_input[col].astype(str).str.upper().str.strip()

        # Any colour outside the short list of known values becomes 'OTHER'.
        exterior_known = ['WHITE', 'BLACK', 'SILVER', 'GRAY', 'BLUE', 'RED']
        interior_known = ['BLACK', 'GRAY', 'BEIGE']
        df_input['colorexterior'] = df_input['colorexterior'].where(
            df_input['colorexterior'].isin(exterior_known), 'OTHER')
        df_input['colorinterior'] = df_input['colorinterior'].where(
            df_input['colorinterior'].isin(interior_known), 'OTHER')

        # Age is relative to the current calendar year at call time.
        df_input['age'] = datetime.now().year - df_input['year'].astype('int')
        # Odometer expressed in thousands.
        df_input['odo'] = df_input['odometer'].astype('int') / 1000
        df_input = df_input.drop(columns=['odometer'])

        # Buckets: 0 owners -> 0, 1 owner -> 1, 2..20 owners -> 2.
        # Values outside (-1, 20] fall in no bin and become NaN.
        df_input['owner'] = pd.cut(x=df_input['owner'], bins=[-1, 0, 1, 20], labels=[0, 1, 2])

        return df_input

In [4]:
class CarDataAugmentor():
    """Enrich standardized car records with reference data read from CSV files.

    The constructor loads six lookup tables (body category, reliability
    rank, state cost-of-living rank, sales share, days-to-turn and ratings).
    ``transform`` left-joins them onto the input and derives ``drivetrain``
    from the trim text.
    """

    def __init__(self, data_dir='.'):
        """Load the lookup tables.

        Parameters
        ----------
        data_dir : str, default '.'
            Directory that contains the CSV files. The default keeps the
            original behaviour of reading from the working directory.
        """
        import os  # local import so the class does not depend on file-level imports

        print("Initialized CarDataAugmentor")

        def load(filename, **kwargs):
            return pd.read_csv(os.path.join(data_dir, filename), **kwargs)

        self.df_category = load('car_category.csv', index_col=False)

        self.df_reliability = load('car_reliability_rankings.csv', index_col=False)
        self.df_reliability = self.df_reliability[['Make', 'ReliabilityRank']]

        self.df_cost = load('statewise_economic_indicators.csv', index_col=False)
        self.df_cost = self.df_cost[['State', 'CostOfLivingRank']]

        self.df_sales = load('car_sales.csv', index_col=False)
        self.df_sales = self.df_sales.drop(columns=['TotalSales'])

        turn = load("used_car_time_to_turn.csv")
        # numeric_only=True: the frame still holds the string 'Make' column,
        # which makes a plain row-wise mean raise on recent pandas versions.
        turn['AvgDaysToTurn'] = turn.mean(axis=1, numeric_only=True)
        turn['Make'] = turn['Make'].str.upper()
        self.df_turn = turn[['Make', 'AvgDaysToTurn']]

        ratings = load('car_ratings.csv', index_col=False)
        # Keep the first row per MakeModel; the original row labels are kept
        # because transform() joins on them.
        ratings = ratings.drop_duplicates(subset=['MakeModel']).copy()
        ratings['AvgMPG'] = (ratings['MpgCity'] + ratings['MpgHwy']) / 2
        # na=False: a missing CarClass counts as "not luxury/sports/hybrid"
        # instead of producing a NaN mask.
        is_special = ratings['CarClass'].str.contains(r'LUXURY|SPORTS|HYBRID', na=False)
        ratings['LuxurySportsOrHybrid'] = is_special.map({True: 'Y', False: 'N'})
        self.df_ratings = ratings[['MakeModel', 'ReviewScore', 'AvgMPG', 'LuxurySportsOrHybrid']]

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``.

        ``X`` and ``y`` are accepted (and ignored) so that scikit-learn style
        callers, which invoke ``fit(X, y)``, can use this step.
        """
        return self

    @staticmethod
    def _left_join(left, right, left_keys, right_keys):
        """Left-merge ``right`` onto ``left`` and drop the right-hand key columns."""
        merged = left.merge(right, how='left', left_on=left_keys, right_on=right_keys)
        return merged.drop(columns=right_keys)

    def _match_rating_index(self, makemodel):
        """Return the ratings row label that best fuzzy-matches ``makemodel``.

        Returns -1 when the best ratio is not above 80, when ``makemodel`` is
        not a string (for example NaN), or when the comparison cannot be made.
        """
        if not isinstance(makemodel, str):
            return -1
        try:
            scores = self.df_ratings['MakeModel'].apply(lambda name: fuzz.ratio(name, makemodel))
            return scores.idxmax() if scores.max() > 80 else -1
        except (TypeError, ValueError):
            # Non-string rating names or an empty ratings table: no match.
            return -1

    @staticmethod
    def _infer_drivetrain(trim):
        """Map a trim string to 'AWD' or 'FWD'.

        The first token found among AWD, RWD, FWD, 4WD, 2WD (in that order)
        decides. FWD and 2WD map to 'FWD', everything else to 'AWD'. No token,
        or a trim that cannot be searched, gives 'FWD'.
        """
        try:
            found = [d for d in ('AWD', 'RWD', 'FWD', '4WD', '2WD') if d in trim]
        except TypeError:
            return 'FWD'
        if not found:
            return 'FWD'
        return 'FWD' if found[0] in ('FWD', '2WD') else 'AWD'

    def transform(self, df):
        """Return ``df`` augmented with the lookup columns.

        Expects ``year``, ``make``, ``model``, ``trim`` and ``state``; these
        are dropped from the result. Rows with no lookup match get NaN in the
        corresponding added columns. The input is not modified.
        """
        df_input = df.copy()

        df_input = self._left_join(df_input, self.df_reliability, ['make'], ['Make'])
        df_input = self._left_join(df_input, self.df_cost, ['state'], ['State'])
        df_input = self._left_join(df_input, self.df_sales, ['make'], ['Make'])
        df_input = self._left_join(df_input, self.df_turn, ['make'], ['Make'])

        df_input = self._left_join(df_input, self.df_category,
                                   ['year', 'make', 'model'],
                                   ['Year', 'Make', 'Model'])
        df_input = df_input.rename({'Category': 'bodytype'}, axis=1)

        # Fuzzy-match "MAKE MODEL" against the ratings table. Results are
        # memoised so repeated make/model pairs are scored only once.
        match_cache = {}

        def cached_match(makemodel):
            if makemodel not in match_cache:
                match_cache[makemodel] = self._match_rating_index(makemodel)
            return match_cache[makemodel]

        df_input['makemodel'] = df_input['make'] + ' ' + df_input['model']
        df_input['matchindex'] = df_input['makemodel'].apply(cached_match)
        # -1 is never a ratings row label, so unmatched rows join to nothing.
        df_input = df_input.merge(self.df_ratings, how='left', left_on='matchindex', right_index=True)
        df_input = df_input.drop(columns=['makemodel', 'matchindex', 'MakeModel'])

        df_input['drivetrain'] = df_input['trim'].apply(self._infer_drivetrain)

        df_final = df_input.drop(columns=['year', 'make', 'model', 'trim', 'state'])

        return df_final


In [5]:
class OneHotTransformer():
    """One-hot encode the categorical features into a fixed set of columns.

    The encoded columns are always exactly ``self.att_columns``, in that
    order. Levels absent from the data are filled with 0, and dummy columns
    not listed there are discarded.
    """

    def __init__(self):
        print("Initialized OneHotTransformer")
        # Categorical source columns and their expected dummy columns,
        # both taken from the module-level feature definitions.
        self.attlist = list(cat_features.keys())
        self.att_columns = onehot_features

    def fit(self, df=None, y=None):
        """Do nothing and return ``self``.

        ``y`` is accepted (and ignored) so that scikit-learn style callers,
        which invoke ``fit(X, y)``, can use this step.
        """
        return self

    def transform(self, df):
        """Return a copy of ``df`` with the categorical columns replaced by dummies.

        Every column in ``self.attlist`` must be present in ``df``. The input
        is not modified.
        """
        df_input = df.copy()

        # dtype is pinned to uint8 (the historical pandas default). Newer
        # pandas emits bool dummies, and bool mixed with float makes
        # DataFrame.values object dtype, which breaks numeric checks.
        att_dummies = pd.get_dummies(df_input[self.attlist], dtype=np.uint8)
        att_dummies = att_dummies.reindex(columns=self.att_columns, fill_value=0)

        df_final = pd.concat([df_input, att_dummies], axis=1)
        return df_final.drop(columns=self.attlist)

In [6]:
class PreModelValidator():
    """Final gate before prediction: check columns, nulls and dtypes.

    ``transform`` raises ``AssertionError`` when the frame is not model
    ready. Otherwise it returns the frame with its columns in the expected
    order.
    """

    def __init__(self, expected_columns=None):
        """
        Parameters
        ----------
        expected_columns : list of str, optional
            Exact, ordered column list the frame must have. When omitted, the
            module-level ``final_column_set`` is used.
        """
        print("Initialized PreModelValidator")
        self.expected_columns = expected_columns

    def fit(self, df=None, y=None):
        """Do nothing and return ``self``.

        ``y`` is accepted (and ignored) so that scikit-learn style callers,
        which invoke ``fit(X, y)``, can use this step.
        """
        return self

    def transform(self, df):
        """Validate ``df`` and return it with columns in the expected order.

        Raises
        ------
        AssertionError
            If the column set differs from the expected one, if any value is
            null, or if the frame's values are not of a numeric dtype.
        """
        df_input = df.copy()

        if self.expected_columns is None:
            expected = list(final_column_set)
        else:
            expected = list(self.expected_columns)

        # Explicit raises instead of assert statements, so the checks still
        # run when Python is started with -O.
        actual = set(df_input.columns)
        if actual != set(expected):
            missing = sorted(set(expected) - actual, key=str)
            unexpected = sorted(actual - set(expected), key=str)
            raise AssertionError(
                "Feature columns do not match the expected set "
                "(missing: %s, unexpected: %s)" % (missing, unexpected))

        # Ensure no nulls
        if df_input.isnull().values.any():
            raise AssertionError("Null Values exist in features")

        # Ensure no non-numeric values
        if not is_numeric_dtype(df_input.values):
            raise AssertionError("Non-numeric Values exist in features")

        df_final = df_input[expected]

        return df_final

In [7]:
from sklearn.pipeline import Pipeline

# Preprocessing chain applied to raw records before prediction:
# standardize -> augment with lookup data -> one-hot encode -> validate.
# Each step is instantiated here, so the augmentor's CSV files are read now.
transformer_pipeline = Pipeline(
    steps=[
        ('standardize', CarDataStandardizer()),
        ('augment', CarDataAugmentor()),
        ('onehot', OneHotTransformer()),
        ('validate', PreModelValidator()),
    ]
)

Initialized standardizer
Initialized CarDataAugmentor
Initialized OneHotTransformer
Initialized PreModelValidator


In [8]:
# Path of the pickled price model.
model_pkl_file = "carprice_stack_model_v1.pkl"

# NOTE: unpickling executes code stored in the file; only load model files
# that come from a trusted source.
with open(model_pkl_file, 'rb') as fh:
    stack_model = pickle.load(fh)


## Using single record

In [8]:
# One raw record, given as a list holding a single dict.
in_data = [
    {
        "year": 2014,
        "make": "toyota",
        "model": "corolla",
        "trim": "le plus",
        "odometer": 20700,
        "state": "AZ",
        "colorexterior": "blue",
        "colorinterior": "black",
        "accidenthist": "n",
        "owner": 5,
        "usage": "personal",
    },
]

# Build a one-row frame and run it through the preprocessing pipeline.
df = pd.DataFrame(in_data)
df4 = transformer_pipeline.transform(df)

In [12]:
df4

Unnamed: 0,ReliabilityRank,CostOfLivingRank,PercentSales,AvgDaysToTurn,ReviewScore,AvgMPG,age,odo,owner_0,owner_1,owner_2,usage_FLEET,usage_PERSONAL,LuxurySportsOrHybrid_N,LuxurySportsOrHybrid_U,LuxurySportsOrHybrid_Y,drivetrain_AWD,drivetrain_FWD,accidenthist_N,accidenthist_Y,colorexterior_BLACK,colorexterior_BLUE,colorexterior_GRAY,colorexterior_OTHER,colorexterior_RED,colorexterior_SILVER,colorexterior_WHITE,colorinterior_BEIGE,colorinterior_BLACK,colorinterior_GRAY,colorinterior_OTHER,bodytype_CONVERTIBLE,bodytype_COUPE,bodytype_HATCHBACK,bodytype_PICKUP,bodytype_SEDAN,bodytype_SUV,bodytype_TRUCK,bodytype_VAN/MINIVAN,bodytype_WAGON
0,5,29,12.19,43.23,7.9,32.0,6,20.7,0,0,1,0,1,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0


In [11]:
# Predict the target value for the transformed record with the loaded model
# (no accuracy score is computed here).
y = stack_model.predict(df4)
y

array([11198.571], dtype=float32)

## reading from csv

In [9]:
# Load the batch of raw input records from a CSV file.
df = pd.read_csv('datasample.csv')

In [10]:
# Run the raw records through the preprocessing pipeline (transform only).
df4 = transformer_pipeline.transform(df)

In [11]:
df4

Unnamed: 0,ReliabilityRank,CostOfLivingRank,PercentSales,AvgDaysToTurn,ReviewScore,AvgMPG,age,odo,owner_0,owner_1,owner_2,usage_FLEET,usage_PERSONAL,LuxurySportsOrHybrid_N,LuxurySportsOrHybrid_U,LuxurySportsOrHybrid_Y,drivetrain_AWD,drivetrain_FWD,accidenthist_N,accidenthist_Y,colorexterior_BLACK,colorexterior_BLUE,colorexterior_GRAY,colorexterior_OTHER,colorexterior_RED,colorexterior_SILVER,colorexterior_WHITE,colorinterior_BEIGE,colorinterior_BLACK,colorinterior_GRAY,colorinterior_OTHER,bodytype_CONVERTIBLE,bodytype_COUPE,bodytype_HATCHBACK,bodytype_PICKUP,bodytype_SEDAN,bodytype_SUV,bodytype_TRUCK,bodytype_VAN/MINIVAN,bodytype_WAGON
0,5,29,12.19,43.23,7.9,32.0,6,20.7,0,0,1,0,1,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
1,18,49,8.34,48.69,8.1,34.0,5,35.64,0,1,0,0,1,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0


In [12]:
# Predict one value per row of the transformed frame.
y = stack_model.predict(df4)

In [13]:
y

array([11198.571, 14339.904], dtype=float32)

In [14]:
# Attach the predictions as a new column. After this, df4 holds a
# non-feature column and is no longer the exact frame passed to predict().
df4['predicted_price'] = y

In [15]:
df4

Unnamed: 0,ReliabilityRank,CostOfLivingRank,PercentSales,AvgDaysToTurn,ReviewScore,AvgMPG,age,odo,owner_0,owner_1,owner_2,usage_FLEET,usage_PERSONAL,LuxurySportsOrHybrid_N,LuxurySportsOrHybrid_U,LuxurySportsOrHybrid_Y,drivetrain_AWD,drivetrain_FWD,accidenthist_N,accidenthist_Y,colorexterior_BLACK,colorexterior_BLUE,colorexterior_GRAY,colorexterior_OTHER,colorexterior_RED,colorexterior_SILVER,colorexterior_WHITE,colorinterior_BEIGE,colorinterior_BLACK,colorinterior_GRAY,colorinterior_OTHER,bodytype_CONVERTIBLE,bodytype_COUPE,bodytype_HATCHBACK,bodytype_PICKUP,bodytype_SEDAN,bodytype_SUV,bodytype_TRUCK,bodytype_VAN/MINIVAN,bodytype_WAGON,predicted_price
0,5,29,12.19,43.23,7.9,32.0,6,20.7,0,0,1,0,1,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,11198.57
1,18,49,8.34,48.69,8.1,34.0,5,35.64,0,1,0,0,1,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,14339.9
