In [1]:
import pandas as pd
import numpy as np
import sklearn
import joblib

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [3]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

In [4]:
# Report the library versions this notebook was executed with.
for _lib_name, _lib in (
    ("pandas", pd),
    ("numpy", np),
    ("sklearn", sklearn),
    ("joblib", joblib),
):
    print(f"{_lib_name}.__version__", _lib.__version__, flush=True)

pandas.__version__ 1.0.3
numpy.__version__ 1.19.2
sklearn.__version__ 0.23.2
joblib.__version__ 0.17.0


In [5]:
# File name the fitted pipeline is written to with joblib.dump and later read
# back by submit_model.
pipeline_file_name = "Used_Car_Price_Pipeline_SVR.pkl"

# Load Data and Select Features

In [6]:
# Read the raw data from the CSV file (path is relative to the working directory).
# The bare `raw_df` on the last line renders the frame as the cell output.
raw_df = pd.read_csv("clean-testing-data.csv") 
raw_df

Unnamed: 0,id,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,lat,long,posting_date
0,7316356412,auburn,15000,2013.0,ford,f-150 xlt,excellent,6.0,gas,128000.0,clean,automatic,rwd,full-size,truck,black,al,32.592000,-85.518900,2021-05-03T14:02:03-0500
1,7314560853,auburn,19900,2004.0,ford,f250 super duty,good,8.0,diesel,88000.0,clean,automatic,4wd,full-size,pickup,blue,al,32.547500,-85.468200,2021-04-29T17:19:18-0500
2,7313406529,auburn,14000,2012.0,honda,odyssey,excellent,6.0,gas,95000.0,clean,automatic,fwd,full-size,mini-van,silver,al,32.628739,-85.461820,2021-04-27T12:20:01-0500
3,7312847466,auburn,22500,2001.0,ford,f450,good,8.0,diesel,144700.0,clean,manual,rwd,full-size,truck,white,al,32.630400,-85.401600,2021-04-26T11:15:36-0500
4,7312144944,auburn,15000,2017.0,dodge,charger rt 4dr sedan,excellent,8.0,gas,90000.0,rebuilt,automatic,rwd,mid-size,sedan,grey,al,32.822400,-85.770400,2021-04-24T18:39:59-0500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71901,7303112347,wyoming,15999,2018.0,chevrolet,"cruze, lt",excellent,4.0,gas,36465.0,clean,automatic,fwd,mid-size,sedan,black,wy,41.138284,-104.784799,2021-04-07T09:03:17-0600
71902,7302963673,wyoming,18000,2005.0,chevrolet,silverado 1500 lt 4x4,excellent,8.0,gas,130000.0,lien,automatic,4wd,full-size,truck,blue,wy,43.452800,-110.739300,2021-04-06T21:04:03-0600
71903,7302963273,wyoming,18000,1990.0,jeep,gand wagoneer,good,8.0,gas,114400.0,clean,automatic,4wd,full-size,SUV,black,wy,43.452800,-110.739300,2021-04-06T21:02:26-0600
71904,7302384818,wyoming,9800,1985.0,nissan,300zx coupe with t-tops,like new,6.0,gas,115000.0,clean,automatic,rwd,sub-compact,hatchback,red,wy,41.143700,-104.796200,2021-04-05T18:10:52-0600


# Process Data for Machine Learning

In [7]:
# Move "id" from a regular column into the index so it identifies rows without
# being a feature; the result is bound to a new name, raw_df keeps its columns.
training_df = raw_df.set_index("id")

In [8]:
def simple_category_map(df, column, category_map):
    """
    Replace the labels in ``df[column]`` with the values from ``category_map``.

    The conversion is skipped when the column is already numeric, so running
    the function a second time on its own output leaves the values unchanged.
    The mapping and a ``describe()`` summary of the column are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``column``. It is not modified; a copy is returned.
    column : str
        Name of the column to convert.
    category_map : dict
        Lookup from label to replacement value.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``column`` converted.
    """
    # Work on a copy so the caller's frame (and anything displayed from it
    # earlier) is never changed behind its back.
    result = df.copy()
    if not pd.api.types.is_numeric_dtype(result[column]):
        original = result[column]
        mapped = original.map(category_map)
        # Series.map yields NaN for labels missing from the mapping; report
        # them instead of letting them vanish silently.
        unmapped = original[mapped.isna() & original.notna()]
        if not unmapped.empty:
            print(
                f"Warning: {len(unmapped)} value(s) in column {column} have no mapping "
                f"and became NaN: {sorted(unmapped.astype(str).unique())}"
            )
        result[column] = mapped
    print(f"Column {column}: {category_map}")
    print(result[column].describe())
    print("")
    return result

In [9]:
# Convert the "condition" and "size" text columns into numbers using the
# lookup tables from the local mapping_values module.
from mapping_values import vehicle_condition_map, vehicle_size_map

for _column, _mapping in (
    ("condition", vehicle_condition_map),
    ("size", vehicle_size_map),
):
    training_df = simple_category_map(training_df, _column, _mapping)

Column condition: {'new': 1, 'like new': 2, 'excellent': 3, 'good': 4, 'fair': 5, 'salvage': 6}
count    71906.000000
mean         3.260006
std          0.752632
min          1.000000
25%          3.000000
50%          3.000000
75%          4.000000
max          5.000000
Name: condition, dtype: float64

Column size: {'compact': 1, 'sub-compact': 2, 'mid-size': 3, 'full-size': 4}
count    71906.000000
mean         3.296206
std          0.990295
min          1.000000
25%          3.000000
50%          4.000000
75%          4.000000
max          4.000000
Name: size, dtype: float64



In [10]:
# Encode the drive train as two 0/1 flag columns; "4wd" sets both of them.
# Skipped when both flags already exist, so the cell can be re-run safely.
if not {"fwd", "rwd"}.issubset(training_df.columns):
    drive_type = training_df["drive"]
    four_wheel = drive_type == "4wd"
    training_df["fwd"] = ((drive_type == "fwd") | four_wheel) * 1
    training_df["rwd"] = ((drive_type == "rwd") | four_wheel) * 1

# The flags replace the original text column.
if "drive" in training_df.columns:
    training_df = training_df.drop(columns=["drive"])

# Summarise both flags, separated by a blank line.
for _position, _flag in enumerate(("fwd", "rwd")):
    if _position:
        print("")
    print(f"Column {_flag}")
    print(training_df[_flag].describe())

Column fwd
count    71906.000000
mean         0.779671
std          0.414472
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: fwd, dtype: float64

Column rwd
count    71906.000000
mean         0.633911
std          0.481738
min          0.000000
25%          0.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: rwd, dtype: float64


In [11]:
# Show the column names after the conversions above, then render the frame as
# the cell output.
print(list(training_df.columns))
training_df

['region', 'price', 'year', 'manufacturer', 'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status', 'transmission', 'size', 'type', 'paint_color', 'state', 'lat', 'long', 'posting_date', 'fwd', 'rwd']


Unnamed: 0_level_0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,size,type,paint_color,state,lat,long,posting_date,fwd,rwd
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7316356412,auburn,15000,2013.0,ford,f-150 xlt,3,6.0,gas,128000.0,clean,automatic,4,truck,black,al,32.592000,-85.518900,2021-05-03T14:02:03-0500,0,1
7314560853,auburn,19900,2004.0,ford,f250 super duty,4,8.0,diesel,88000.0,clean,automatic,4,pickup,blue,al,32.547500,-85.468200,2021-04-29T17:19:18-0500,1,1
7313406529,auburn,14000,2012.0,honda,odyssey,3,6.0,gas,95000.0,clean,automatic,4,mini-van,silver,al,32.628739,-85.461820,2021-04-27T12:20:01-0500,1,0
7312847466,auburn,22500,2001.0,ford,f450,4,8.0,diesel,144700.0,clean,manual,4,truck,white,al,32.630400,-85.401600,2021-04-26T11:15:36-0500,0,1
7312144944,auburn,15000,2017.0,dodge,charger rt 4dr sedan,3,8.0,gas,90000.0,rebuilt,automatic,3,sedan,grey,al,32.822400,-85.770400,2021-04-24T18:39:59-0500,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7303112347,wyoming,15999,2018.0,chevrolet,"cruze, lt",3,4.0,gas,36465.0,clean,automatic,3,sedan,black,wy,41.138284,-104.784799,2021-04-07T09:03:17-0600,1,0
7302963673,wyoming,18000,2005.0,chevrolet,silverado 1500 lt 4x4,3,8.0,gas,130000.0,lien,automatic,4,truck,blue,wy,43.452800,-110.739300,2021-04-06T21:04:03-0600,1,1
7302963273,wyoming,18000,1990.0,jeep,gand wagoneer,4,8.0,gas,114400.0,clean,automatic,4,SUV,black,wy,43.452800,-110.739300,2021-04-06T21:02:26-0600,1,1
7302384818,wyoming,9800,1985.0,nissan,300zx coupe with t-tops,2,6.0,gas,115000.0,clean,automatic,2,hatchback,red,wy,41.143700,-104.796200,2021-04-05T18:10:52-0600,0,1


# Select Training Features

In [12]:
# Scale the training data to better train
def scale_data(y, y_train, y_test):
    '''
        Min-max scale the training and testing targets with one shared scaler.

        The scaler is fitted on `y` and then applied to `y_train` and `y_test`.

        Returns the tuple (y_train_scaled, y_test_scaled, y_scaler).
    '''
    # NOTE(review): the scaler is fitted on `y`, not on `y_train`; if `y` also
    # contains the test rows they influence the scaling range -- confirm that
    # this is intended.
    fitted_scaler = MinMaxScaler().fit(y)
    # one scaler for both splits keeps them on the same scale
    scaled_train, scaled_test = (
        fitted_scaler.transform(part) for part in (y_train, y_test)
    )
    return scaled_train, scaled_test, fitted_scaler

In [13]:
# Shuffle the rows to remove any potential ordering bias (the displayed frame
# appears to be grouped by region).
# A fixed random_state keeps the row order, and therefore everything derived
# from it, identical after Restart Kernel -> Run All.
training_df = sklearn.utils.shuffle(training_df, random_state=42)

In [14]:
# Select the target and the features used to train the model
target_feature = "price"

# Features fed to the model (no nulls for the selected features).
# Currently unused columns: region, manufacturer, model, cylinders, fuel,
# title_status, paint_color, lat, long, posting_date ("id" is the index).
selected_features = [
    "year",
    "condition",
    "odometer",
    "transmission",
    "size",
    "type",
    "state",
    "fwd",
    "rwd",
]

# Just in case the target was left in the feature list: it must never be an input.
selected_features = [name for name in selected_features if name != target_feature]

# Just in case we lose a feature upstream: drop whatever is missing from the frame.
# The missing names are collected first because removing items from a list
# while iterating over it skips the element that follows each removal.
missing_features = [name for name in selected_features if name not in training_df.columns]
for feature in missing_features:
    print(f"Feature '{feature}' was dropped because it was missing from training dataframe!")
selected_features = [name for name in selected_features if name not in missing_features]

X = training_df[selected_features]
y = training_df[target_feature].values.ravel()
print(X.shape, y.shape)

(71906, 9) (71906,)


In [15]:
# Split into training and testing sets using train_test_split's default sizes
# and a fixed seed, then report the shape of each part.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
for _split_name, _split_X, _split_y in (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
):
    print(f"{_split_name}\t", _split_X.shape, _split_y.shape)

train	 (53929, 9) (53929,)
test	 (17977, 9) (17977,)


# Select Column Transformer

In [16]:
# Text columns that are expanded with one-hot encoding.
one_hot_encoding_columns = ["transmission", "type", "state"]

# Numeric columns that are rescaled with min-max scaling.
min_max_columns = ["year", "condition", "odometer", "size"]

# handle_unknown="ignore": a category not seen during fit does not raise at
# predict time (see the scikit-learn OneHotEncoder documentation).
category_encoder = OneHotEncoder(categories="auto", handle_unknown="ignore")
numeric_scaler = MinMaxScaler()

# remainder="passthrough" keeps every column named in neither list as it is.
column_transformer = make_column_transformer(
    (category_encoder, one_hot_encoding_columns),
    (numeric_scaler, min_max_columns),
    remainder="passthrough",
    n_jobs=-1,
    verbose=True,
)
column_transformer

ColumnTransformer(n_jobs=-1, remainder='passthrough',
                  transformers=[('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 ['transmission', 'type', 'state']),
                                ('minmaxscaler', MinMaxScaler(),
                                 ['year', 'condition', 'odometer', 'size'])],
                  verbose=True)

# Select Model

In [17]:
# Support vector regressor with a linear kernel; every argument that is not
# listed here keeps the library default.
clf = SVR(
    kernel="linear",
    C=1.0,
    tol=0.001,
    gamma="scale",
    cache_size=200,
    verbose=True,
)
clf

SVR(kernel='linear', verbose=True)

# Create Machine Learning Pipeline

In [18]:
# Chain the column transformer and the regressor into a single estimator so
# that both are fitted, saved and applied together.
pipeline = make_pipeline(column_transformer, clf, verbose=True)
pipeline

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['transmission', 'type',
                                                   'state']),
                                                 ('minmaxscaler',
                                                  MinMaxScaler(),
                                                  ['year', 'condition',
                                                   'odometer', 'size'])],
                                   verbose=True)),
                ('svr', SVR(kernel='linear', verbose=True))],
         verbose=True)

In [19]:
# Fit the preprocessing and the SVR on the training split only.
# In the recorded run the SVR step took about 2.9 minutes.
pipeline.fit(X_train, y_train)

[Pipeline] . (step 1 of 2) Processing columntransformer, total=   3.4s
[LibSVM][Pipeline] ............... (step 2 of 2) Processing svr, total= 2.9min


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['transmission', 'type',
                                                   'state']),
                                                 ('minmaxscaler',
                                                  MinMaxScaler(),
                                                  ['year', 'condition',
                                                   'odometer', 'size'])],
                                   verbose=True)),
                ('svr', SVR(kernel='linear', verbose=True))],
         verbose=True)

# Score Machine Learning Pipeline

In [20]:
# Evaluate on the held-out test split; per the scikit-learn documentation,
# score() of a regressor returns the coefficient of determination (R^2).
pipeline.score(X_test, y_test)

0.22195677477270048

# Save Machine Learning Pipeline

In [21]:
# Persist the fitted pipeline (preprocessing + model) to pipeline_file_name.
joblib.dump(pipeline, pipeline_file_name)

['Used_Car_Price_Pipeline_SVR.pkl']

In [22]:
# Print the column order of X; the submit_model calls below pass the same
# X.columns.values as `sorted_columns`.
print("When you send in the data to predict the value, use these columns in this order!")
print(list(X.columns.values))

When you send in the data to predict the value, use these columns in this order!
['year', 'condition', 'odometer', 'transmission', 'size', 'type', 'state', 'fwd', 'rwd']


# Testing Loading the Model

In [23]:
def submit_model(vehicle_as_dict, sorted_columns, pipeline_file):
    """
    Predict a value for one vehicle with a pipeline loaded from disk.

    Parameters
    ----------
    vehicle_as_dict : dict
        One vehicle as ``{column name: value}``. It is not modified.
    sorted_columns : sequence of str
        Column names in the order the pipeline expects; every name must be a
        key of ``vehicle_as_dict`` (pandas raises ``KeyError`` otherwise).
    pipeline_file : str
        Path of the pipeline file written with ``joblib.dump``.

    Returns
    -------
    float
        The prediction for the vehicle, rounded to two decimals. The value is
        also printed.
    """
    # A list holding one dict gives a one-row frame. The caller's dict is left
    # untouched; no value has to be wrapped in a list in place.
    testing_vehicle = pd.DataFrame([vehicle_as_dict])
    # sorting columns into the order the pipeline was trained with
    testing_vehicle = testing_vehicle[sorted_columns]
    # load the model from disk
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load pipeline files from a trusted source.
    testing_pipeline = joblib.load(pipeline_file)
    vehicle_testing_results = testing_pipeline.predict(testing_vehicle)
    testing_result = round(vehicle_testing_results[0], 2)
    print("testing_result: ", testing_result, flush=True)
    return testing_result

In [24]:
# Smoke test of the save/load round trip using the last row of X. X holds the
# rows of both splits, so this is not an accuracy check. submit_model already
# prints the value, so the outer print shows it a second time.
print(submit_model(X.iloc[-1].to_dict(), X.columns.values, pipeline_file_name))

testing_result:  12171.66
12171.66


In [25]:
# Same round-trip check with the second-to-last row of X (value is printed
# once by submit_model and once by the outer print).
print(submit_model(X.iloc[-2].to_dict(), X.columns.values, pipeline_file_name))

testing_result:  11600.99
11600.99


In [26]:
# Same round-trip check with the third-to-last row of X (value is printed
# once by submit_model and once by the outer print).
print(submit_model(X.iloc[-3].to_dict(), X.columns.values, pipeline_file_name))

testing_result:  6778.8
6778.8
