# Import Libraries

In [1]:
import pandas as pd 
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Read in Modified Carmax Data 

In [2]:
 # Create variable for csv file path
carmax_data_csv_path = "Carmax - Main (Modified).csv" 
# Read in csv file with path variable 
carmax_data = pd.read_csv(carmax_data_csv_path, index_col=0) 
# Create copy of df as new variable  
carmax_df = carmax_data.copy() 
carmax_df

Unnamed: 0,insert_num,purchase_make,purchase_model,purchase_vehicle_year,purchase_price,purchase_price_val,trade_in,vehicle_financing,customer_age,customer_age_val,customer_income,customer_income_val,customer_gender,customer_previous_purchase,customer_distance_to_dealer,post_purchase_satisfaction,vehicle_warranty_used,subsequent_purchases
0,81690,DODGE,CHARGER,2012,15001 - 20000,17501,True,True,21 - 30,25,40001 - 60000,50001,U,False,8.0,0.0,False,1
1,109994,FORD,F150,2007,15001 - 20000,17501,False,False,51 - 60,55,0 - 20000,10000,F,True,19.0,0.0,False,0
2,11399,BMW,328,2010,25001 - 30000,27501,True,True,41 - 50,45,60001 - 80000,70001,F,True,21.0,0.0,False,0
3,214156,LEXUS,GS 300,2003,10001 - 15000,12501,False,True,21 - 30,25,20001 - 40000,30001,M,False,8.0,0.0,False,0
4,36685,CHEVROLET,CRUZE,2012,15001 - 20000,17501,True,True,31 - 40,35,120001 - 140000,130001,M,True,5.0,0.0,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355869,195314,JEEP,LIBERTY,2010,15001 - 20000,17501,False,True,21 - 30,25,20001 - 40000,30001,F,False,13.0,0.0,False,0
355870,58219,CHEVROLET,SONIC,2012,10001 - 15000,12501,True,False,71 - 80,75,?,0,U,False,2.0,0.0,False,0
355871,207386,KIA,SORENTO,2012,15001 - 20000,17501,True,True,61 - 70,65,20001 - 40000,30001,F,True,9.0,0.0,False,1
355872,252391,NISSAN,350Z,2003,10001 - 15000,12501,False,True,21 - 30,25,40001 - 60000,50001,M,False,5.0,0.0,False,0


# Data Quality Validation

In [3]:
carmax_df.dtypes # Print data types for all column values

insert_num                       int64
purchase_make                   object
purchase_model                  object
purchase_vehicle_year            int64
purchase_price                  object
purchase_price_val               int64
trade_in                          bool
vehicle_financing                 bool
customer_age                    object
customer_age_val                 int64
customer_income                 object
customer_income_val              int64
customer_gender                 object
customer_previous_purchase        bool
customer_distance_to_dealer    float64
post_purchase_satisfaction     float64
vehicle_warranty_used             bool
subsequent_purchases             int64
dtype: object

In [4]:
carmax_df.isnull().sum() # Print the number of NaN values in each column  

insert_num                     0
purchase_make                  0
purchase_model                 0
purchase_vehicle_year          0
purchase_price                 0
purchase_price_val             0
trade_in                       0
vehicle_financing              0
customer_age                   0
customer_age_val               0
customer_income                0
customer_income_val            0
customer_gender                0
customer_previous_purchase     0
customer_distance_to_dealer    0
post_purchase_satisfaction     0
vehicle_warranty_used          0
subsequent_purchases           0
dtype: int64

In [5]:
carmax_df.columns.tolist() # Create list of all columns

['insert_num',
 'purchase_make',
 'purchase_model',
 'purchase_vehicle_year',
 'purchase_price',
 'purchase_price_val',
 'trade_in',
 'vehicle_financing',
 'customer_age',
 'customer_age_val',
 'customer_income',
 'customer_income_val',
 'customer_gender',
 'customer_previous_purchase',
 'customer_distance_to_dealer',
 'post_purchase_satisfaction',
 'vehicle_warranty_used',
 'subsequent_purchases']

# Data Exploration 

In [6]:
carmax_df.describe() # Summary statistics 

Unnamed: 0,insert_num,purchase_vehicle_year,purchase_price_val,customer_age_val,customer_income_val,customer_distance_to_dealer,post_purchase_satisfaction,subsequent_purchases
count,355874.0,355874.0,355874.0,355874.0,355874.0,355874.0,355874.0,355874.0
mean,177937.5,2009.527867,19241.313979,41.098366,50964.09747,15.227524,0.031373,0.533416
std,102732.119186,2.491927,7425.522232,14.383603,39588.322449,64.866575,0.546482,1.190327
min,1.0,1953.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,88969.25,2008.0,12501.0,25.0,30001.0,5.0,0.0,0.0
50%,177937.5,2010.0,17501.0,35.0,50001.0,8.0,0.0,0.0
75%,266905.75,2011.0,22501.0,55.0,70001.0,14.0,0.0,1.0
max,355874.0,2014.0,92501.0,95.0,190001.0,2455.0,10.0,10.0


# Select Data for ML Model

In [7]:
y = carmax_df.purchase_price_val # Create target object as y - (purchase price)

In [8]:
features_list = ['customer_age_val', 'customer_income_val'] # Create list of model features - (customer age, customer income) 

In [9]:
X = carmax_df[features_list] # Create df with columns from features list as X 

In [10]:
X.describe() # Summary statistics for X

Unnamed: 0,customer_age_val,customer_income_val
count,355874.0,355874.0
mean,41.098366,50964.09747
std,14.383603,39588.322449
min,0.0,0.0
25%,25.0,30001.0
50%,35.0,50001.0
75%,55.0,70001.0
max,95.0,190001.0


In [11]:
X.head(10) # Print first ten records from X

Unnamed: 0,customer_age_val,customer_income_val
0,25,50001
1,55,10000
2,45,70001
3,25,30001
4,35,130001
5,25,70001
6,25,30001
7,65,90001
8,45,50001
9,55,150001


# 1. Build Machine Learning Model

In [12]:
# Define ML model - Specify a number for random_state to generate same results each run
ml_model = DecisionTreeRegressor(random_state=0)

# Fit ML model - features (X), prediction object (y)
ml_model.fit(X,y)

DecisionTreeRegressor(random_state=0)

In [13]:
# Print the prediction values for the first 5 records 
print("Predictions for the following 5 purchase price values:")
print(X.head())
print("Predictions List:")
print(ml_model.predict(X.head()))

Predictions for the following 5 purchase price values:
   customer_age_val  customer_income_val
0                25                50001
1                55                10000
2                45                70001
3                25                30001
4                35               130001
Predictions List:
[17777.65317139 18629.24516885 20105.51115068 15398.17611554
 23905.22467595]


# 2. Build Machine Learning Model

In [14]:
# Generate prediction values for X
predicted_purchase_prices = ml_model.predict(X)

# Calculate MAE in-sample score - predicted purchase price values and actual purchase price values: error=actual-predicted
mean_absolute_error(y, predicted_purchase_prices) 

5080.152046797236

In [15]:
# split data into training and validation data, for both features and target, based on a random number generator. 
# Assign the numeric value to the random_state argument ensures the same split each time we run this script  
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Define ML Model 
ml_model = DecisionTreeRegressor()

# Fit ML model to training data
ml_model.fit(train_X, train_y) 

# Generate predicted purchase prices on validation data 
val_predictions = ml_model.predict(val_X)

print(mean_absolute_error(val_y, val_predictions)) 

5100.678635542901


# 3. Build Machine Learning Model

In [16]:
# Function that compares MAE scores from different values for max_leaf_nodes
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae) 

In [17]:
# Create list for candidate max leaf nodes
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# For loop to find the ideal tree size from candidate_max_leaf_nodes
scores = {max_leaf_nodes: get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y) for max_leaf_nodes in candidate_max_leaf_nodes} 

# Assign new variable to the best value of max_leaf_nodes
best_tree_size = min(scores, key=scores.get) 

In [18]:
# Fit the final ML model to it's optimal size
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=0)
final_model.fit(X, y)

DecisionTreeRegressor(max_leaf_nodes=250, random_state=0)

# 4. Build Random Forest ML Model

In [19]:
# Define RF model
rf_model = RandomForestRegressor()

# Fit RF model to training data 
rf_model.fit(train_X, train_y)

# Calculate MAE on validation data in RF model 
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(val_y, rf_val_predictions)

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae)) 

Validation MAE for Random Forest Model: 5100.390667540491


In [20]:
# Create new RF model to train on all training data
rf_model_on_full_data = RandomForestRegressor()

# Fit RF model to all training data   
rf_model_on_full_data.fit(train_X, train_y)

RandomForestRegressor()

In [21]:
# Create test_X which comes from X but includes only the feature columns used for prediction.
test_X = X[features_list]

# Generate prediction values on full data
test_preds = rf_model_on_full_data.predict(test_X) 
test_preds

array([17773.42031833, 18616.84213721, 20086.73227931, ...,
       17154.21861913, 17773.42031833, 21164.76968694])

In [22]:
# Print the prediction values for the first 5 records 
print("Predictions for the following 5 purchase price values:")
print(test_X.head())
print("Predictions List:")
print(rf_model.predict(test_X.head()))

Predictions for the following 5 purchase price values:
   customer_age_val  customer_income_val
0                25                50001
1                55                10000
2                45                70001
3                25                30001
4                35               130001
Predictions List:
[17775.94318939 18597.3798121  20093.42729793 15408.96853222
 24101.74200809]


# Output prediction results to csv file

In [23]:
output_df = pd.DataFrame(
    {'customer_age_val': test_X.customer_age_val,
    'customer_income_val': test_X.customer_income_val,
    'purchase_price_val': test_preds}
) 

output_df.to_csv("Carmax - ML Results (Output).csv", index=False) 