# Used Car Prices

### Problem Statement

The aim of this project is to build a regression model that helps a new car-trading company determine the price of used cars.

### Evaluation Metric
Mean squared error (𝑀𝑆𝐸)

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import PowerTransformer

# Read the prepared prediction dataset
prediction_data = '2. Prepared Data/pred_cars.csv'
df = pd.read_csv(prediction_data)


def _load_pickle(path):
    """Return the object deserialized from the pickle file at `path`."""
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only point this at files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load the regression model from its pickle file
model_path = '5. Insights/Models/used_car_prices_model.pickle'
regression_model_loaded = _load_pickle(model_path)
print('Regression model {} is loaded.'.format(regression_model_loaded))

# Load the PowerTransformer object for the inverse target transformation
transformer_path = '5. Insights/Models/used_car_prices_target_transformation.pickle'
transformer_loaded = _load_pickle(transformer_path)
print('Transformer {} is loaded.'.format(transformer_loaded))

Regression model Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num_power',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value=0,
                                                                                 strategy='constant')),
                                                                  ('power',
                                                                   PowerTransformer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['odometer_value',
                                                   'odometer_value/year',
                                                   'duration_listed']),
                                                 ('num_qt',
      

### Preprocess the data

In [2]:
# Build the engineered columns that the prediction cell selects via `features`.

# --- 'name': manufacturer and model names combined ---------------------------
columns_strip = ['manufacturer_name', 'model_name']
# Remove surrounding whitespace; .str.strip() leaves missing values as NaN
# instead of raising like a plain Python str.strip() call would.
for column in columns_strip:
    df[column] = df[column].str.strip()
df['name'] = df['manufacturer_name'] + ' ' + df['model_name']

# --- age and mileage per year -------------------------------------------------
# Reference year used for the age calculation.
# NOTE(review): presumably has to equal the year used when the model was
# trained -- confirm before changing.
reference_year = 2020
car_age = reference_year - df['year_produced']
# A car produced in the reference year has age 0, so the division yields +/-inf.
# Convert that to NaN so the constant imputer in the loaded pipeline can fill it.
df['odometer_value/year'] = (
    (df['odometer_value'] / car_age)
    .replace([np.inf, -np.inf], np.nan)
    .round()
)
# Age of the car in years
df['year'] = car_age

# --- collapse rare car names into 'other' -------------------------------------
# Names occurring fewer than `car_total` times are treated as rare.
# NOTE(review): the counts come from the prediction file itself, not from the
# training data -- confirm this matches how the model was trained.
car_total = 6
# Index the counts Series directly: the column name produced by wrapping
# value_counts() in a DataFrame differs between pandas versions
# ('name' before 2.0, 'count' from 2.0), so it must not be relied upon.
name_counts = df['name'].value_counts()
car_models = name_counts[name_counts < car_total].index
df['name'] = df['name'].where(~df['name'].isin(car_models), 'other')

# --- merge hybrid and electric fuel types into one category -------------------
hybrid = 'hybrid_or_electric'
df['engine_fuel'] = df['engine_fuel'].replace(
    {'hybrid-petrol': hybrid, 'hybrid-diesel': hybrid, 'electric': hybrid}
)

# --- number of unnamed features a car has -------------------------------------
features_list = ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4',
                 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9']
# Row-wise sum across the ten feature_* columns
df['other_features'] = df[features_list].sum(axis=1)

In [3]:
# Predictor columns passed to the model, in this order
features = [
    'manufacturer_name',
    'has_warranty',
    'state',
    'drivetrain',
    'transmission',
    'name',
    'odometer_value',
    'odometer_value/year',
    'year',
    'engine_fuel',
    'color',
    'duration_listed',
    'body_type',
    'engine_capacity',
    'other_features',
    'feature_0',
]

### Prediction

In [4]:
# Select the predictor columns and run them through the loaded pipeline
model_input = df[features]
prediction = regression_model_loaded.predict(model_input)
# Display the raw model output; the next cell inverse-transforms it to prices
prediction

array([-0.98057763, -0.79509045, -0.0357562 , ..., -0.36707498,
        1.38403971, -0.71307949])

In [5]:
# Map the model output back to car prices with the loaded target transformer;
# the 1-D prediction array is reshaped into a single column first.
prediction_column = prediction.reshape(-1, 1)
y_predict_price = transformer_loaded.inverse_transform(prediction_column)
# Round each price to the nearest hundred (decimals=-2)
y_predict_price_round = np.round(y_predict_price, decimals=-2)

# Collect exact and rounded prices, aligned with the rows of df
results = pd.DataFrame(
    {
        'Predicted': y_predict_price.ravel(),
        'Predicted rounded': y_predict_price_round.ravel(),
    },
    index=df.index,
)

# Car information together with the predicted prices
prediction_results = df.join(results)

# Write car information plus predicted prices to a csv file
full_output_csv = "5. Insights/Prediction/used_car_prices_prediction_data_predicted_price.csv"
prediction_results.to_csv(full_output_csv, index=False)

# Write only the predicted prices to a csv file
prices_output_csv = "5. Insights/Prediction/used_car_prices_predicted_price.csv"
results.to_csv(prices_output_csv, index=False)

# Show the predicted car prices
results

Unnamed: 0,Predicted,Predicted rounded
0,1630.153931,1600.0
1,2016.545408,2000.0
2,4452.182328,4500.0
3,10044.839076,10000.0
4,5509.451315,5500.0
...,...,...
7702,2574.606131,2600.0
7703,17271.026202,17300.0
7704,3196.767203,3200.0
7705,15155.154055,15200.0
