In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import math

# Load CSV file
url = "car_listing_combined.csv"
df = pd.read_csv(url)



In [2]:
df.dtypes

Price           object
Year            object
Make            object
Model           object
Mileage         object
Transmission    object
Fuel Type       object
Body Type       object
Color           object
Engine Size     object
NCT Expiry      object
Road Tax        object
dtype: object

In [3]:
df['Price'] = df['Price'].str.replace('€', '')
df['Price'] = df['Price'].str.replace('£', '')
df['Price'] = df['Price'].str.replace(',', '')
df['Price'] = df['Price'].str.replace('.', '')
df['Price'] = df['Price'].str.replace('No Price', '0')
df.Price.str.extract('(\d+)')
df.Year.str.extract('(\d+)')
df['Engine Size'] = df['Engine Size'].str.replace('litre', '')
df.head()

  df['Price'] = df['Price'].str.replace('.', '')


Unnamed: 0,Price,Year,Make,Model,Mileage,Transmission,Fuel Type,Body Type,Color,Engine Size,NCT Expiry,Road Tax
0,0,2009,Audi,A6,"189,000 km",---,Diesel,Saloon,Black,2.0,Jan-24,280
1,0,2013,Vauxhall,Insignia,"207,000 km",Unknown,Diesel,Hatchback,White,2.0,Nov-23,200
2,0,2016,Other,Other,"24,500 km",Automatic,Diesel,Saloon,Black,2.1,Jan-24,---
3,0,2011,Volkswagen,Golf,"350,000 km",Manual,Diesel,Hatchback,Blue,1.6,Jun-23,200
4,0,2015,BMW,5-Series,"260,000 km",Automatic,Diesel,Saloon,White,2.0,Nov-23,190


In [4]:
df = df[df['Price'].notna()]
df = df[df['Year'].notna()]
df = df[df['Engine Size'].notna()]
df = df[df['Road Tax'].notna()]

In [5]:
print(df)

      Price  Year        Make        Model     Mileage Transmission Fuel Type  \
0         0  2009        Audi           A6  189,000 km          ---    Diesel   
1         0  2013    Vauxhall     Insignia  207,000 km      Unknown    Diesel   
2         0  2016       Other        Other   24,500 km    Automatic    Diesel   
3         0  2011  Volkswagen         Golf  350,000 km       Manual    Diesel   
4         0  2015         BMW     5-Series  260,000 km    Automatic    Diesel   
...     ...   ...         ...          ...         ...          ...       ...   
68393     0  2015  Land Rover  Range Rover         ---    Automatic    Diesel   
68394     0  2023     Peugeot         3008        0 km       Manual    Diesel   
68395     0  2012  Volkswagen         Golf         ---          ---       ---   
68396     0  2023     Peugeot       RIFTER        8 km       Manual    Diesel   
68397     0  2023     Peugeot       RIFTER        2 km       Manual    Diesel   

       Body Type  Color Eng

In [6]:
def convert_to_km(df, Mileage):
    """
    Converts a column with mixed units of kilometers and miles to kilometers.

    Args:
        df (pandas.DataFrame): The dataframe containing the column to convert.
        column_name (str): The name of the column to convert.

    Returns:
        pandas.Series: The converted column as a pandas Series.
    """
    series = df[Mileage].copy()  # create a copy of the column to avoid modifying the original dataframe

    # loop over each element in the column
    for i, val in series.items():
        if isinstance(val, str):
            if val.endswith('km'):  # check if the value is in kilometers
                km = float(val[:-2].replace(',', ''))  # extract the numeric value of kilometers and remove commas
            elif val.endswith('mi'):  # check if the value is in miles
                miles = float(val[:-2].replace(',', ''))  # extract the numeric value of the miles and remove commas
                km = miles * 1.60934  # convert miles to kilometers
            else:
                continue  # skip the cell if it's not in km or mi format
        else:
            km = val  # leave the value unchanged if it's already in kilometers

        series[i] = km  # update the value in the series

    return series


In [7]:
df["Mileage"] = convert_to_km(df, 'Mileage')

In [8]:
df.head()

Unnamed: 0,Price,Year,Make,Model,Mileage,Transmission,Fuel Type,Body Type,Color,Engine Size,NCT Expiry,Road Tax
0,0,2009,Audi,A6,189000.0,---,Diesel,Saloon,Black,2.0,Jan-24,280
1,0,2013,Vauxhall,Insignia,207000.0,Unknown,Diesel,Hatchback,White,2.0,Nov-23,200
2,0,2016,Other,Other,24500.0,Automatic,Diesel,Saloon,Black,2.1,Jan-24,---
3,0,2011,Volkswagen,Golf,350000.0,Manual,Diesel,Hatchback,Blue,1.6,Jun-23,200
4,0,2015,BMW,5-Series,260000.0,Automatic,Diesel,Saloon,White,2.0,Nov-23,190


In [9]:
# replace '---' with NaN in 'Price' and 'Year' columns
#df['Price'] = df['Price'].replace('---', np.nan)
#df['Year'] = df['Year'].replace('---', np.nan)
df = df.replace('---', np.nan)

# convert the 'Price' and 'Year' columns to float
df[['Price', 'Year']] = df[['Price', 'Year']].astype(float)


In [10]:
df.dropna(inplace=True)

# drop rows with 0 values
df = df[(df != 0).all(1)]
print(df)

        Price    Year        Make    Model     Mileage Transmission  \
82        1.0  2005.0       Other    Other   83000.000      Unknown   
83        1.0  2007.0        Opel    Astra     185.000       Manual   
84        1.0  2008.0     Renault   Laguna     215.000       Manual   
85        1.0  2009.0        Opel   Antara     167.000       Manual   
87        1.0  2012.0        Ford    Focus  263931.760       Manual   
...       ...     ...         ...      ...         ...          ...   
65175  9000.0  2013.0      Toyota    Prius  235000.000    Automatic   
65192  9450.0  2015.0  Volkswagen    Caddy  165762.020       Manual   
65215  9900.0  2017.0      Nissan  Qashqai  197466.018       Manual   
65228  9995.0  2012.0       Honda     CR-V  122309.840    Automatic   
65235  9995.0  2016.0       Honda     Jazz   24140.100       Manual   

           Fuel Type  Body Type   Color Engine Size NCT Expiry Road Tax  
82            Petrol     Saloon   Black        2.0      Mar-21      710  

In [11]:
df[['Engine Size','Road Tax']] = df[['Engine Size', 'Road Tax']].astype(float)

In [12]:
dfx = df[['Price','Year','Make','Model','Transmission','Fuel Type','Body Type','Engine Size','Road Tax','Mileage']]
dfx.dropna(inplace=True)
print(dfx)

        Price    Year        Make    Model Transmission      Fuel Type  \
82        1.0  2005.0       Other    Other      Unknown         Petrol   
83        1.0  2007.0        Opel    Astra       Manual         Petrol   
84        1.0  2008.0     Renault   Laguna       Manual         Diesel   
85        1.0  2009.0        Opel   Antara       Manual         Diesel   
87        1.0  2012.0        Ford    Focus       Manual         Diesel   
...       ...     ...         ...      ...          ...            ...   
65175  9000.0  2013.0      Toyota    Prius    Automatic  Petrol Hybrid   
65192  9450.0  2015.0  Volkswagen    Caddy       Manual         Diesel   
65215  9900.0  2017.0      Nissan  Qashqai       Manual         Diesel   
65228  9995.0  2012.0       Honda     CR-V    Automatic         Diesel   
65235  9995.0  2016.0       Honda     Jazz       Manual         Petrol   

       Body Type  Engine Size  Road Tax     Mileage  
82        Saloon          2.0     710.0   83000.000  
83 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [13]:
dfx = dfx.drop(dfx[dfx.Mileage <= 5000].index)
dfx = dfx.drop(dfx[dfx.Price <= 1234].index)
dfx = dfx.drop(dfx[dfx.Price >= 100000].index)
#dfx["Usage"] = dfx["Mileage"] / (2023 - dfx["Year"])
dfx.corr()


Unnamed: 0,Price,Year,Engine Size,Road Tax,Mileage
Price,1.0,0.198726,0.322364,0.042304,-0.007849
Year,0.198726,1.0,-0.036878,-0.09894,-0.002297
Engine Size,0.322364,-0.036878,1.0,0.497711,0.000492
Road Tax,0.042304,-0.09894,0.497711,1.0,0.00149
Mileage,-0.007849,-0.002297,0.000492,0.00149,1.0


In [14]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import joblib
import pandas as pd
import numpy as np


# Preprocessing: Fill missing values and handle infinity values
for column in dfx.columns:
    if dfx[column].dtype == np.float64 or dfx[column].dtype == np.int64:
        dfx[column].fillna(dfx[column].mean(), inplace=True)
        dfx[column] = np.where(np.isinf(dfx[column]), dfx[column].mean(), dfx[column])
    else:
        dfx[column].fillna(dfx[column].mode()[0], inplace=True)

# Encode categorical features using label encoding
categorical_columns = ['Make', 'Model', 'Transmission', 'Fuel Type', 'Body Type']
label_encoders = {}

for column in categorical_columns:
    le = LabelEncoder()
    dfx[column] = le.fit_transform(dfx[column])
    label_encoders[column] = le

# Split the data into training and testing sets
X = dfx.drop('Price', axis=1)
y = dfx['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
X_train = X_train.dropna()
y_train = y_train.dropna()

In [16]:
X_train = X_train.astype(np.float64)
y_train = y_train.astype(np.float64)

In [17]:
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rfc = RandomForestClassifier(n_estimators=25, random_state=42)
rfc.fit(X_train, y_train)

# Step 5: Save the model, input data, and encoders for future use
joblib.dump(rfc, 'rfc_model3.pkl')
joblib.dump(label_encoders, 'label_encoders3.pkl')
dfx.to_csv('input_data3.csv', index=False)

# Step 6: Load the model, input data, and encoders, and test the model
loaded_rfc = joblib.load('rfc_model3.pkl')
loaded_label_encoders = joblib.load('label_encoders3.pkl')
loaded_dfx = pd.read_csv('input_data3.csv')

# Prepare test data (assuming it's in the same format as the input data)
test_data = loaded_dfx.drop('Price', axis=1)
test_labels = loaded_dfx['Price']





In [18]:
# Test the model
predictions = loaded_rfc.predict(test_data)

accuracy = accuracy_score(test_labels, predictions)
confusion_mat = confusion_matrix(test_labels, predictions)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)


Accuracy: 0.8054513352460826
Confusion Matrix:
 [[22  0  0 ...  0  0  0]
 [ 0  1  0 ...  0  0  0]
 [ 0  0 14 ...  0  0  0]
 ...
 [ 0  0  0 ...  5  0  0]
 [ 0  0  0 ...  0  1  0]
 [ 0  0  0 ...  0  0  1]]


In [20]:
# Example: Predict the price of a specific car
# Create a sample input with the format: ['Year', 'Make', 'Model', 'Transmission', 'Fuel Type','Engine Size','Road Tax']
sample_input = [2016, 'Audi', 'A3', 'Automatic', 'Petrol', 'Saloon', 1.4, 190, 217000]

# Encode the categorical features using the loaded encoders
for i, column in enumerate(categorical_columns):
    sample_input[i + 1] = loaded_label_encoders[column].transform([sample_input[i + 1]])[0]

# Make the prediction using the loaded RFC model
predicted_price = loaded_rfc.predict([sample_input])
print("Predicted price:", predicted_price[0])


Predicted price: 17750.0


