In [249]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

# Define the username and file path.
# NOTE: this is a machine-specific Windows location; change `username` (or
# `file_path`) when running the notebook on another machine.
username = 'iain'
file_path = f'C:\\Users\\{username}\\Desktop\\RV_Price_Info.csv'

# Read the CSV file
df = pd.read_csv(file_path)

# Drop the bookkeeping columns first: 'Link' and 'ID' are removed before the
# features are built, so a blank link or ID alone should not cost us a row.
df = df.drop(columns=['Link', 'ID'])

# Now remove rows that are missing a value in any remaining column
# (the feature columns or the 'Asking' target).
df = df.dropna()

# 'Asking' is the target variable; every other remaining column is a feature
X = df.drop(columns='Asking')
y = df['Asking']

# Display the first rows of the features and of the target
print('Dataset')
print(X.head())

print('Target data')
print(y.head())

Dataset
       Brand Model Variant    Year Condition    Miles     Interior
0  Winnebago  Ekko     22A  2022.0      Used  20600.0  Leatherette
1  Winnebago  Ekko     22A  2022.0      Used  18664.0  Leatherette
2  Winnebago  Ekko     22A  2024.0       New      0.0  Leatherette
Target data
0    128000.0
1    157900.0
2    232056.0
Name: Asking, dtype: float64


In [250]:
# Expand only the categorical columns listed here into indicator (dummy)
# columns; columns not listed are left as they are.
columnstoonehot = [
    'Brand',
    'Model',
    'Variant',
    'Condition',
    'Interior',
]
X = pd.get_dummies(data=X, columns=columnstoonehot)
print(X.head())

     Year    Miles  Brand_Winnebago  Model_Ekko  Variant_22A  Condition_New   
0  2022.0  20600.0             True        True         True          False  \
1  2022.0  18664.0             True        True         True          False   
2  2024.0      0.0             True        True         True           True   

   Condition_Used  Interior_Leatherette  
0            True                  True  
1            True                  True  
2           False                  True  


In [251]:
# Keep a reference to the column labels of X after one-hot encoding.
# X.columns is a pandas Index (see the printed output), not a plain array.
columnsAfterOneHot = X.columns
print(columnsAfterOneHot)

Index(['Year', 'Miles', 'Brand_Winnebago', 'Model_Ekko', 'Variant_22A',
       'Condition_New', 'Condition_Used', 'Interior_Leatherette'],
      dtype='object')


In [252]:
# Cast every boolean column of X to float64; all other columns are skipped.
# The dtypes are read once up front, then each matching column is replaced.
for name, kind in X.dtypes.items():
    if kind == 'bool':
        X[name] = X[name].astype('float64')

In [253]:
# Print the dtype of the target y (the 'Asking' column) as a sanity check
print(y.dtypes)

float64


In [254]:
# Fixed seed shared by the forest and the split so that re-running the
# notebook reproduces the same train/test rows, the same fitted model and
# therefore the same error and prediction.
SEED = 42

# create a random forest regressor model (seeded for reproducibility)
model = RandomForestRegressor(random_state=SEED)

# create an instance of the one hot encoder
# (kept for compatibility; none of the visible cells use it, the encoding
# shown in this notebook is done with pd.get_dummies)
ohe = OneHotEncoder()

# split the data into training (80%) and testing (20%) sets, seeded
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

In [255]:
# Train on the training split, then predict the held-out split.
# fit() hands back the fitted estimator, so predict() is chained onto it.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Score the predictions against the true test targets
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')




Mean Squared Error: 6326293444.0


In [256]:
# Hand-built feature row for a single prediction. The keys spell out the
# one-hot encoded column names directly; each value is a one-element list.
data = dict(
    Year=[2022.0],
    Miles=[10000.0],
    Brand_Winnebago=[True],
    Model_Ekko=[True],
    Variant_22A=[True],
    Condition_New=[False],
    Condition_Used=[True],
    Interior_Leatherette=[True],
)

print(data)

{'Year': [2022.0], 'Miles': [10000.0], 'Brand_Winnebago': [True], 'Model_Ekko': [True], 'Variant_22A': [True], 'Condition_New': [False], 'Condition_Used': [True], 'Interior_Leatherette': [True]}


In [257]:
# Build a one-row frame from the hand-encoded dictionary
df = pd.DataFrame.from_dict(data)

# Print, in order: the frame itself, its column labels, and each column's dtype
for view in (df, df.columns, df.dtypes):
    print(view)

     Year    Miles  Brand_Winnebago  Model_Ekko  Variant_22A  Condition_New   
0  2022.0  10000.0             True        True         True          False  \

   Condition_Used  Interior_Leatherette  
0            True                  True  
Index(['Year', 'Miles', 'Brand_Winnebago', 'Model_Ekko', 'Variant_22A',
       'Condition_New', 'Condition_Used', 'Interior_Leatherette'],
      dtype='object')
Year                    float64
Miles                   float64
Brand_Winnebago            bool
Model_Ekko                 bool
Variant_22A                bool
Condition_New              bool
Condition_Used             bool
Interior_Leatherette       bool
dtype: object


In [258]:
# Convert the Year and Miles columns to float.
# Both numeric features are cast explicitly so the frame holds floats even if
# the hand-written values are entered as integers (e.g. 2022 instead of 2022.0).
df = df.astype({'Year': float, 'Miles': float})

In [259]:
# Run the trained model on the hand-built frame; the result is indexed below,
# and its first element is the prediction for the frame's first row.
price = model.predict(df)

# print the predicted price with a $ sign, thousands separators and two decimals
print(f'Predicted Price: ${price[0]:,.2f}')

Predicted Price: $152,518.00
