In [139]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

# Location of the RV listings CSV.
# NOTE(review): absolute, machine-specific path -- point it at your own copy
# (or make it relative to the project) before running on another machine.
file_path = "D:/RVPrice/rvpriceprediction/RV_Price_Info.csv"

# Read the CSV file
df = pd.read_csv(file_path)

# Preview the raw listings exactly as loaded, before any cleaning
print(df.head())

# Drop the columns that are not model inputs *before* filtering on missing
# values: 'Link' holds the listing URL and 'ID' looks like a row identifier,
# so a blank in either one should not discard an otherwise complete listing.
df = df.drop(columns=['Link', 'ID'])

# Remove rows that are missing any of the remaining feature/target values
df = df.dropna()

# 'Asking' is the regression target; every other remaining column is a feature
X = df.drop(columns='Asking')
y = df['Asking']

# display the data
print('Dataset')
print(X.head())

print('Target data')
print(y.head())

   ID      Brand Model Variant  Year Condition  Miles  Asking   
0   1  Winnebago  Ekko     22A  2022      Used  20600  128000  \
1   2  Winnebago  Ekko     22A  2022      Used  18664  157900   
2   4  Winnebago  Ekko     22A  2022      Used  32000  152000   
3   5  Winnebago  Ekko     22A  2022      Used  13632  144900   
4   6  Winnebago  Ekko     22A  2022      Used  56000  125000   

                                                Link  
0  https://www.rvtrader.com/listing/2022-Winnebag...  
1  https://www.rvtrader.com/listing/2022-Winnebag...  
2  https://www.rvtrader.com/listing/2022-Winnebag...  
3  https://www.rvtrader.com/listing/2022-Winnebag...  
4  https://www.lamesarv.com/inventory/class-c/win...  
Dataset
       Brand Model Variant  Year Condition  Miles
0  Winnebago  Ekko     22A  2022      Used  20600
1  Winnebago  Ekko     22A  2022      Used  18664
2  Winnebago  Ekko     22A  2022      Used  32000
3  Winnebago  Ekko     22A  2022      Used  13632
4  Winnebago  Ekko   

In [140]:
# Categorical features that get expanded into one indicator column per value;
# the columns not listed here (Year, Miles) pass through unchanged.
columnstoonehot = ['Brand', 'Model', 'Variant', 'Condition']

# NOTE: X is rebound to the encoded frame, so this cell is not idempotent --
# re-run the cell that builds X before running this one a second time.
X = pd.get_dummies(data=X, columns=columnstoonehot)
print(X.head())

   Year  Miles  Brand_Winnebago  Model_Ekko  Variant_22A  Condition_New   
0  2022  20600             True        True         True          False  \
1  2022  18664             True        True         True          False   
2  2022  32000             True        True         True          False   
3  2022  13632             True        True         True          False   
4  2022  56000             True        True         True          False   

   Condition_Used  
0            True  
1            True  
2            True  
3            True  
4            True  


In [141]:
# Keep the post-encoding column labels for later reference.
# X.columns is a pandas Index (see the printed repr), not a plain array.
columnsAfterOneHot = X.columns
print(columnsAfterOneHot)

Index(['Year', 'Miles', 'Brand_Winnebago', 'Model_Ekko', 'Variant_22A',
       'Condition_New', 'Condition_Used'],
      dtype='object')


In [142]:
# Cast every boolean indicator column to float64 in a single pass;
# columns of any other dtype are left exactly as they are.
bool_columns = X.select_dtypes(include='bool').columns
X = X.astype({name: 'float64' for name in bool_columns})

In [143]:
# show the dtype of the target Series (a Series has exactly one dtype)
print(y.dtype)

int64


In [144]:
# Fixed seed shared by the forest and the split so that a fresh
# "Restart Kernel -> Run All" reproduces the same MSE and predictions.
RANDOM_STATE = 42

# create a random forest regressor model
# (seeded: the bootstrap sampling inside the forest is otherwise random)
model = RandomForestRegressor(random_state=RANDOM_STATE)

# create an instance of the one hot encoder
# NOTE(review): not used by any cell visible here (encoding is done with
# pd.get_dummies); kept in case a later cell relies on it.
ohe = OneHotEncoder()

# split the data into training and testing sets:
# 80% of the rows train the model, 20% are held out for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

In [145]:
# Train on the training split and immediately predict the held-out rows;
# fit() returns the fitted estimator, so the two calls can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Score the held-out predictions. MSE is expressed in squared units of the
# target, so large values are expected for prices of this magnitude.
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')




Mean Squared Error: 343100977.0


In [146]:
# One hand-encoded sample to price. The keys mirror the one-hot column names
# produced during encoding, and each value is a one-element list so the dict
# can be turned into a single-row DataFrame.
data = dict(
    Year=[2022.0],
    Miles=[10000.0],
    Brand_Winnebago=[True],
    Model_Ekko=[True],
    Variant_22A=[True],
    Condition_New=[False],
    Condition_Used=[True],
)

print(data)

{'Year': [2022.0], 'Miles': [10000.0], 'Brand_Winnebago': [True], 'Model_Ekko': [True], 'Variant_22A': [True], 'Condition_New': [False], 'Condition_Used': [True]}


In [147]:
# Build a one-row frame from the hand-encoded dict.
# NOTE: this rebinds `df`, so from here on `df` is the sample to predict,
# no longer the listings table loaded at the top of the notebook.
df = pd.DataFrame.from_dict(data)

# Show, in order: the row itself, its column labels, and the per-column dtypes
for view in (df, df.columns, df.dtypes):
    print(view)

     Year    Miles  Brand_Winnebago  Model_Ekko  Variant_22A  Condition_New   
0  2022.0  10000.0             True        True         True          False  \

   Condition_Used  
0            True  
Index(['Year', 'Miles', 'Brand_Winnebago', 'Model_Ekko', 'Variant_22A',
       'Condition_New', 'Condition_Used'],
      dtype='object')
Year               float64
Miles              float64
Brand_Winnebago       bool
Model_Ekko            bool
Variant_22A           bool
Condition_New         bool
Condition_Used        bool
dtype: object


In [148]:
# convert the Year and Miles columns to float
# (both numeric inputs are cast, so the code matches what the comment promises;
# the indicator columns are left untouched)
df = df.astype({'Year': float, 'Miles': float})

In [149]:
# Select the sample's columns in the exact order saved after one-hot encoding
# (columnsAfterOneHot), i.e. the order the model was trained on. The sample is
# built by hand, so this guards against a different key order and raises a
# clear KeyError if an indicator column was forgotten.
price = model.predict(df[columnsAfterOneHot])

#print the predicted price with $ sign
print(f'Predicted Price: ${price[0]:,.2f}')

Predicted Price: $145,915.00


In [150]:
# Run the model over every row of the full feature table.
# NOTE: X contains the rows the model was trained on, so these figures are
# not an out-of-sample estimate.
price = model.predict(X)

# Emit one formatted line per prediction, in row order
for predicted in price:
    print(f'Predicted Price: ${predicted:,.2f}')

Predicted Price: $135,435.00
Predicted Price: $145,900.00
Predicted Price: $147,495.00
Predicted Price: $146,801.00
Predicted Price: $147,495.00
Predicted Price: $145,577.00
Predicted Price: $145,577.00
