In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV


from sklearn.metrics import mean_squared_error, r2_score

In [2]:
from google.colab import drive

# Locations of the train/test CSV files inside the mounted Google Drive.
train_data_path = '/content/drive/My Drive/train_data.csv'
test_data_path = '/content/drive/My Drive/test_data.csv'

# Mount Google Drive at /content/drive so the paths above can be read.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read the train and test splits from the CSV paths configured earlier.
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Keep only the test rows whose 'model' value also occurs in the training data.
# NOTE(review): the original remark here was just "this was causing error";
# presumably it refers to encoding test rows whose vehicle model never appears
# in the training split -- TODO confirm.
has_known_model = test_data['model'].isin(train_data['model'])
test_data = test_data.loc[has_known_model]

In [4]:
# Names of every object-dtype (string) column in the training data.
# Iterating a DataFrame yields its column labels, in column order.
categorical_features = list(train_data.select_dtypes(include='object'))
print(categorical_features)

['manufacturer', 'model', 'condition', 'cylinders', 'fuel', 'title_status', 'transmission', 'drive', 'type', 'paint_color', 'state']


In [5]:
# Separate the target column ('price') from the predictor columns.
y_train = train_data['price']
X_train = train_data.drop('price', axis='columns')

y_test = test_data['price']
X_test = test_data.drop('price', axis='columns')

# Sanity check: features and target of each split should report the same
# number of rows.
print("Number of rows in X_train: {} ".format(len(X_train)))
print("Number of rows in y_train: {} ".format(len(y_train)))
print("Number of rows in X_test: {} ".format(len(X_test)))
print("Number of rows in y_test: {} ".format(len(y_test)))

# Debug aid (disabled): inspect a single vehicle model in each split.
# print(X_test[X_test['model']=='pulsar'])
# print(X_train[X_train['model']=='pulsar'])

Number of rows in X_train: 252328 
Number of rows in y_train: 252328 
Number of rows in X_test: 60945 
Number of rows in y_test: 60945 


In [6]:
# Preview the first rows of the test features; at this point the categorical
# columns are still raw values (they are encoded in a later cell).
X_test.head()

Unnamed: 0,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,state,lat,long
0,28.0,ford,bronco xlt,good,8,gas,127750.0,clean,automatic,4wd,sedan,white,co,38.819247,-104.839096
1,13.0,chevrolet,traverse,good,6,gas,135362.0,clean,automatic,4wd,sedan,white,mi,43.1824,-84.1122
2,20.0,mercedes-benz,s-class,like new,4,gas,61121.0,clean,automatic,rwd,convertible,black,fl,27.888284,-82.744474
3,5.0,cadillac,cts,good,6,gas,26570.0,clean,other,4wd,sedan,silver,ga,32.51,-84.87
4,6.0,jeep,wrangler,excellent,6,gas,45000.0,clean,automatic,4wd,SUV,blue,ak,64.878089,-148.214529


In [7]:
# Integer-encode every object-dtype (string) feature column.
#
# One LabelEncoder is fitted per column on the TRAINING data only and stored in
# `label_encoders` (column name -> fitted encoder), so that exactly the same
# value -> code mapping is applied to the test data afterwards.
label_encoders = {}

for feature in X_train.columns[X_train.dtypes == 'object']:
    encoder = LabelEncoder()
    X_train[feature] = encoder.fit_transform(X_train[feature])
    label_encoders[feature] = encoder


print(X_train.shape[0])

print(label_encoders)

# Apply the training-time mapping to the test features.
#
# * Iterating over the fitted encoders (instead of over whichever test columns
#   happen to be object-typed) ensures that the columns encoded for training
#   are the ones encoded here, and avoids a KeyError for a column that is
#   object-typed only in the test data.
# * LabelEncoder.transform raises ValueError for a value it did not see during
#   fit. Such values get the sentinel code below instead of aborting the cell;
#   values that were seen during fit receive the same code as before.
UNSEEN_CATEGORY_CODE = -1

for feature, encoder in label_encoders.items():
    is_known = X_test[feature].isin(encoder.classes_).to_numpy()
    codes = np.full(len(X_test), UNSEEN_CATEGORY_CODE, dtype=np.int64)
    codes[is_known] = encoder.transform(X_test.loc[is_known, feature])
    X_test[feature] = codes



252328
{'manufacturer': LabelEncoder(), 'model': LabelEncoder(), 'condition': LabelEncoder(), 'cylinders': LabelEncoder(), 'fuel': LabelEncoder(), 'title_status': LabelEncoder(), 'transmission': LabelEncoder(), 'drive': LabelEncoder(), 'type': LabelEncoder(), 'paint_color': LabelEncoder(), 'state': LabelEncoder()}


In [8]:
# condition_counts =train_data['condition'].value_counts()
# print(condition_counts)

In [9]:
# # # label encoding for vehicle's condition
# condition={'new':1,'like new':2,'excellent':3,'good':4,'fair':5,'salvage':6}

# train_data['condition'] = train_data['condition'].map(condition)

In [10]:
# cylinders_counts =train_data['cylinders'].value_counts()
# print(cylinders_counts)

In [11]:
# # change other to -1
# train_data['cylinders'].replace('other', -1, inplace=True)

# # train_data['cylinders'].dtypes
# train_data['cylinders'] = train_data['cylinders'].astype(int)

In [12]:
# fuel_counts =train_data['fuel'].value_counts()
# print(fuel_counts)

In [13]:
# fuel={'gas':1,'diesel':2,'electric':3,'hybrid':4,'other':-1}

# train_data['fuel'] = train_data['fuel'].map(fuel)

In [14]:
# title_status_count =train_data['title_status'].value_counts()
# print(title_status_count)

In [15]:
# title_status={'clean':1,'rebuilt':2,'salvage':3,'lien':4,'parts only':5,'missing':6}

# train_data['title_status'] = train_data['title_status'].map(title_status)

In [16]:
# transmission_count =train_data['transmission'].value_counts()
# print(transmission_count)

In [17]:
# transmission={'automatic':1,'manual':2,'other':3}

# train_data['transmission'] = train_data['transmission'].map(transmission)

In [18]:
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10]
# }

In [19]:
# Configure the random forest regressor (construction only; it is fitted in a
# later cell):
#   n_estimators=100 -> number of trees in the forest
#   random_state=0   -> fixed seed so repeated runs build the same forest
#   oob_score=True   -> also compute the out-of-bag score during fit
#   n_jobs=-1        -> build the trees in parallel on all available CPU cores
regressor = RandomForestRegressor(n_estimators=100, random_state=0, oob_score=True, n_jobs=-1)

In [20]:
# grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1,n_jobs=-1)
# grid_search.fit(X_train, y_train)

In [21]:
# best_rf_model = grid_search.best_estimator_

# # Evaluate the model on test data
# y_pred = best_rf_model.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

In [22]:
# best_rf_model = grid_search.best_estimator_

In [23]:
# Train the forest on the training features (X_train) against the 'price'
# target (y_train).
regressor.fit(X_train, y_train)

In [24]:

def _print_scores(heading, y_true, y_pred):
    """Print a heading followed by MSE and R-squared; return (mse, r2)."""
    print(heading)
    mse_value = mean_squared_error(y_true, y_pred)
    print(f'Mean Squared Error: {mse_value}')
    r2_value = r2_score(y_true, y_pred)
    print(f'R-squared: {r2_value}')
    return mse_value, r2_value


# Identify the estimator being evaluated.
model_name = regressor.__class__.__name__
print(model_name)

# Out-of-bag score computed by the forest while it was fitted.
oob_score = regressor.oob_score_
print(f'Out-of-Bag Score: {oob_score}')

# Scores on the data the model was trained on.
predictions = regressor.predict(X_train)
mse, r2 = _print_scores('\nTraining Scores', y_train, predictions)

# Scores on the held-out test data; `mse` and `r2` end up holding these values.
test_predictions = regressor.predict(X_test)
mse, r2 = _print_scores('\nTest Scores', y_test, test_predictions)

RandomForestRegressor
Out-of-Bag Score: -0.14810196375141405

Training Scores
Mean Squared Error: 29122340312979.63
R-squared: 0.8509378864608771

Test Scores
Mean Squared Error: 257629183514787.06
R-squared: -0.12232149905260559
