# Approach 2
This time I am keeping these columns to improve results: engine, max_power, seats

- This time I have decided to keep some additional columns from the original dataset.
- Initially, I thought these columns are not very useful.
- Since I did not get good accuracy on the first attempt and since these columns are quantitative numeric values, I decided to keep them in the dataset this time.

In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv("dataset/cleaned.csv")
df = df.drop(['Unnamed: 0', 'car_name'], axis=1)
df = pd.get_dummies(df, columns=['seller_type', 'fuel_type', 'transmission_type'], dtype=int)
df.head(2)

Unnamed: 0,vehicle_age,km_driven,mileage,engine,max_power,seats,selling_price,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,fuel_type_CNG,fuel_type_Diesel,fuel_type_Electric,fuel_type_LPG,fuel_type_Petrol,transmission_type_Automatic,transmission_type_Manual
0,9,120000,19.7,796,46.3,5,120000,0,1,0,0,0,0,0,1,0,1
1,5,20000,18.9,1197,82.0,5,550000,0,1,0,0,0,0,0,1,0,1


### Splitting Dataset

In [3]:
X = df.drop(['selling_price'], axis=1).to_numpy()
y = df.iloc[:,6].to_numpy()

In [4]:
# X = X.astype('int64')


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Random Forest Regressor

In [None]:
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

In [12]:
print(rf_model.score(X_train, y_train))
print(rf_model.score(X_test, y_test))

0.9788005650813197
0.9239521186971583


### Linear Regression

In [7]:
# Scaling data for linear regression

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Linear Regression model

linear_model = LinearRegression()
linear_model.fit(X_train_scaled, y_train)

In [13]:
# Metrics

print(f"R2 score on train set:  {linear_model.score(X_train_scaled, y_train)}")
print(f"R2 score on test set: {linear_model.score(X_test_scaled, y_test)} ")

y_pred = linear_model.predict(X_test_scaled)

print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred)}")
print(f"Root Mean Squared Error: {root_mean_squared_error(y_test, y_pred)}")

R2 score on train set:  0.6213545435813935
R2 score on test set: 0.6651094164667595 
Mean Absolute Error: 278351.6282614204
Root Mean Squared Error: 502095.14346199314


### Polynomial Regression

In [14]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures()
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

In [15]:
linear_model_2 = LinearRegression()
linear_model_2.fit(X_train_poly, y_train)

In [17]:
# Metrics

print(f"R2 score on train set:  {linear_model_2.score(X_train_poly, y_train)}")
print(f"R2 score on test set: {linear_model_2.score(X_test_poly, y_test)} ")

y_pred = linear_model_2.predict(X_test_poly)

print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred)}")
print(f"Root Mean Squared Error: {root_mean_squared_error(y_test, y_pred)}")

R2 score on train set:  0.8656277521062425
R2 score on test set: 0.560180478279164 
Mean Absolute Error: 194337.91357969135
Root Mean Squared Error: 575402.5782445535


### Polynomial Regression (Unscaled)

In [18]:
poly = PolynomialFeatures()
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [19]:
linear_model_3 = LinearRegression()
linear_model_3.fit(X_train_poly, y_train)

In [20]:
# Metrics

print(f"R2 score on train set:  {linear_model_3.score(X_train_poly, y_train)}")
print(f"R2 score on test set: {linear_model_3.score(X_test_poly, y_test)} ")

y_pred = linear_model_3.predict(X_test_poly)

print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred)}")
print(f"Root Mean Squared Error: {root_mean_squared_error(y_test, y_pred)}")

R2 score on train set:  0.8654629100933335
R2 score on test set: 0.5575693924454552 
Mean Absolute Error: 194554.17047919432
Root Mean Squared Error: 577108.0530710728
