# With Car_name being binary_encoded

In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv("dataset/cleaned.csv")
df = df.drop(['Unnamed: 0'], axis=1)
df.head(2)

Unnamed: 0,car_name,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000


## Splitting Dataset

In [3]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [4]:
num_features = ["vehicle_age", "km_driven", "mileage", "engine","max_power", "seats"]
onehot_columns = ['seller_type','fuel_type','transmission_type']
binary_columns = ['car_name']

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from category_encoders.binary import BinaryEncoder
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()
binary_transformer = BinaryEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, onehot_columns),
         ("StandardScaler", numeric_transformer, num_features),
        ("BinaryEncoder", binary_transformer, binary_columns)
        
    ]
)

In [5]:
X = preprocessor.fit_transform(X)

In [6]:
print(X.shape)
print(type(X))

(15411, 23)
<class 'numpy.ndarray'>


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Random Forest Regressor

In [8]:
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

In [9]:
print(rf_model.score(X_train, y_train))
print(rf_model.score(X_test, y_test))

0.9820520895909267
0.9310852553906892


## Linear Regression

In [10]:
# Linear Regression model

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

In [11]:
# Metrics

print(f"R2 score on train set:  {linear_model.score(X_train, y_train)}")
print(f"R2 score on test set: {linear_model.score(X_test, y_test)} ")

y_pred = linear_model.predict(X_test)

print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred)}")
print(f"Root Mean Squared Error: {root_mean_squared_error(y_test, y_pred)}")

R2 score on train set:  0.6279718547677425
R2 score on test set: 0.667671661220389 
Mean Absolute Error: 273522.354208716
Root Mean Squared Error: 500170.6927588081


## Polynomial Regression

In [12]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures()
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [13]:
linear_model_2 = LinearRegression()
linear_model_2.fit(X_train_poly, y_train)

In [14]:
# Metrics

print(f"R2 score on train set:  {linear_model_2.score(X_train_poly, y_train)}")
print(f"R2 score on test set: {linear_model_2.score(X_test_poly, y_test)} ")

y_pred = linear_model_2.predict(X_test_poly)

print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred)}")
print(f"Root Mean Squared Error: {root_mean_squared_error(y_test, y_pred)}")

R2 score on train set:  0.8969902440705173
R2 score on test set: 0.5154347721380046 
Mean Absolute Error: 179529.70532382157
Root Mean Squared Error: 603963.47558098


# With original dataset and improved cleaning 

In [36]:
import pandas as pd
import numpy as np

df = pd.read_csv("dataset/cardekho_dataset.csv")
df = df.drop(["Unnamed: 0", "brand", "model"], axis=1)

Capping outliers

In [37]:
def detect_outliers(col):
    # Finding the IQR
    percentile25 = df[col].quantile(0.25)
    percentile75 = df[col].quantile(0.75)
    print('\n ####', col , '####')
    print("percentile25",percentile25)
    print("percentile75",percentile75)
    iqr = percentile75 - percentile25
    upper_limit = percentile75 + 1.5 * iqr
    lower_limit = percentile25 - 1.5 * iqr
    print("Upper limit",upper_limit)
    print("Lower limit",lower_limit)
    df.loc[(df[col]>upper_limit), col]= upper_limit
    df.loc[(df[col]<lower_limit), col]= lower_limit    
    return df

In [38]:
df.head()

Unnamed: 0,car_name,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [51]:
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
discrete_features=[feature for feature in num_features if len(df[feature].unique())<=25]
continuous_features=[feature for feature in num_features if feature not in discrete_features]
print(num_features)
print(discrete_features)
print(continuous_features)

['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats', 'selling_price']
['vehicle_age', 'seats']
['km_driven', 'mileage', 'engine', 'max_power', 'selling_price']


In [None]:
for col in continuous_features:
         detect_outliers(col)


 #### km_driven ####
percentile25 30000.0
percentile75 70000.0
Upper limit 130000.0
Lower limit -30000.0

 #### mileage ####
percentile25 17.0
percentile75 22.7
Upper limit 31.25
Lower limit 8.450000000000001

 #### engine ####
percentile25 1197.0
percentile75 1582.0
Upper limit 2159.5
Lower limit 619.5

 #### max_power ####
percentile25 74.0
percentile75 117.3
Upper limit 182.25
Lower limit 9.050000000000011

 #### selling_price ####
percentile25 385000.0
percentile75 825000.0
Upper limit 1485000.0
Lower limit -275000.0


  df.loc[(df[col]>upper_limit), col]= upper_limit


In [43]:
from sklearn.model_selection import train_test_split
X = df.drop(['selling_price'], axis=1)
y = df['selling_price']

In [44]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
onehot_columns = ['seller_type','fuel_type','transmission_type']
binary_columns = ['car_name']

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from category_encoders.binary import BinaryEncoder
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()
binary_transformer = BinaryEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, onehot_columns),
         ("StandardScaler", numeric_transformer, num_features),
        ("BinaryEncoder", binary_transformer, binary_columns)
        
    ]
)

In [45]:
X= preprocessor.fit_transform(X)

In [46]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((12328, 23), (3083, 23))

In [47]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

In [48]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [49]:
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest Regressor": RandomForestRegressor()
}

In [50]:
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 148767.4017
- Mean Absolute Error: 112822.8151
- R2 Score: 0.8380
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 155478.8345
- Mean Absolute Error: 118417.1508
- R2 Score: 0.8343


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 38904.8687
- Mean Absolute Error: 26117.2892
- R2 Score: 0.9889
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 99114.6958
- Mean Absolute Error: 65851.4665
- R2 Score: 0.9327


