In [9]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

warnings.filterwarnings("ignore")
%matplotlib inline

In [10]:
df = pd.read_csv("cardekho_imputated.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,min_cost_price,max_cost_price,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,357003.9,465401.5,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,711000.0,748000.0,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,854082.9,1307926.0,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,357003.9,465401.5,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,1014000.0,1379000.0,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [11]:
df.isnull().sum()

Unnamed: 0           0
car_name             0
brand                0
model                0
min_cost_price       0
max_cost_price       0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [12]:
for value in df.columns:
    print(df[value].value_counts())

Unnamed: 0
0        1
13208    1
13103    1
13105    1
13106    1
        ..
6527     1
6528     1
6529     1
6530     1
19543    1
Name: count, Length: 15411, dtype: int64
car_name
Hyundai i20           906
Maruti Swift Dzire    890
Maruti Swift          781
Maruti Alto           778
Honda City            757
                     ... 
Mercedes-AMG C          1
Tata Altroz             1
Ferrari GTC4Lusso       1
Hyundai Aura            1
Force Gurkha            1
Name: count, Length: 121, dtype: int64
brand
Maruti           4992
Hyundai          2982
Honda            1485
Mahindra         1011
Toyota            793
Ford              790
Volkswagen        620
Renault           536
BMW               439
Tata              430
Mercedes-Benz     337
Skoda             334
Audi              192
Datsun            170
Jaguar             59
Land Rover         51
Jeep               41
Kia                32
Porsche            21
Volvo              20
MG                 19
Mini               17
Nis

In [13]:
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print(len(num_features))
cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print(len(cat_features))
continuos_features = [feature for feature in num_features if len(df[feature].unique()) <= 25]
print(len(continuos_features))
discrete_features = [feature for feature in num_features if len(df[feature].unique()) >= 25]
print(len(discrete_features))

10
6
2
8


In [86]:
X = df.drop(['selling_price','car_name','brand'],axis=1) # input variable
y = df['selling_price'] # target variable

In [87]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X['model'] = le.fit_transform(X['model'])

In [88]:
num_features = X.select_dtypes(exclude='object').columns
oh_columns = ['seller_type','fuel_type','transmission_type']
from sklearn.preprocessing import OneHotEncoder , StandardScaler 
from sklearn.compose import ColumnTransformer

In [89]:
numeric_transformer = StandardScaler()
lb_transformer = LabelEncoder()
oh_transformer = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoding",oh_transformer,oh_columns),
        ("StandardScaler",numeric_transformer,num_features),
    ],remainder='passthrough'
)

In [90]:
X = preprocessor.fit_transform(X)

In [91]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20,random_state=42)

In [111]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression , Ridge , Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score , classification_report , confusion_matrix,precision_score,recall_score, f1_score , roc_auc_score,mean_absolute_error , mean_squared_error,r2_score

In [112]:
def evaluate_model(true, predicted):
        model_mse = mean_squared_error(true, predicted)
        model_mae = mean_absolute_error(true, predicted)
        model_r2 = r2_score(true, predicted)
        return model_mse, model_mae, model_r2

In [114]:
models = {
    "Linear Regression":LinearRegression(),
    # "Lasso":Lasso(),
    # "Ridge":Ridge(),
    # "K-Neighbours Regression":KNeighborsRegressor(),
    # "Decision Tree":DecisionTreeRegressor(),
    # "Random Forest Regressor":RandomForestRegressor()
}
for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(X_train,y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    print(list(models.keys())[i],":")
    print(evaluate_model(y_train , y_train_pred))
    print(evaluate_model(y_test , y_test_pred))

Linear Regression :
(261726429353.7832, 226071.44623235462, 0.6772932137115251)
(232187511351.65002, 234604.39846851205, 0.6915606696561969)


In [None]:
knn_params = {"n_neighbors":{2,3,10,20,40,50}}
rf_params = {"max_depth":[5,8,15,None,10],
             "max_features":{5,7,"auto",8},
             "min_sample_split":[2,8,15,20],
             "n_estimators":{100,200,500,100}
             }

In [None]:
randomcv_models = [("KNN",KNeighborsRegressor(),knn_params),
                   ("RF",RandomForestRegressor(),rf_params)
                   ]

In [18]:
from sklearn.model_selection import RandomizedSearchCV

model_params = {}
for name,model,params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                param_distributions=params,
                                n_iter=100,
                                cv=3,
                                verbose=2,
                                n_jobs=-1
                                )
    random.fit(X_train , y_train)
    model_params[name] = random.best_params_
    
for model_name in model_params:
    print(model_params[model_name])

NameError: name 'randomcv_models' is not defined

In [None]:
models = {"Random Forest Regressor":RandomForestRegressor(n_estimators=100,min_samples_split=2,max_features="auto",max_depth=None,n_jobs=-1),"K-Neighbours Regressor":KNeighborsRegressor(n_neighbors=10,n_jobs=-1)}
for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(X_train,y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    print(list(models.keys())[i],":")
    print(evaluate_model(X_train , y_train_pred))
    print(evaluate_model(X_test , y_test_pred))