In [1]:
import pandas as pd
import numpy as np

In [2]:
###################### Data Preprocessing ############################

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
###################### Model Development ###############################

from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

In [4]:
################## Accuracy Check #######################
from sklearn.metrics import r2_score

In [5]:
################ Warnings Ingore ############

import warnings
warnings.filterwarnings('ignore')

# Data Import 

In [6]:
data = pd.read_csv('SecondCar.csv')
data.head(2)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,Rating,ExShowroom Price,selling_price
0,Tata Indica Vista Aqua 1.4 TDI,2010,120000,Diesel,Individual,Manual,Second Owner,9.0,106001,100000
1,Tata Nano Lx BSIV,2012,50000,Petrol,Individual,Manual,Second Owner,9.0,108556,100000


# Data Preprocessing

    1. Data duplicates -- remove
    2. Missing value > 75% -->> remove column
    3. Missing value treatment
    4. Unique Value based removal -->> when unique value == 1 or unique value == len(data)
    5. Feature engineering -->> variety reduction
    6. Label Encoding -->> Alphabetical order converted to numbers
    6.5. Correlation -->> 
    

# Step 1. Duplicate Data

In [7]:
print('Before duplicate removal  -->>', len(data))

Before duplicate removal  -->> 4340


In [8]:
data = data.drop_duplicates(keep='first')

In [9]:
print('After duplicate removal -->>', len(data))

After duplicate removal -->> 4340


# Step 2. Missing value > 75%

In [10]:
data.isna().sum()

name                0
year                0
km_driven           0
fuel                3
seller_type         2
transmission        2
owner               3
Rating              4
ExShowroom Price    0
selling_price       0
dtype: int64

In [11]:
missing_value_df = pd.DataFrame(data.isna().sum(), columns=['missing_value_count'])
missing_value_df

Unnamed: 0,missing_value_count
name,0
year,0
km_driven,0
fuel,3
seller_type,2
transmission,2
owner,3
Rating,4
ExShowroom Price,0
selling_price,0


In [12]:
missing_value_df['perc'] = 100*missing_value_df['missing_value_count']/len(data)
missing_value_df

Unnamed: 0,missing_value_count,perc
name,0,0.0
year,0,0.0
km_driven,0,0.0
fuel,3,0.069124
seller_type,2,0.046083
transmission,2,0.046083
owner,3,0.069124
Rating,4,0.092166
ExShowroom Price,0,0.0
selling_price,0,0.0


##### filter dataframe based on threshold value

In [13]:
missing_value_df[missing_value_df['perc']>75]

Unnamed: 0,missing_value_count,perc


##### Store list of columns to be deleted

In [14]:
columns_to_be_deleted = list(missing_value_df[missing_value_df['perc']>75].index)
columns_to_be_deleted

[]

##### Delete columns from original data

In [15]:
data.head(2)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,Rating,ExShowroom Price,selling_price
0,Tata Indica Vista Aqua 1.4 TDI,2010,120000,Diesel,Individual,Manual,Second Owner,9.0,106001,100000
1,Tata Nano Lx BSIV,2012,50000,Petrol,Individual,Manual,Second Owner,9.0,108556,100000


In [16]:
data.drop(columns=columns_to_be_deleted, inplace=True)
data.head(2)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,Rating,ExShowroom Price,selling_price
0,Tata Indica Vista Aqua 1.4 TDI,2010,120000,Diesel,Individual,Manual,Second Owner,9.0,106001,100000
1,Tata Nano Lx BSIV,2012,50000,Petrol,Individual,Manual,Second Owner,9.0,108556,100000


In [17]:
# missing_value_df = pd.DataFrame(data.isna().sum(), columns=['missing_value_count'])
# missing_value_df['perc'] = 100*missing_value_df['missing_value_count']/len(data)
# columns_to_be_deleted = list(missing_value_df[missing_value_df['perc']>75].index)
# data.drop(columns=columns_to_be_deleted, inplace=True)

# Step 3. Missing Value Treatment

In [18]:
data.isna().sum()

name                0
year                0
km_driven           0
fuel                3
seller_type         2
transmission        2
owner               3
Rating              4
ExShowroom Price    0
selling_price       0
dtype: int64

In [19]:
for col in data.columns:
    
    if data[col].dtype == 'object':
#         print(col, '-->>', data[col].dtype, '-->>', data[col].mode()[0])
       data[col]=data[col].fillna(data[col].mode()[0], inplace = True)
    
    if data[col].dtype != 'object':
#         print('*'*40, col, '-->>', data[col].dtype, '-->>', data[col].median())
      data[col]=data[col].fillna(data[col].median(), inplace = True)

In [20]:
data.isna().sum()

name                4340
year                4340
km_driven           4340
fuel                4340
seller_type         4340
transmission        4340
owner               4340
Rating              4340
ExShowroom Price    4340
selling_price       4340
dtype: int64

# Mean, Mode, Median

In [21]:
data['fuel'].mode()[0]

KeyError: 0

In [None]:
data['km_driven'].median()

In [None]:
data['km_driven'].mean()

# Unique Value Check

In [None]:
data['dummy_col'] = 'vinay'
data.head(2)

In [None]:
data['dummy_col'].unique()

In [None]:
for col in data.columns:
    if (data[col].nunique() == 1):
        del data[col]
    elif ((data[col].nunique() == len(data)) & (data[col].dtype == 'object')):
        del data[col]

# Feature Engineering

In [None]:
for col in data.columns:
    print(col, '-->>', data[col].nunique(), '-->>', data[col].dtype)

# Object Column Variety reduction --->> Needs to be done manually

In [None]:
data['name']

In [None]:
data['name'].str.split(" ").str[0]

In [None]:
data['company_name'] = data['name'].str.split(" ").str[0]
data.head()

In [None]:
name = 'Mahendra*Singh*Dhoni'

In [None]:
name.split("*")[2]

In [None]:
data['company_name'].nunique()

# Numerical Columns -->> Automatic feature engineering

In [None]:
round(data.describe(),2)

In [None]:
range_labels = ['new','medium','high','extreme']
range_limits = [0,35000,60000,90000,1000000]

data['km_bins'] = pd.cut(data['km_driven'], labels=range_labels, bins = range_limits)
data.head()

In [None]:
# import plotly.express as px

In [None]:
# fig = px.pie(data, names = 'km_bins', values = 'km_driven')
# fig.show()

In [None]:
for col in data.columns:
    if data[col].nunique()/len(data) > 0.05:
        if data[col].dtype == 'object':
            print('Please perform manual feature engineering for -->>', col)
        else:
            print('Auto Feature Engineering for -->>', col)
            new_col_name = col + '_bin'
            data[new_col_name] = pd.qcut(data[col],
                                         4, 
                                         labels = ['b1','b2','b3','b4'])

In [None]:
data

# Label Encoding

In [None]:
data['fuel'].unique()

In [None]:
LN = LabelEncoder()

In [None]:
data['fuel'] = LN.fit_transform(data['fuel'])

In [None]:
data['fuel'].unique()

In [None]:
data.info()

In [None]:
data

In [None]:
for col in data.columns:
    if (data[col].dtype == 'object') | (data[col].dtype =='category'):
        data[col] = LN.fit_transform(data[col])

In [None]:
data

# Correlation

In [None]:
corr = 100*data.corr()
corr.style.background_gradient(cmap = 'coolwarm')

In [None]:
data.drop(columns=['name','Rating', 'ExShowroom Price'], inplace=True)

In [None]:
corr = 100*data.corr()
corr.style.background_gradient(cmap = 'coolwarm')

# Train Test Split

In [None]:
x = data.drop(columns = 'selling_price')
y = data['selling_price']

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state = 1234)

In [None]:
len(x_train)

In [None]:
len(y_train)

# Model Development

    1. Model Name
    2. Training
    3. Exam - Predicted Answers
    4. Accuracy Check

##### Step 1. Model Declaration

In [None]:
model = LinearRegression()

##### Step 2. Model training (fitting)

In [None]:
model.fit(x_train,y_train)

##### Step 3. Prediction

In [None]:
y_pred = model.predict(x_test)

In [None]:
y_pred

##### Step 4. Accuracy

In [None]:
accuracy = 100* r2_score(y_test, y_pred)
accuracy

# Running all model

In [None]:
models = [LinearRegression(), Lasso(), ElasticNet(), KNeighborsRegressor(),
         DecisionTreeRegressor(), RandomForestRegressor(), AdaBoostRegressor(),
          GradientBoostingRegressor()]

In [None]:
acc_dic = {} 

In [None]:
for model in models:
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    accuracy = 100* r2_score(y_test, y_pred)
    acc_dic[model] = round(accuracy,2)
    

In [None]:
acc_dic