## Importing the required modules

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, OrdinalEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score

from typing import Tuple

## Reading the data from .csv file to the arrays of numpy

In [3]:
df = pd.read_csv("../data/Car details v3.csv")

In [4]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [5]:
df_copy = df.copy()

## Klausimai:
### Ar gera mintis pašalinti simbolius paliekant tik skačius?
### Kaip reikėtų elgtis su 'torque' stulpelio duomenimis?

In [6]:
#Removing extra symbols and leaving just numbers

df_copy['engine'] = df_copy['engine'].str.replace('CC', '')
df_copy['mileage'] = df_copy['mileage'].str.replace('kmpl', '')
df_copy['mileage'] = df_copy['mileage'].str.replace('km/kg', '')
df_copy['max_power'] = df_copy['max_power'].str.replace('bhp', '')
df_copy = df_copy.drop('torque', axis='columns')
df_copy.head(10)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497,78.0,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396,90.0,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298,88.2,5.0
5,Hyundai Xcent 1.2 VTVT E Plus,2017,440000,45000,Petrol,Individual,Manual,First Owner,20.14,1197,81.86,5.0
6,Maruti Wagon R LXI DUO BSIII,2007,96000,175000,LPG,Individual,Manual,First Owner,17.3,1061,57.5,5.0
7,Maruti 800 DX BSII,2001,45000,5000,Petrol,Individual,Manual,Second Owner,16.1,796,37.0,4.0
8,Toyota Etios VXD,2011,350000,90000,Diesel,Individual,Manual,First Owner,23.59,1364,67.1,5.0
9,Ford Figo Diesel Celebration Edition,2013,200000,169000,Diesel,Individual,Manual,First Owner,20.0,1399,68.1,5.0


## KLausimai:
### Kokia taktika renkantis kaip užpildyti na laukus?

In [7]:
#Replacing na values with the mean of the column

df_copy['mileage'].fillna(df_copy['mileage'].mean, inplace=True)
df_copy['engine'].fillna(df_copy['engine'].mean, inplace=True)
df_copy['max_power'].fillna(df_copy['max_power'].mean, inplace=True)
df_copy['seats'].fillna(df_copy['seats'].mean, inplace=True)
df_copy.isna().sum()
df_copy.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497,78.0,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396,90.0,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298,88.2,5.0


In [8]:
#.csv data reading function
def load_dataset(df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]: 
    dataset = df.values
    X = np.concatenate((dataset[: , 0:2], dataset[: , 3:]), axis=1)
    y = dataset[:,2]
    X = X.astype(str)
    y = y.reshape(len(y), 1)
    return X, y

X, y = load_dataset(df_copy)

## Splitting into a test and a train dataset

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)

Train (5445, 11) (5445, 1)
Test (2683, 11) (2683, 1)


## Encoding categorical values

## Klausimai:
### Kaip enkodinti tik kai kuriuos stulpelius?
### OrdinalEncoder ir OneHotEncoder skirtumai

In [12]:
#Using OrdinalEncoder to encode categorical values to int values
def encode_inputs_by_oe(X_train: np.ndarray, X_test: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    oe.fit(X_train)
    X_train_enc = oe.transform(X_train)
    X_test_enc = oe.transform(X_test)
    return X_train_enc, X_test_enc

X_train_oe_enc, X_test_oe_enc = encode_inputs_by_oe(X_train, X_test)

In [20]:
X_train_oe_enc[1]

array([154.,  20.,  51.,   3.,   1.,   1.,   0.,  71.,  42., 292.,   4.])

In [138]:
#Using OneHotEncoder to encode categorical values into binary vectors
def encode_inputs_by_ohe(X_train: np.ndarray, X_test: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    ohe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    ohe.fit(X_train)
    X_train_enc = ohe.transform(X_train)
    X_test_enc = ohe.transform(X_test)
    return X_train_enc, X_test_enc

X_train_ohe_enc, X_test_ohe_enc = encode_inputs_by_ohe(X_train, X_test)

In [141]:
X_train_ohe_enc[1:5]

array([[1.540e+02, 2.000e+01, 5.100e+01, 3.000e+00, 1.000e+00, 1.000e+00,
        0.000e+00, 7.100e+01, 4.200e+01, 2.920e+02, 4.000e+00],
       [1.174e+03, 1.800e+01, 6.760e+02, 3.000e+00, 1.000e+00, 1.000e+00,
        4.000e+00, 1.890e+02, 1.100e+02, 1.990e+02, 4.000e+00],
       [4.180e+02, 1.900e+01, 5.320e+02, 3.000e+00, 1.000e+00, 1.000e+00,
        0.000e+00, 2.560e+02, 1.050e+02, 1.680e+02, 4.000e+00],
       [6.190e+02, 2.200e+01, 3.340e+02, 3.000e+00, 1.000e+00, 1.000e+00,
        0.000e+00, 1.480e+02, 1.000e+01, 2.380e+02, 4.000e+00]])

## Scaling feature columns

In [14]:
def scale_inputs(X_train: np.ndarray, X_test: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    ss = StandardScaler()
    ss.fit(X_train)
    X_train_scaled = ss.transform(X_train)
    X_test_scaled = ss.transform(X_test)
    return X_train_scaled, X_test_scaled

X_train_scaled, X_test_scaled = scale_inputs(X_train_oe_enc, X_test_oe_enc)

In [15]:
X_train_scaled[1:5]

array([[-1.56155366, -0.19509214, -1.31631429,  1.1089115 ,  0.27545946,
         0.39316683, -0.66540842, -1.45065126, -0.12246575,  1.35754177,
        -0.43181166],
       [ 0.62534386, -0.6883713 ,  1.54060941,  1.1089115 ,  0.27545946,
         0.39316683,  2.59281895, -0.19819327,  1.70777313,  0.2561118 ,
        -0.43181166],
       [-0.99553313, -0.44173172,  0.88237419,  1.1089115 ,  0.27545946,
         0.39316683, -0.66540842,  0.51294813,  1.57319675, -0.11103152,
        -0.43181166],
       [-0.56458568,  0.29818703, -0.02269924,  1.1089115 ,  0.27545946,
         0.39316683, -0.66540842, -0.63336936, -0.98375464,  0.71800179,
        -0.43181166]])

## Creating LinearRegression model

In [16]:
lreg = LinearRegression()
lreg.fit(X_train_scaled, y_train)
print(lreg.score(X_test_scaled, y_test) * 100)


52.85959040759516


In [18]:
y_pred = lreg.predict(X_test_scaled)

In [19]:
print("MAE", mean_absolute_error(y_test, y_pred))
print("MSE", mean_squared_error(y_test, y_pred))

MAE 312434.8609435276
MSE 289562312433.9383
