In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
import math
from sklearn.metrics import mean_squared_error

In [2]:
training_data = pd.read_csv('Training_Data_Set.csv')
test_data = pd.read_csv('Test_Data_Set.csv')

In [3]:
training_data.columns = training_data.columns.str.strip()
test_data.columns = test_data.columns.str.strip()

In [4]:
training_data.isnull().sum()
test_data.isnull().sum()

Id                        0
Maker                     0
model                     0
Location                  0
Distance                128
Owner Type                0
manufacture_year          0
Age of car                0
engine_displacement       0
engine_power            168
body_type              5193
Vroom Audit Rating        0
transmission              0
door_count              806
seat_count              891
fuel_type                 0
dtype: int64

In [5]:
avg_dis_by_age = training_data.groupby('Age of car')['Distance'].mean().reset_index()
avg_dis_by_age.columns = ['Age of car','Average_distance'] 
training_data = training_data.merge(avg_dis_by_age, on='Age of car', how='left')
training_data['Distance'].fillna(training_data['Average_distance'], inplace=True)
training_data.drop(columns=['Average_distance'], inplace=True)

test_avg_dis_by_age = test_data.groupby('Age of car')['Distance'].mean().reset_index()
test_avg_dis_by_age.columns = ['Age of car','Average_distance'] 
test_data = test_data.merge(avg_dis_by_age, on='Age of car', how='left')
test_data['Distance'].fillna(test_data['Average_distance'], inplace=True)
test_data.drop(columns=['Average_distance'], inplace=True)

In [6]:
training_data.dropna(subset = ['Distance'], inplace= True)
test_data.dropna(subset = ['Distance'], inplace= True)

In [7]:
training_mean_engine_power = training_data.groupby(['Maker','model'])['engine_power'].transform('mean')
training_data['engine_power'].fillna(training_mean_engine_power, inplace=True)
training_data.isnull().sum()

test_mean_engine_power = test_data.groupby(['Maker','model'])['engine_power'].transform('mean')
test_data['engine_power'].fillna(test_mean_engine_power, inplace=True)
test_data.isnull().sum()

Id                        0
Maker                     0
model                     0
Location                  0
Distance                  0
Owner Type                0
manufacture_year          0
Age of car                0
engine_displacement       0
engine_power              0
body_type              5193
Vroom Audit Rating        0
transmission              0
door_count              805
seat_count              890
fuel_type                 0
dtype: int64

In [8]:
training_door_count_filler = training_data.groupby(['Maker','model'])['door_count'].transform('mean')
training_data['door_count'].fillna(training_door_count_filler, inplace=True)
training_data.isnull().sum()

test_door_count_filler = test_data.groupby(['Maker','model'])['door_count'].transform('mean')
test_data['door_count'].fillna(test_door_count_filler, inplace=True)
test_data.isnull().sum()

Id                        0
Maker                     0
model                     0
Location                  0
Distance                  0
Owner Type                0
manufacture_year          0
Age of car                0
engine_displacement       0
engine_power              0
body_type              5193
Vroom Audit Rating        0
transmission              0
door_count                0
seat_count              890
fuel_type                 0
dtype: int64

In [9]:
training_seat_count_filler = training_data.groupby(['Maker','model'])['seat_count'].transform('mean')
training_data['seat_count'].fillna(training_seat_count_filler, inplace=True)
training_data.isnull().sum()

test_seat_count_filler = test_data.groupby(['Maker','model'])['seat_count'].transform('mean')
test_data['seat_count'].fillna(test_seat_count_filler, inplace=True)
test_data.isnull().sum()

Id                        0
Maker                     0
model                     0
Location                  0
Distance                  0
Owner Type                0
manufacture_year          0
Age of car                0
engine_displacement       0
engine_power              0
body_type              5193
Vroom Audit Rating        0
transmission              0
door_count                0
seat_count                0
fuel_type                 0
dtype: int64

In [10]:
col_to_code = ['Maker','model','Location','Owner Type','transmission','body_type','fuel_type']
label_encoder = LabelEncoder()

for column in col_to_code:
    training_data[column] = label_encoder.fit_transform(training_data[column])

training_data.head(20)

Unnamed: 0,Id,Maker,model,Location,Distance,Owner Type,manufacture_year,Age of car,engine_displacement,engine_power,body_type,Vroom Audit Rating,transmission,door_count,seat_count,fuel_type,Price
1,25002,2,9,0,27750.0,3,2012,7,1242,51.0,2,6,1,4.0,4.0,1,401819.25
2,25003,1,18,5,46000.0,3,2014,5,1995,105.0,2,7,0,4.0,5.0,0,2392855.5
3,25004,5,6,9,43949.0,3,2011,8,1618,140.0,2,7,1,4.0,5.0,1,958606.5
4,25005,1,20,6,59524.0,1,2012,7,2993,180.0,2,7,0,4.0,5.0,0,3085561.5
5,25006,6,8,2,12015.0,0,2015,4,1968,110.0,2,4,1,4.0,5.0,0,1543728.75
6,25007,6,16,3,181000.0,1,2009,10,1968,125.0,2,6,0,5.0,5.0,0,915985.5
7,25008,2,9,2,33100.0,2,2010,9,1108,40.0,2,5,1,5.0,5.0,1,327535.5
8,25009,5,13,3,17375.0,0,2015,4,1600,96.0,2,5,1,4.0,5.0,0,1361480.25
9,25010,6,16,9,97640.0,1,2010,9,2000,103.0,2,6,1,5.0,5.0,0,885455.25
10,25011,6,16,7,208000.0,0,2010,9,1800,118.0,2,4,1,4.284274,4.999584,1,721687.5


In [11]:
col_to_code = ['Maker','model','Location','Owner Type','transmission','body_type','fuel_type']
label_encoder = LabelEncoder()

for column in col_to_code:
    test_data[column] = label_encoder.fit_transform(test_data[column])

test_data.head(20)

Unnamed: 0,Id,Maker,model,Location,Distance,Owner Type,manufacture_year,Age of car,engine_displacement,engine_power,body_type,Vroom Audit Rating,transmission,door_count,seat_count,fuel_type
0,11001,6,8,9,150000.0,0,2007,12,1595,75.0,2,5,1,4.0,5.0,1
1,11002,6,14,3,29376.0,3,2014,5,1598,77.0,2,4,1,4.0,5.0,0
2,11003,6,8,3,30563.0,2,2014,5,1968,110.0,2,5,1,5.0,5.0,0
3,11004,0,10,4,8650.0,1,2015,4,1968,110.0,2,8,0,4.0,5.0,0
4,11005,0,17,2,6400.0,3,2015,4,1984,169.0,2,4,0,2.0,2.810127,1
5,11006,0,17,4,3000.0,3,2015,4,1968,135.0,2,8,1,2.0,2.0,0
6,11007,1,19,2,10.0,1,2015,4,2979,20.0,2,5,0,4.028455,5.0,1
7,11008,0,10,1,18000.0,3,2014,5,1968,130.0,2,6,0,4.0,5.0,0
8,11009,6,8,1,270.0,1,2006,13,2000,103.0,0,5,1,4.476098,4.997947,1
9,11010,7,0,4,3000.0,3,2015,4,1197,85.0,2,5,1,4.0,5.0,1


In [12]:
training_data.corr()

Unnamed: 0,Id,Maker,model,Location,Distance,Owner Type,manufacture_year,Age of car,engine_displacement,engine_power,body_type,Vroom Audit Rating,transmission,door_count,seat_count,fuel_type,Price
Id,1.0,0.001615,-0.001927,0.006339,-0.000225,0.003154,0.000158,-0.000158,0.0037,0.002728,0.006579,0.005617,-0.005695,-0.007645,0.001013,0.00207,0.003074
Maker,0.001615,1.0,-0.219529,0.004917,0.045041,-0.001996,-0.069026,0.069026,-0.195933,-0.584883,-0.14739,-0.004522,0.379316,0.253499,0.136472,0.246082,-0.568787
model,-0.001927,-0.219529,1.0,-0.002753,-0.010055,0.005092,0.064304,-0.064304,0.092363,0.224918,0.038468,0.00565,-0.220076,0.060911,0.169947,-0.192545,0.238964
Location,0.006339,0.004917,-0.002753,1.0,0.00463,0.007033,-0.004326,0.004326,0.004378,-0.001176,-0.00418,-0.000948,0.010257,-0.00536,0.004674,0.003788,-0.003656
Distance,-0.000225,0.045041,-0.010055,0.00463,1.0,-0.004085,-0.231917,0.231917,0.098698,0.007896,-0.223962,-0.006398,0.01679,0.060035,0.027702,-0.009578,-0.162677
Owner Type,0.003154,-0.001996,0.005092,0.007033,-0.004085,1.0,-0.005155,0.005155,0.002343,0.004117,-0.007539,0.000553,-0.001039,0.006532,0.00588,-0.006564,-0.003143
manufacture_year,0.000158,-0.069026,0.064304,-0.004326,-0.231917,-0.005155,1.0,-1.0,-0.091431,0.051982,0.24901,0.003433,-0.131346,0.088486,0.048654,-0.111782,0.510986
Age of car,-0.000158,0.069026,-0.064304,0.004326,0.231917,0.005155,-1.0,1.0,0.091431,-0.051982,-0.24901,-0.003433,0.131346,-0.088486,-0.048654,0.111782,-0.510986
engine_displacement,0.0037,-0.195933,0.092363,0.004378,0.098698,0.002343,-0.091431,0.091431,1.0,0.322559,-0.259227,-0.00583,-0.207744,0.043157,0.084838,-0.125757,0.162081
engine_power,0.002728,-0.584883,0.224918,-0.001176,0.007896,0.004117,0.051982,-0.051982,0.322559,1.0,0.053322,-0.001302,-0.507751,-0.088854,0.053876,-0.290576,0.625408


In [13]:
test_data.corr()

Unnamed: 0,Id,Maker,model,Location,Distance,Owner Type,manufacture_year,Age of car,engine_displacement,engine_power,body_type,Vroom Audit Rating,transmission,door_count,seat_count,fuel_type
Id,1.0,0.000836,-0.015273,0.014268,-0.00027,-0.003398,0.019269,-0.019269,0.006504,0.005566,0.000597,-0.005915,0.002167,0.021598,0.000744,-0.003757
Maker,0.000836,1.0,-0.217939,0.008921,0.049459,-0.002622,-0.073526,0.073526,-0.179544,-0.553681,-0.150946,-0.016831,0.361188,0.262157,0.157174,0.24493
model,-0.015273,-0.217939,1.0,0.002406,-0.021415,0.025842,0.086786,-0.086786,0.08819,0.22491,0.030814,0.012794,-0.233795,0.037913,0.129352,-0.188231
Location,0.014268,0.008921,0.002406,1.0,-0.002063,0.037389,-0.002505,0.002505,-0.00269,-0.002365,0.007839,-0.012778,0.000471,-0.007943,-0.026402,0.017403
Distance,-0.00027,0.049459,-0.021415,-0.002063,1.0,0.001609,-0.217622,0.217622,0.168568,-0.00383,-0.234631,0.026116,0.034859,0.055081,0.021929,0.004889
Owner Type,-0.003398,-0.002622,0.025842,0.037389,0.001609,1.0,0.014577,-0.014577,-0.014772,0.01174,0.014078,-0.001248,0.016985,0.02228,0.008226,0.008071
manufacture_year,0.019269,-0.073526,0.086786,-0.002505,-0.217622,0.014577,1.0,-1.0,-0.080236,0.05807,0.25973,-0.00427,-0.126984,0.0953,0.062367,-0.131252
Age of car,-0.019269,0.073526,-0.086786,0.002505,0.217622,-0.014577,-1.0,1.0,0.080236,-0.05807,-0.25973,0.00427,0.126984,-0.0953,-0.062367,0.131252
engine_displacement,0.006504,-0.179544,0.08819,-0.00269,0.168568,-0.014772,-0.080236,0.080236,1.0,0.322807,-0.255741,0.009234,-0.191045,0.041424,0.075604,-0.145393
engine_power,0.005566,-0.553681,0.22491,-0.002365,-0.00383,0.01174,0.05807,-0.05807,0.322807,1.0,0.050691,0.006713,-0.491462,-0.084783,0.030614,-0.295531


In [14]:
train_X = (training_data[['Maker','model','Distance', 'Owner Type','manufacture_year',
        'Age of car', 'engine_displacement', 'engine_power', 'transmission', 'door_count',
       'seat_count', 'fuel_type']])

train_y = (training_data['Price'])

In [15]:
X_train, X_test, y_train, y_test=train_test_split(train_X,train_y,test_size=0.2, random_state=1)

In [16]:
reg = LinearRegression()
reg.fit(X_train, y_train)

In [17]:
predicted_price = reg.predict(X_test)

In [18]:
mse = mean_squared_error(y_test, predicted_price)
rmse = math.sqrt(mse)
print("RMSE is : ",rmse)
ac = reg.score(X_test,y_test)
print("Accuracy is :", ac)

RMSE is :  450153.0497289593
Accuracy is : 0.7098686911576344
