In [78]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error


In [4]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [5]:
train.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


In [6]:
print(train.shape)
print(test.shape)

(19237, 18)
(8245, 18)


In [13]:
(train.isna().sum() * 100 / train.shape[0]).sort_values(ascending=False)

Airbags             0.0
Color               0.0
Price               0.0
Levy                0.0
Manufacturer        0.0
Model               0.0
Prod. year          0.0
Category            0.0
Leather interior    0.0
Fuel type           0.0
Engine volume       0.0
Mileage             0.0
Cylinders           0.0
Gear box type       0.0
Drive wheels        0.0
Doors               0.0
Wheel               0.0
ID                  0.0
dtype: float64

In [29]:
train[train['Levy'] == '-'].head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
9,45756839,26657,-,LEXUS,RX 350,2007,Jeep,Yes,Petrol,3.5,128500 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
11,45814819,8781,-,FORD,Transit,1999,Microbus,No,CNG,4,0 km,8.0,Manual,Rear,02-Mar,Left wheel,Blue,0
12,45815568,3000,-,OPEL,Vectra,1997,Goods wagon,No,CNG,1.6,350000 km,4.0,Manual,Front,04-May,Left wheel,White,4
23,45814106,7840,-,FORD,Transit,2001,Microbus,No,Diesel,2.0 Turbo,230000 km,4.0,Manual,Front,02-Mar,Left wheel,White,0


In [44]:
train_cpy = train
train_cpy['Levy'] = train_cpy.Levy.str.replace('-', '0')
train_cpy.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399.0,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018.0,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862.0,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446.0,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


In [45]:
train_cpy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                19237 non-null  int64  
 1   Price             19237 non-null  int64  
 2   Levy              19237 non-null  object 
 3   Manufacturer      19237 non-null  object 
 4   Model             19237 non-null  object 
 5   Prod. year        19237 non-null  int64  
 6   Category          19237 non-null  object 
 7   Leather interior  19237 non-null  object 
 8   Fuel type         19237 non-null  object 
 9   Engine volume     19237 non-null  object 
 10  Mileage           19237 non-null  object 
 11  Cylinders         19237 non-null  float64
 12  Gear box type     19237 non-null  object 
 13  Drive wheels      19237 non-null  object 
 14  Doors             19237 non-null  object 
 15  Wheel             19237 non-null  object 
 16  Color             19237 non-null  object

In [59]:
train_base = train_cpy[['Price', 'Manufacturer', 'Model', 'Category', 'Leather interior', 'Fuel type',
                       'Cylinders', 'Gear box type', 'Color', 'Airbags']]
test_base = test[['Manufacturer', 'Model', 'Category', 'Leather interior', 'Fuel type',
                       'Cylinders', 'Gear box type', 'Color', 'Airbags']]

In [58]:
train_base.head()

Unnamed: 0,Price,Manufacturer,Model,Category,Leather interior,Fuel type,Cylinders,Gear box type,Color,Airbags
0,13328,LEXUS,RX 450,Jeep,Yes,Hybrid,6.0,Automatic,Silver,12
1,16621,CHEVROLET,Equinox,Jeep,No,Petrol,6.0,Tiptronic,Black,8
2,8467,HONDA,FIT,Hatchback,No,Petrol,4.0,Variator,Black,2
3,3607,FORD,Escape,Jeep,Yes,Hybrid,4.0,Automatic,White,0
4,11726,HONDA,FIT,Hatchback,Yes,Petrol,4.0,Automatic,Silver,4


In [64]:
train_cat = train_base.select_dtypes(include='O').keys()
train_cat

Index(['Manufacturer', 'Model', 'Category', 'Leather interior', 'Fuel type',
       'Gear box type', 'Color'],
      dtype='object')

In [67]:
X = train_cpy[['Cylinders', 'Airbags']]

y = train_cpy['Price']

In [70]:
# Split the dataset to train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42)

In [72]:
# Fit the model onto the training dataset X_train = input features, y_train = predictory feature
lm_model = LinearRegression(normalize=True)

lm_model.fit(X_train, y_train)

predictions = lm_model.predict(X_test)

LinearRegression(normalize=True)

In [80]:
np.sqrt(mean_squared_log_error(y_test, y_test_preds))

1.746996078980249

In [83]:
test_cpy = test

test_feature = test_cpy[['Cylinders', 'Airbags']]

test_feature.head()

Unnamed: 0,Cylinders,Airbags
0,4,10
1,4,10
2,4,8
3,6,12
4,4,0


In [95]:
test_predictions = lm_model.predict(test_feature)

test_predictions

array([15461.48646675, 15461.48646675, 17082.58058643, ...,
       29472.12729291, 20324.76882577, 13840.39234708])

In [96]:
output = pd.DataFrame({'Price' : test_predictions})

output

Unnamed: 0,Price
0,15461.486467
1,15461.486467
2,17082.580586
3,16792.977461
4,23566.957065
...,...
8240,18703.674706
8241,18703.674706
8242,29472.127293
8243,20324.768826


In [97]:
output.to_csv('./my_submission_file.csv', index=False)