In [1]:
import numpy as np
import pandas as pd

from tensorflow.keras import utils
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, SpatialDropout1D, BatchNormalization, Embedding, Flatten, Activation, Input, concatenate
from tensorflow.keras.layers import SimpleRNN, GRU, LSTM, Bidirectional, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam, Adadelta, SGD, Adagrad, RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import StandardScaler

import re
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error

import json

# Data

## Download

In [18]:
# Remote source of the japan_cars dataset and the local copy kept next to the notebook.
url = "https://storage.yandexcloud.net/academy.ai/japan_cars_dataset.csv"
local_csv = 'japan_cars_dataset.csv'

# Reuse the local copy when it already exists so that "Restart & Run All" works
# offline and does not re-download the file on every run. Delete the local file
# to force a fresh download.
try:
    df = pd.read_csv(local_csv)
except FileNotFoundError:
    # First run: fetch from the URL and store a copy for later runs.
    df = pd.read_csv(url)
    df.to_csv(local_csv, index=False)

In [19]:
# Remove every row that contains at least one missing value.
df = df.dropna()

# Preview the first ten rows of the cleaned frame.
df.head(10)

Unnamed: 0.1,Unnamed: 0,price,mark,model,year,mileage,engine_capacity,transmission,drive,hand_drive,fuel
0,0,80,nissan,march,2003,80000,1240,at,2wd,rhd,gasoline
1,1,110,nissan,march,2010,53000,1200,at,2wd,rhd,gasoline
2,2,165,nissan,lafesta,2005,47690,2000,at,2wd,rhd,gasoline
3,3,190,toyota,avensis,2008,130661,1990,at,2wd,rhd,gasoline
4,4,190,daihatsu,mira,2006,66300,660,at,2wd,rhd,gasoline
5,5,190,daihatsu,mira,2004,81400,660,at,2wd,rhd,gasoline
6,8,220,nissan,march,2010,117000,1200,at,2wd,rhd,gasoline
7,9,230,volkswagen,passat,2008,127763,3190,at,4wd,rhd,gasoline
8,10,275,mazda,bongo van,2010,178218,1800,mt,2wd,rhd,gasoline
9,11,283,honda,step wgn,2005,121655,2000,at,2wd,rhd,gasoline


In [21]:
# Remove the 'Unnamed: 0' column. Rebinding `df` (instead of inplace=True) together
# with errors='ignore' keeps this cell re-runnable: running it a second time is a
# no-op instead of raising KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,price,mark,model,year,mileage,engine_capacity,transmission,drive,hand_drive,fuel
0,80,nissan,march,2003,80000,1240,at,2wd,rhd,gasoline
1,110,nissan,march,2010,53000,1200,at,2wd,rhd,gasoline
2,165,nissan,lafesta,2005,47690,2000,at,2wd,rhd,gasoline
3,190,toyota,avensis,2008,130661,1990,at,2wd,rhd,gasoline
4,190,daihatsu,mira,2006,66300,660,at,2wd,rhd,gasoline


In [22]:
# Inspect column dtypes and non-null counts of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2318 entries, 0 to 2317
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   price            2318 non-null   int64 
 1   mark             2318 non-null   object
 2   model            2318 non-null   object
 3   year             2318 non-null   int64 
 4   mileage          2318 non-null   int64 
 5   engine_capacity  2318 non-null   int64 
 6   transmission     2318 non-null   object
 7   drive            2318 non-null   object
 8   hand_drive       2318 non-null   object
 9   fuel             2318 non-null   object
dtypes: int64(4), object(6)
memory usage: 181.2+ KB


## Preprocessing

In [23]:
# Summary statistics of the numeric columns, used to spot implausible extremes.
df[['price', 'year', 'mileage', 'engine_capacity']].describe()

Unnamed: 0,price,year,mileage,engine_capacity
count,2318.0,2318.0,2318.0,2318.0
mean,971.522433,2005.97239,100013.194996,1507.010785
std,288.673112,3.698863,52512.478883,549.58517
min,80.0,1979.0,2000.0,9.0
25%,776.0,2004.0,67000.0,1300.0
50%,1000.0,2006.0,94000.0,1490.0
75%,1213.0,2009.0,124000.0,1800.0
max,1400.0,2015.0,790000.0,12340.0


In [24]:
# Keep only rows whose numeric fields fall inside the accepted ranges:
# year from 1980 on, mileage up to 500,000, engine capacity within 500..7000
# (both bounds inclusive, which is what Series.between does by default).
in_range = (
    (df['year'] >= 1980)
    & (df['mileage'] <= 500_000)
    & df['engine_capacity'].between(500, 7000)
)

# Filter once and add the log-transformed price via .assign, which returns a new
# frame. Assigning a column directly onto the result of chained boolean filtering
# can trigger pandas' SettingWithCopyWarning; this form avoids it.
# The original row index is intentionally kept (no reset_index).
df = df.loc[in_range].assign(price_log=lambda frame: np.log1p(frame['price']))

In [25]:
# Re-check the numeric summary after the range filters have been applied.
df[['price', 'year', 'mileage', 'engine_capacity']].describe()

Unnamed: 0,price,year,mileage,engine_capacity
count,2312.0,2312.0,2312.0,2312.0
mean,971.359429,2005.977509,99104.112889,1501.778547
std,288.42284,3.657374,47255.334275,496.02551
min,80.0,1982.0,2000.0,550.0
25%,775.0,2004.0,67000.0,1300.0
50%,1000.0,2006.0,94000.0,1490.0
75%,1213.0,2009.0,124000.0,1800.0
max,1400.0,2015.0,497408.0,4500.0


In [26]:
# Confirm the remaining row count and the presence of the new price_log column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2312 entries, 0 to 2317
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   price            2312 non-null   int64  
 1   mark             2312 non-null   object 
 2   model            2312 non-null   object 
 3   year             2312 non-null   int64  
 4   mileage          2312 non-null   int64  
 5   engine_capacity  2312 non-null   int64  
 6   transmission     2312 non-null   object 
 7   drive            2312 non-null   object 
 8   hand_drive       2312 non-null   object 
 9   fuel             2312 non-null   object 
 10  price_log        2312 non-null   float64
dtypes: float64(1), int64(4), object(6)
memory usage: 216.8+ KB


In [27]:
# Names of the categorical (string-valued) columns.
cat_features = [
    'mark',
    'model',
    'transmission',
    'drive',
    'hand_drive',
    'fuel',
]

In [28]:
# Collapse infrequent car models into a single 'other' bucket: any model that
# occurs fewer than 5 times is relabelled.
model_counts = df['model'].value_counts()
rare_models = model_counts.index[(model_counts < 5).to_numpy()]
df['model'] = df['model'].mask(df['model'].isin(rare_models), 'other')

In [29]:
# One-hot encode the categorical columns. drop_first=True omits the first level
# of every feature, so each feature yields (number of levels - 1) indicator columns.
# Columns not listed in cat_features are carried over unchanged.
df_encoded = pd.get_dummies(df, columns=cat_features, drop_first=True)

In [31]:
# Names of the numeric input columns.
num_features = [
    'year',
    'mileage',
    'engine_capacity',
]

In [32]:
# Standardise the numeric columns. The fitted scaler is kept in `scaler` so the
# same transformation can be applied again later.
# NOTE(review): the scaler is fitted on the whole frame; if a train/test split
# follows, consider fitting on the training rows only — confirm against later cells.
scaler = StandardScaler().fit(df_encoded[num_features])
df_encoded[num_features] = scaler.transform(df_encoded[num_features])

In [33]:
# Preview the encoded frame after the numeric columns have been scaled.
df_encoded.head()

Unnamed: 0,price,year,mileage,engine_capacity,price_log,mark_bmw,mark_chrysler,mark_citroen,mark_daihatsu,mark_ford,...,model_x-trail,transmission_cvt,transmission_mt,drive_4wd,drive_awd,hand_drive_rhd,fuel_diesel,fuel_gasoline,fuel_hybrid,fuel_lpg
0,80,-0.814287,-0.404362,-0.527866,4.394449,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
1,110,1.100068,-0.975849,-0.608525,4.70953,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
2,165,-0.267328,-1.088242,1.004644,5.111988,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
3,190,0.55311,0.66794,0.98448,5.252273,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
4,190,0.006151,-0.694339,-1.697414,5.252273,False,False,False,True,False,...,False,False,False,False,False,True,False,True,False,False
