In [None]:
import numpy as np
import pandas as pd


In [None]:
# Load the raw car listings from the CSV file into df_raw.
df_raw = pd.read_csv('true_car_listings.csv')

In [None]:
# Preview the first rows of the raw listings.
df_raw.head()

In [None]:
# Column dtypes and non-null counts of the raw listings.
df_raw.info()

In [None]:
# Summary statistics of the raw listings.
df_raw.describe()

## EDA and Data Preprocessing

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

Let's take a quick look at the data.

In [None]:
# One 100-bin histogram per column of the raw data, laid out side by side.
plt.figure(figsize=(20, 8))
for panel, column in enumerate(['Price', 'Mileage', 'Year'], start=1):
    plt.subplot(1, 3, panel)
    plt.hist(df_raw[column], bins=100)
    plt.title(column)


In [None]:
# Number of listings per state; bars follow the order returned by value_counts().
state_order = df_raw['State'].value_counts().index
plt.figure(figsize=(18, 8))
sns.countplot(x=df_raw['State'], order=state_order)

We see that there are outliers in the data. Let's keep only the rows whose price and mileage both lie within 3 standard deviations of the mean.

In [None]:
from scipy import stats

# A row is kept only when the absolute z-scores of BOTH Price and Mileage
# are below the `std_dev` limit.
std_dev = 3
abs_z_scores = np.abs(stats.zscore(df_raw[['Price', 'Mileage']]))
rows_within_limit = (abs_z_scores < float(std_dev)).all(axis=1)
df_clean = df_raw[rows_within_limit]

In [None]:
# Row count and dtypes after the outlier filter.
df_clean.info()

Now let's check the data for missing values.

In [None]:
# True if any cell of df_clean is missing.
df_clean.isna().to_numpy().any()

Let's check the distributions once again.

In [None]:
# Same three 100-bin histograms as before, now drawn from the filtered data.
plt.figure(figsize=(20, 8))
histogram_columns = ('Price', 'Mileage', 'Year')
for slot, name in zip(range(1, 4), histogram_columns):
    plt.subplot(1, 3, slot)
    plt.hist(df_clean[name], bins=100)
    plt.title(name)

Now the distributions look much better.

In [None]:
# Number of distinct city names.
# NOTE(review): this counts cities in the unfiltered df_raw, not in df_clean.
df_raw['City'].nunique()

We have 2553 unique cities, which is quite a lot. We need to encode them in a way that does not hurt training. To do this, let's get population information for the cities from data.world and merge it into our dataframe.

In [None]:
# Load the city population lookup table and drop its 'State' column, so it
# is matched to the listings on the city name alone.
us_cities = pd.read_excel('US City Populations.xlsx').drop(columns='State')
# sort_values returns a NEW frame, so the result must be assigned; otherwise
# the table silently stays in file order. With the most populous rows first,
# a later keep='first' de-duplication on 'City' retains the largest city
# among those sharing a name.
us_cities = us_cities.sort_values('Population', ascending=False)
us_cities.head()

In [None]:
# Keep a single row per city name (the first one in the current row order).
us_cities = us_cities.drop_duplicates(subset=['City'], keep='first')
us_cities.info()

In [None]:
# Left join: every listing is kept; cities missing from us_cities get NaN
# in the columns coming from that table.
df_clean = pd.merge(df_clean, us_cities, how='left', on='City')
df_clean.head()

In [None]:
# How many listings have (True) / lack (False) a missing Population after the merge.
df_clean.Population.isna().value_counts()

In [None]:
# Impute missing populations with the median population.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute-accessed Series: that form is chained
# assignment, which emits a FutureWarning in recent pandas and has no effect
# under Copy-on-Write, leaving the NaNs in place.
df_clean['Population'] = df_clean['Population'].fillna(df_clean['Population'].median())

In [None]:
# Re-count missing Population values to confirm the imputation.
df_clean.Population.isna().value_counts()

Now let's encode the city column as 1 for cities with a population above 50,000 and 0 otherwise.

In [None]:
# Replace the city name with a binary flag: 1 when Population > 50000, else 0.
is_large_city = df_clean['Population'] > 50000
df_clean['City'] = np.where(is_large_city, 1, 0)
# Drop the Vin and Population columns.
df_clean = df_clean.drop(columns=['Vin', 'Population'])
df_clean.head()

Let's now look for correlations between the numeric variables. It is better to pick one particular car model for that, so the plots do not become a mess. But first, we need to clean up the names.

In [None]:
# Normalise state values to lower case.
df_clean['State'] = df_clean['State'].str.lower()

In [None]:
# Normalise make names to lower case.
df_clean['Make'] = df_clean['Make'].str.lower()

In [None]:
# Normalise model names: lower case, then strip spaces and hyphens
# (both are removed as literal characters).
df_clean['Model'] = (
    df_clean['Model']
    .str.lower()
    .str.replace(' ', '', regex=False)
    .str.replace('-', '', regex=False)
)

In [None]:
# Subset used for the correlation plots: rows with make 'honda' and model 'accord'.
is_honda_accord = df_clean['Make'].eq('honda') & df_clean['Model'].eq('accord')
honda = df_clean.loc[is_honda_accord]

In [None]:
# Price against mileage for the honda subset.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(data=honda, x='Mileage', y='Price', marker='x', ax=ax)

In [None]:
# Price against model year for the honda subset.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(data=honda, x='Year', y='Price', marker='x', ax=ax)

We see that both relationships are clearly close to linear.

## Categorical data encoding and splitting for train and test

Let's encode the categorical data (state, make and model).


There are a lot of unique values in the categorical features, so one-hot encoding would create a huge dataframe. We had better use label encoding instead. It is not ideal for linear regression, but tree-based models can handle it.

In [None]:
from sklearn.preprocessing import LabelEncoder
# A single LabelEncoder instance, reused for each categorical column below.
encoder = LabelEncoder()

In [None]:
# Replace state strings with integer codes (the encoder is fitted on this column).
df_clean['State'] = encoder.fit_transform(df_clean['State'])

In [None]:
# Replace make and model strings with integer codes. The shared encoder is
# refitted on each column in turn, so afterwards it holds the Model mapping only.
for categorical_column in ('Make', 'Model'):
    df_clean[categorical_column] = encoder.fit_transform(df_clean[categorical_column])

In [None]:
# Preview the frame after label encoding.
df_clean.head()

Now we can split the data for train and test.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the frame into the target (Price) and the feature matrix (everything else).
target_column = 'Price'
y = df_clean[target_column]
X = df_clean.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Feature Scaling

Since we are going to use not only a linear regression baseline but also tree-based models, we will use MinMaxScaler (not StandardScaler) to scale the numeric columns.

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Scaler with default settings; fitted on the training split only (see below).
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the first two feature columns (selected by position) of
# the training split and replace them with their scaled values.
# NOTE(review): these are presumably Year and Mileage — confirm the column order.
train_scaled_cols = list(X_train.columns[0:2])
# Cast to float before assigning: writing scaled floats into integer columns
# relies on implicit upcasting, which pandas has deprecated. astype also
# returns a new frame, so the assignment below does not write into a frame
# derived from the train/test split.
X_train = X_train.astype({col: 'float64' for col in train_scaled_cols})
X_train[train_scaled_cols] = scaler.fit_transform(X_train[train_scaled_cols])

In [None]:
# Apply the already-fitted scaler (no refit) to the same two leading columns
# of the test split.
# NOTE(review): these are presumably Year and Mileage — confirm the column order.
test_scaled_cols = list(X_test.columns[0:2])
# Cast to float before assigning: writing scaled floats into integer columns
# relies on implicit upcasting, which pandas has deprecated. astype also
# returns a new frame, so the assignment below does not write into a frame
# derived from the train/test split.
X_test = X_test.astype({col: 'float64' for col in test_scaled_cols})
X_test[test_scaled_cols] = scaler.transform(X_test[test_scaled_cols])