# Objective
Predict resale prices of BMW cars.

In [None]:
import numpy as np
import pandas as pd

# Loading and inspecting data
First we load and inspect the data. I downloaded the data from [here](https://raw.githubusercontent.com/datacamp/careerhub-data/master/BMW%20Used%20Car%20Sales/bmw.csv) and saved it in the `datasets/bmw.csv` file.

In [None]:
# Read the downloaded CSV into a DataFrame and preview its first rows.
bmw = pd.read_csv('datasets/bmw.csv')
bmw.head()

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame.
bmw.info()

In [None]:
# Distinct values found in the `model` column.
bmw["model"].unique()

In [None]:
# Distinct values found in the `transmission` column.
bmw["transmission"].unique()

In [None]:
# Distinct values found in the `fuelType` column.
bmw["fuelType"].unique()

In [None]:
# Summary statistics of the raw frame.
bmw.describe()

In [None]:
# Report, for every column, how many distinct values it holds.
for col in bmw.columns:
    distinct_values = bmw[col].unique()
    print(col, len(distinct_values))

I recognize the three columns `model`, `transmission` and `fuelType` as categorical variables and convert them to the `category` dtype. I could also convert the `year` column to a datetime type, but since I don't need to handle months or days in this analysis, I chose to keep it as an integer column.

In [None]:
# Columns that hold labels rather than numeric measurements.
categorical_columns = ['model', 'fuelType', 'transmission']

# Cast each of them to the pandas "category" dtype; `year` stays an integer.
category_dtypes = dict.fromkeys(categorical_columns, "category")
bmw_typed = bmw.astype(category_dtypes)
bmw_typed.info()

# Data cleaning
Let us take a closer look at the categorical columns. First we print the number of records in each category.

In [None]:
def print_categorical_counts(df, columns):
    """Display the number of rows per category for each listed column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    columns : iterable of str
        Names of the columns to group by; one count table is shown per column.
    """
    for column_name in columns:
        rows_per_category = df.groupby(column_name)[column_name].count()
        display(rows_per_category)

# Show the per-category row counts for every categorical column.
# NOTE(review): this is given the raw `bmw` frame, not `bmw_typed` — confirm that is intended.
print_categorical_counts(bmw, categorical_columns)

There are a number of categories with very few records. For instance, the `fuelType` `Electric` has only three records. With so few observations in a category, I wouldn't expect it to be possible to make reliable predictions of the selling price for that category. I therefore choose to drop any category with fewer than 10 records.

In [None]:
def drop_almost_empty_categories(df, col, nmin=10):
    """Return a copy of ``df`` without the rows of sparsely populated categories.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Name of the column whose values define the categories.
    nmin : int, default 10
        Minimum number of rows a category needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose value in ``col`` occurs at least ``nmin``
        times. Rows where ``col`` is missing are kept, and the original index
        labels are preserved.
    """
    # Series.iteritems() was removed in pandas 2.0. Counting once and filtering
    # with a single boolean mask avoids that API and the repeated re-slicing of
    # the frame for every rare category.
    category_count = df[col].value_counts()
    rare_categories = category_count[category_count < nmin].index
    keep_row = ~df[col].isin(rare_categories)
    # .copy() so the result never shares data with the input frame.
    return df[keep_row].copy()

# Start from a copy of the raw frame so `bmw` itself stays unchanged.
bmw_dropped = bmw.copy()
# Filter on each categorical column in turn, feeding the result of one pass into the next.
for col in categorical_columns:
    bmw_dropped = drop_almost_empty_categories(bmw_dropped, col)
# Show the per-category counts that remain after filtering.
print_categorical_counts(bmw_dropped, categorical_columns)

# Data exploration

In [None]:
import seaborn as sns

In [None]:
# Work on a copy that carries the log-transformed price next to the raw price.
bmw_copy = bmw_dropped.assign(**{'ln price': np.log(bmw_dropped['price'])})

# Each numeric feature against price and ln(price), coloured by transmission.
sns.pairplot(
    bmw_copy,
    hue='transmission',
    x_vars=['year', 'mileage', 'tax', 'mpg', 'engineSize'],
    y_vars=['price', 'ln price'],
)

In [None]:
# Price distribution per transmission type, using the filtered `bmw_copy` frame.
sns.violinplot(x='transmission', y='price', data=bmw_copy)

In [None]:
# Same plot as the previous cell, but drawn from the raw `bmw` frame.
sns.violinplot(x='transmission', y='price', data=bmw)

In [None]:
# Price distribution per fuel type, split by transmission (raw `bmw` frame).
sns.violinplot(hue='transmission', y='price', x='fuelType', data=bmw)

In [None]:
# Number of rows per model in the raw frame.
sns.countplot(x="model",  data=bmw)

In [None]:
# Number of rows per fuel type in the raw frame.
sns.countplot(x="fuelType",  data=bmw)

In [None]:
# Number of rows per fuel type, with one bar per transmission type.
sns.catplot(x="fuelType", hue="transmission",
                data=bmw, kind="count",)

In [None]:
# Row count per fuel type in the raw frame.
bmw.groupby("fuelType")["fuelType"].count()

In [None]:
# Row count per model in the raw frame.
bmw.groupby("model")["model"].count()

In [None]:
import matplotlib.pyplot as plt

# Price distribution per model, drawn on a wide 16x6 figure.
# `aspect` is a FacetGrid/catplot option and not a violinplot parameter, so the
# figure shape is set through the matplotlib figure size and the plot is drawn
# on that explicit Axes.
fig, ax = plt.subplots(figsize=(16, 6))
sns.violinplot(y='price', x='model', data=bmw, ax=ax)

In [None]:
# Rows of the raw frame whose fuel type is 'Electric'.
bmw[bmw["fuelType"]=='Electric']

In [None]:
# Rows of the raw frame whose fuel type is 'Other'.
bmw[bmw["fuelType"]=='Other']

In [None]:
# Rows of the raw frame with an mpg value above 400.
bmw[bmw["mpg"] > 400]