In [None]:
"""Data Analysis of Sweden's used car market."""
import matplotlib.pylab as plt
import pandas as pd
import seaborn as sns

%matplotlib inline

path_to_dataset = "../data/cleaned_used_car_dataset.csv"

# Read the cleaned dataset and discard its first column by position
# (presumably an index column written when the CSV was saved -- TODO confirm).
used_cars = pd.read_csv(path_to_dataset).iloc[:, 1:]

# Parse the most recent publication timestamp of every advertisement.
used_cars["publication_datetime"] = pd.to_datetime(
    used_cars["publication_datetime"],
    format="%Y-%m-%d %H:%M:%S",
)

# Show (number of rows, number of columns) as the cell output.
used_cars.shape

## Questions
1. At what times (day of the week and hour of day) are advertisements/cars published?
2. Which manufacturers and locations have the highest share of the used car market?
3. Which manufacturer has cheaper/more expensive used cars?
4. Which parameters affect the price?
    1. Does the price change differently for different manufacturers depending on entry year of the car?
    2. Does the price vary depending on the quantity of cars?
    3. Does the price vary depending on model and car type?
    4. Does the price vary depending on the location?
5. What are typical prices of a specific car model depending on mileage, entry year, and location?

In [None]:
def remove_rows_below_minimum_share(minimum_share, variable, data):
    """Remove rows whose value of ``variable`` is too rare in the data set.

    The share of a value is the number of rows carrying that value divided
    by the total number of rows in ``data``, expressed in percent. Every row
    whose value has a share less than or equal to ``minimum_share`` is
    removed. Rows with a missing value in ``variable`` are kept, because
    missing values are not counted by ``value_counts``.

    Parameters
    ----------
    minimum_share : float
        minimum share of variable in percent; values with a share at or
        below this threshold are removed
    variable : str
        name of variable which values should be above minimum share
    data : DataFrame
        dataset

    Returns
    -------
    data : DataFrame
        new data set without rows at or below minimum share; the index
        labels of the remaining rows are preserved and the input ``data``
        is left unmodified
    """
    share_of_values = data[variable].value_counts() / data.shape[0] * 100

    values_below_minimum = share_of_values.index[share_of_values <= minimum_share]

    # Filter with a boolean mask instead of dropping by index label: dropping
    # by label would also remove unrelated rows that happen to share an index
    # label with a rare row, and ``inplace=True`` would alter the caller's
    # DataFrame.
    rows_to_keep = ~data[variable].isin(values_below_minimum)

    return data.loc[rows_to_keep].copy()

### 1) At what times (day in week and hour of day) are advertisements/ cars published?

In [None]:
# Collect every publication timestamp of every advertisement: the earlier
# ones stored in "publication_history" (one ", "-separated string per row)
# plus the most recent one stored in "publication_datetime".
historic_publication_datetimes = pd.to_datetime(
    used_cars["publication_history"].str.split(", ").dropna().explode(),
    format="%Y-%m-%d %H:%M:%S",
)

# join publication history and most recent publication date time
all_publication_datetimes = pd.concat(
    [historic_publication_datetimes, used_cars["publication_datetime"]],
    ignore_index=True,
)

df = pd.DataFrame(
    {
        "weekday": all_publication_datetimes.dt.weekday,
        "hour": all_publication_datetimes.dt.hour,
    }
)

g = sns.JointGrid(data=df, x="weekday", y="hour", marginal_ticks=True, height=5)
# Add the joint and marginal histogram plots
g.plot_joint(sns.histplot, discrete=(True, True), element="bars", cmap="Greys")
g.plot_marginals(
    sns.histplot, discrete=(True, True), element="bars", stat="percent", color="#BEBEBE"
)

# Label the x axis with weekday names. ``dt.weekday`` encodes Monday as 0 and
# Sunday as 6, so the ticks are pinned to exactly these seven positions
# instead of relying on how many ticks matplotlib happens to generate.
weekday_names = [
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]
g.ax_joint.set_xticks(list(range(len(weekday_names))))
g.ax_joint.set_xticklabels(weekday_names, rotation=45, ha="right")

### 2) Which manufacturer and location has highest share on used car market?

In [None]:
# Manufacturers: ignore those holding 0.5 % of the advertisements or less.
minimum_share_percent = 0.5
reduced_used_cars = remove_rows_below_minimum_share(
    minimum_share=minimum_share_percent,
    variable="manufacturer",
    data=used_cars.copy(),
)
# The share is expressed relative to ALL advertisements, not only the kept ones.
share_of_manufacturer = (
    reduced_used_cars["manufacturer"].value_counts().div(used_cars.shape[0]).mul(100)
)

# Locations: ignore those holding 0.9 % of the advertisements or less.
minimum_share_percent = 0.9
reduced_used_cars = remove_rows_below_minimum_share(
    minimum_share=minimum_share_percent,
    variable="location",
    data=used_cars.copy(),
)
share_of_location = (
    reduced_used_cars["location"].value_counts().div(used_cars.shape[0]).mul(100)
)


f, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

# left panel: manufacturer shares
sns.barplot(
    x=share_of_manufacturer.to_numpy(),
    y=share_of_manufacturer.index,
    color="#BEBEBE",
    ax=ax[0],
)
ax[0].set(xlabel="Share of manufacturer in %", ylabel="", title="Manufacturer")

# right panel: location shares, with one location highlighted in red
color_per_location = [
    "red" if location == "gävle" else "#BEBEBE"
    for location in share_of_location.index
]
sns.barplot(
    x=share_of_location.to_numpy(),
    y=share_of_location.index,
    palette=color_per_location,
    ax=ax[1],
)
ax[1].set(xlabel="Advertisements in %", ylabel="", title="Location")

# Without an explicit axes, despine acts on every axes of the current figure,
# so a single call after both panels are drawn is sufficient.
sns.despine(left=True, bottom=True)

### 3) Which manufacturer has cheaper/more expensive used cars?

Luxury cars from manufacturers such as BMW, Mercedes-Benz, and Audi have, on average, higher prices than cars from cheaper brands such as Peugeot, Renault, and Citroen.

In [None]:
def price_variation_sorted_by_median(data, sort_by):
    """Draw price boxplots per category, ordered by descending median price.

    The price (column ``price_sek``) is put on the x-axis and one box is
    drawn per value of ``sort_by`` on the y-axis. The plot is drawn with
    seaborn on the currently active matplotlib axes.

    Parameters
    ----------
    data : DataFrame
        dataset; must contain the columns ``price_sek`` and ``sort_by``
    sort_by : str
        name of the variable whose values define the boxes and whose median
        price defines their order
    """
    median_price = data.groupby(by=[sort_by])["price_sek"].median()
    categories_by_median = median_price.sort_values(ascending=False).index

    sns.boxplot(data=data, x="price_sek", y=sort_by, order=categories_by_median)


def price_variation_sorted_by_quantity(data, sort_by):
    """Draw price boxplots per category, ordered by number of occurrences.

    The price (column ``price_sek``) is put on the x-axis and one box is
    drawn per value of ``sort_by`` on the y-axis; the most frequent value
    comes first. The plot is drawn with seaborn on the currently active
    matplotlib axes.

    Parameters
    ----------
    data : DataFrame
        dataset; must contain the columns ``price_sek`` and ``sort_by``
    sort_by : str
        name of the variable whose values define the boxes and whose
        frequency defines their order
    """
    occurrences = data[sort_by].value_counts()
    categories_by_quantity = occurrences.index

    sns.boxplot(data=data, x="price_sek", y=sort_by, order=categories_by_quantity)

In [None]:
# Only compare manufacturers holding more than 1 % of all advertisements.
minimum_share_of_models = 1
reduced_used_cars = remove_rows_below_minimum_share(
    minimum_share=minimum_share_of_models,
    variable="manufacturer",
    data=used_cars.copy(),
)

# One price box per manufacturer, most expensive (by median) first.
price_variation_sorted_by_median(data=reduced_used_cars, sort_by="manufacturer")

### 4) Which parameters affect the car price?

#### Correlation to other numeric parameters

Car price ...
- has negative correlation to mileage
- has positive correlation to entry year of car (slightly stronger than with mileage)
- is not affected by fuel consumption
- is not affected by co2 emissions

In [None]:
# Restrict the correlation matrix to numeric (and boolean) columns. Since
# pandas 2.0 ``DataFrame.corr`` no longer drops non-numeric columns silently
# and raises on string columns such as the manufacturer; selecting the
# columns explicitly works on old and new pandas versions alike.
correlation = used_cars.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(correlation, annot=True, linewidths=0.5, cmap="Blues_r")

#### Correlation to categorical parameters

Car price depends on 
- type of car 
    - luxurious cars (suv, sedan, coupe) are more expensive than non-luxurious cars (small, hatchback, commercial)
- type of fuel
    - cars with petrol and diesel are generally cheaper than electric cars
    - fossil fuel based cars have similar price range 

Nevertheless, largest impact on price is the entry year. 

In [None]:
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))

# top panel: price per car type, split by entry year
sns.boxplot(
    data=used_cars,
    x="car_type",
    y="price_sek",
    hue="entry_year",
    palette="Greens",
    ax=ax[0],
)
# the entry-year legend of the top panel is removed from the figure
ax[0].get_legend().remove()

# bottom panel: price per car type, split by fuel
sns.boxplot(
    data=used_cars,
    x="car_type",
    y="price_sek",
    hue="fuel",
    palette="Greens",
    ax=ax[1],
)

# separate figure: price against mileage, one facet per car type
sns.relplot(
    data=used_cars,
    x="mileage_km",
    y="price_sek",
    col="car_type",
    col_wrap=5,
    height=4,
)

#### 4.1) Does the price change differently for different manufacturers depending on entry year of the car?

There seems to be a difference in how prices change between manufacturers depending on the entry year. 
For car manufacturers such as "Volvo", "Volkswagen" or "BMW" the prices are much higher for newer cars compared to "Toyota", "Seat" or "Honda". However, prices for the oldest cars are very similar. 



In [None]:
# Only compare manufacturers holding more than 1 % of all advertisements.
minimum_share_percent = 1
reduced_used_cars = remove_rows_below_minimum_share(
    minimum_share=minimum_share_percent,
    variable="manufacturer",
    data=used_cars.copy(),
)

# manufacturers ordered from most to least advertised
order = reduced_used_cars["manufacturer"].value_counts().index

plt.figure(figsize=(10, 20))
sns.boxplot(
    data=reduced_used_cars,
    x="price_sek",
    y="manufacturer",
    hue="entry_year",
    order=order,
)

#### 4.2) Does the price vary depending on the quantity of cars?

The quantity does not seem to have a strong impact: Volvo cars have a high share but are not necessarily the cheapest cars. 

The manufacturer and the type of car have a stronger effect.

In [None]:
# Only compare manufacturers holding more than 1 % of all advertisements.
minimum_share_of_models = 1
reduced_used_cars = remove_rows_below_minimum_share(
    minimum_share=minimum_share_of_models,
    variable="manufacturer",
    data=used_cars.copy(),
)

# One price box per manufacturer, most advertised manufacturer first.
price_variation_sorted_by_quantity(data=reduced_used_cars, sort_by="manufacturer")

#### 4.3) Does the price vary depending on model and car type?

In [None]:
# Only compare manufacturers holding more than 1 % of all advertisements.
minimum_share_of_models = 1
reduced_used_cars = remove_rows_below_minimum_share(
    minimum_share_of_models, "manufacturer", used_cars.copy()
)

# Combined "<manufacturer> <model>" label. NOTE(review): this column is not
# used by the plots in this cell, which group by manufacturer only.
reduced_used_cars["manufacturer_model"] = (
    reduced_used_cars["manufacturer"] + " " + reduced_used_cars["model"].astype(str)
)

# value of "car_type" in the data -> title of the corresponding figure
title_per_car_type = {
    "small": "small car",
    "hatchback": "hatchback",
    "estate car": "estate car",
    "suv": "suv",
    "commercial": "commercial",
    "sedan": "sedan",
    "van": "van",
    "cab": "cab",
}

# One figure per car type: price boxes per manufacturer, sorted by median.
for car_type, title in title_per_car_type.items():
    # create the figure first so that the boxplot is drawn on its axes
    fig, ax = plt.subplots()
    price_variation_sorted_by_median(
        reduced_used_cars[reduced_used_cars["car_type"] == car_type], "manufacturer"
    )
    ax.set_title(title)

#### 4.4) Does the price vary depending on the location?

The location does not seem to have an effect on the car price. For the same entry year of the car the prices are similar between locations in terms of their median for that year. 

The variation between locations for the same year might arise due to the car type.


In [None]:
# Only compare locations holding more than 0.9 % of all advertisements.
minimum_share_of_models = 0.9
reduced_used_cars = remove_rows_below_minimum_share(
    minimum_share_of_models, "location", used_cars.copy()
)


# Pre-build an empty result table with one row per (location, entry year)
# combination; combinations that are never filled in below keep their
# initial missing value.
index = pd.MultiIndex.from_product(
    [reduced_used_cars["location"].unique(), reduced_used_cars["entry_year"].unique()],
    names=["location", "entry_year"],
)
prices_per_location = pd.DataFrame(
    index=index, columns=["median_price_sek", "number_cars"]
)

# calculate median price and number of cars per year and location
for location in reduced_used_cars["location"].unique():
    # median price of this location's cars, indexed by entry year
    median_price_sek = (
        reduced_used_cars.loc[reduced_used_cars["location"] == location]
        .groupby(by=["entry_year"])["price_sek"]
        .median()
        .rename("median_price_sek")
    )
    # number of this location's cars, indexed by entry year
    number_cars = (
        reduced_used_cars.loc[reduced_used_cars["location"] == location, "entry_year"]
        .value_counts()
        .rename("number_cars")
    )
    # align both statistics on the entry year
    next_location = pd.concat([median_price_sek, number_cars], axis=1)

    # add location as another index level so the rows match the result table
    next_location.index = pd.MultiIndex.from_tuples(
        [(location, i) for i in next_location.index]
    )

    # write this location's values into the matching rows of the result table
    prices_per_location.update(next_location)

# turn the (location, entry_year) index levels into regular columns
prices_per_location = prices_per_location.reset_index()


# sort location based on median over all years
order = (
    reduced_used_cars.groupby(by=["location"])["price_sek"]
    .median()
    .sort_values(ascending=False)
    .index
)
# An ordered categorical copy of the location is used as the sort key.
# NOTE(review): presumably the row order determines the order of the
# locations on the y-axis of the plot below -- confirm.
prices_per_location["location_cat"] = pd.Categorical(
    prices_per_location["location"], categories=order, ordered=True
)
prices_per_location.sort_values("location_cat", inplace=True)


# plot data: one marker per (location, entry year); marker size encodes the
# number of cars and marker colour the entry year
plt.figure(figsize=(10, 8))
sns.scatterplot(
    x="median_price_sek",
    y="location",
    data=prices_per_location,
    size="number_cars",
    hue="entry_year",
)

# place the legend to the right of the axes
legend = plt.legend(loc="center right", bbox_to_anchor=(1.2, 0.5))

### 5) What are typical prices of a specific car model depending on mileage, entry year, and location?

In [None]:
car_model = "arona"

# all advertisements of the selected car model
reduced_used_cars = used_cars.loc[used_cars["model"] == car_model].copy()

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 8))
# the manufacturer is taken from the first advertisement of the model
manufacturer_name = reduced_used_cars["manufacturer"].values[0]
fig.suptitle(
    " ".join(("Car model:", manufacturer_name, car_model)),
    fontsize=16,
)

# left panel: price against mileage, coloured by entry year
sns.scatterplot(
    data=reduced_used_cars,
    x="price_sek",
    y="mileage_km",
    hue="entry_year",
    palette="Greens",
    ax=ax[0],
)

# Locations holding 1 % or less of this model's advertisements are removed
# before the per-location comparison.
minimum_share_of_models = 1
reduced_used_cars = remove_rows_below_minimum_share(
    minimum_share=minimum_share_of_models,
    variable="location",
    data=reduced_used_cars,
)

# count how often each price occurs per (location, entry year)
reduced_used_cars = (
    reduced_used_cars.groupby(["location", "entry_year"])["price_sek"]
    .value_counts()
    .rename("counts")
    .to_frame()
    .reset_index()
)

# right panel: price per location, coloured by entry year, sized by count
sns.scatterplot(
    data=reduced_used_cars,
    x="price_sek",
    y="location",
    hue="entry_year",
    size="counts",
    palette="Greens",
    ax=ax[1],
)

plt.tight_layout()