# Analysis of sold cars in Estonia

Data is taken from the official source: https://www.transpordiamet.ee/soidukitega-tehtud-toimingute-statistika

Period - 2018 to 2023 (six registration years).

Only new cars were used in this analysis.


In [23]:
%reload_ext autoreload
%autoreload 0

import os

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [24]:
from mnt_sum import get_summary, COLUMNS, COLUMN_SHORT_NAME, COLUMN_REG_DATE, COLUMN_CUSTOMER, PRIVATE_CUSTOMER, COLUMN_ENGINE_TYPE

# Stack the per-year summaries for 2018 through 2023 and keep only the
# analysed columns plus COLUMN_REG_DATE.
df_o = pd.concat(
    [get_summary(f"data/{year}") for year in range(2018, 2024)]
).loc[:, COLUMNS + [COLUMN_REG_DATE]]
df_o.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99580 entries, 42 to 1532
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Mark             99580 non-null  object 
 1   Mudel            99580 non-null  string 
 2   short name       99580 non-null  object 
 3   Mootori tüüp     99580 non-null  object 
 4   Mootori maht     99580 non-null  Int64  
 5   Mootori võimsus  99580 non-null  Float64
 6   Linn             99580 non-null  object 
 7   Tüüp (isik)      99580 non-null  object 
 8   Arv              99580 non-null  Int64  
 9   Värv             59188 non-null  string 
 10  Esm reg aasta    99580 non-null  Int16  
dtypes: Float64(1), Int16(1), Int64(2), object(5), string(2)
memory usage: 8.9+ MB


In [25]:
# Preview the first five rows (head() defaults to n=5).
df_o.head()

Unnamed: 0,Mark,Mudel,short name,Mootori tüüp,Mootori maht,Mootori võimsus,Linn,Tüüp (isik),Arv,Värv,Esm reg aasta
42,ALFA ROMEO,GIULIA,ALFA ROMEO GIULIA,BENSIIN_KATALYSAATOR,1995,206.0,Määramata,JURIIDILINE,1,,2018
43,AUDI,A1 SPORTBACK,AUDI A1,BENSIIN_KATALYSAATOR,1395,110.0,Tallinn,FÜÜSILINE,1,,2018
44,AUDI,A4 AVANT,AUDI A4,BENSIIN_KATALYSAATOR,1984,185.0,Määramata,FÜÜSILINE,1,,2018
45,AUDI,A4 AVANT,AUDI A4,DIISEL,1968,140.0,Tallinn,JURIIDILINE,1,,2018
46,AUDI,A4 LIMOUSINE,AUDI A4,BENSIIN_KATALYSAATOR,1395,110.0,Tartu,FÜÜSILINE,1,,2018


In [26]:

# Summed "Arv" per short model name over all loaded years, largest first.
df_models = (
    df_o.groupby([COLUMN_SHORT_NAME], as_index=False)["Arv"]
    .sum()
    .sort_values("Arv", ascending=False)
    .reset_index(drop=True)
)
df_models.head()

Unnamed: 0,short name,Arv
0,TOYOTA RAV4,7754
1,SKODA OCTAVIA,6046
2,TOYOTA COROLLA,5485
3,SKODA KODIAQ,3581
4,RENAULT CLIO,3580


In [27]:
# Summed "Arv" per (short model name, registration year) pair, largest first.
df_models_year = (
    df_o.groupby([COLUMN_SHORT_NAME, COLUMN_REG_DATE], as_index=False)["Arv"]
    .sum()
    .sort_values("Arv", ascending=False)
    .reset_index(drop=True)
)

df_models_year.head()

Unnamed: 0,short name,Esm reg aasta,Arv
0,TOYOTA RAV4,2022,1526
1,TOYOTA RAV4,2021,1440
2,TOYOTA RAV4,2020,1400
3,TOYOTA RAV4,2023,1311
4,TOYOTA RAV4,2019,1194


# Marks overview

In [28]:
# Summed "Arv" per (make, registration year) pair, largest first.
df_marks_year = (
    df_o.groupby(["Mark", COLUMN_REG_DATE], as_index=False)["Arv"]
    .sum()
    .sort_values("Arv", ascending=False)
    .reset_index(drop=True)
)

df_marks_year.head()

Unnamed: 0,Mark,Esm reg aasta,Arv
0,TOYOTA,2022,4456
1,TOYOTA,2023,4211
2,TOYOTA,2021,3851
3,TOYOTA,2018,3665
4,TOYOTA,2019,3444


In [29]:
from matplotlib.colors import LinearSegmentedColormap


def sort_and_plot(_df, sortby=2023, limit=20, main_groupby=COLUMN_SHORT_NAME):
    """Build a colour-graded table of summed "Arv" per group and year.

    The input is aggregated by ``(main_groupby, COLUMN_REG_DATE)``, pivoted so
    that each distinct ``main_groupby`` value is a row and each distinct
    ``COLUMN_REG_DATE`` value is a column, ordered by the ``sortby`` column
    (largest first) and truncated to the first ``limit`` rows.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame containing the ``main_groupby``, ``COLUMN_REG_DATE`` and
        ``"Arv"`` columns. Passing an already aggregated frame is fine: the
        values are summed again per pair.
    sortby : column label, default 2023
        Pivoted column used for ranking; it must be one of the
        ``COLUMN_REG_DATE`` values present in ``_df``.
    limit : int, default 20
        Maximum number of rows kept after sorting.
    main_groupby : str, default COLUMN_SHORT_NAME
        Column whose values become the rows of the table.

    Returns
    -------
    pandas.io.formats.style.Styler
        Styler over the pivoted table with a red -> yellow -> green background
        gradient and values rendered with zero decimals.
    """
    # Low values render red, high values green.
    cm = LinearSegmentedColormap.from_list(
        name="ryg",
        colors=["red", "yellow", "green"],
    )

    df_group = (
        _df.groupby([main_groupby, COLUMN_REG_DATE], as_index=False)["Arv"]
        .sum()
        .sort_values("Arv", ascending=False)
        .reset_index(drop=True)
    )

    # Pairs with no rows become missing cells after the pivot; the frame is
    # cast to plain float64 before styling.
    df_h = (
        df_group.pivot(index=main_groupby, columns=COLUMN_REG_DATE, values="Arv")
        .sort_values([sortby], ascending=False)
        .astype("float64")
        .head(limit)
    )

    # `DataFrame.style` creates a fresh Styler on every access, so gradient and
    # number format are applied in one chain on the Styler that is returned.
    return df_h.style.background_gradient(cmap=cm).format(precision=0)


# Per-make table: rows are "Mark" values ranked by the default sortby column
# (2023), one column per registration year.
sort_and_plot(df_marks_year, main_groupby="Mark")

Esm reg aasta,2018,2019,2020,2021,2022,2023
Mark,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TOYOTA,3665.0,3444,3362,3851,4456,4211
SKODA,2840.0,3079,2884,3157,2769,3358
AUDI,385.0,390,496,539,1000,2544
KIA,1590.0,1529,1259,1867,1636,1813
VOLKSWAGEN,1690.0,1726,1197,1968,1595,1228
PEUGEOT,1128.0,1055,1173,1393,816,880
RENAULT,2592.0,2409,2021,1152,784,873
DACIA,943.0,1080,706,742,725,750
SUBARU,794.0,798,347,612,442,746
HYUNDAI,862.0,571,621,971,681,690


# Total overview

In [30]:
# Per-model table using the defaults: rows grouped by COLUMN_SHORT_NAME and
# ranked by the 2023 column.
sort_and_plot(df_models_year)

Esm reg aasta,2018,2019,2020,2021,2022,2023
short name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TOYOTA RAV4,883.0,1194.0,1400.0,1440,1526,1311
TOYOTA COROLLA,378.0,1091.0,977.0,922,966,1151
SKODA OCTAVIA,911.0,1031.0,1054.0,1110,888,1052
SKODA KODIAQ,462.0,675.0,536.0,480,691,737
AUDI Q2,30.0,24.0,18.0,17,229,736
AUDI A3,26.0,12.0,11.0,15,34,642
KIA SPORTAGE,602.0,571.0,399.0,633,641,599
KIA CEED,533.0,511.0,403.0,664,512,553
TOYOTA YARIS CROSS,,,,49,498,510
SUBARU OUTBACK,437.0,421.0,171.0,348,314,493


# Private owners

In [31]:
# Keep only rows whose customer-type value contains PRIVATE_CUSTOMER.
# Vectorised literal substring match (regex disabled) instead of a per-row lambda.
df_private = df_o[df_o[COLUMN_CUSTOMER].str.contains(PRIVATE_CUSTOMER, regex=False)]

sort_and_plot(df_private)

Esm reg aasta,2018,2019,2020,2021,2022,2023
short name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TOYOTA RAV4,557.0,736.0,694.0,736,783,771
TOYOTA COROLLA,203.0,469.0,458.0,431,301,453
SKODA OCTAVIA,420.0,482.0,598.0,492,289,373
SKODA KODIAQ,266.0,390.0,330.0,249,321,350
TOYOTA YARIS CROSS,,,,22,293,348
KIA SPORTAGE,350.0,287.0,202.0,405,350,294
SUBARU OUTBACK,240.0,203.0,99.0,205,174,270
SKODA KAMIQ,,45.0,242.0,187,131,228
SKODA KAROQ,269.0,294.0,170.0,230,151,208
DACIA DUSTER,251.0,219.0,158.0,182,188,190


# Private owners electric cars

In [32]:
# Narrow the private-customer rows to those whose engine type is exactly 'ELEKTER'.
df_private_e = df_private.loc[df_private[COLUMN_ENGINE_TYPE].eq('ELEKTER')]

sort_and_plot(df_private_e)

Esm reg aasta,2018,2019,2020,2021,2022,2023
short name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TESLA MODEL Y,,,,1.0,1.0,63
SKODA ENYAQ,,,,22.0,27.0,59
KIA EV6,,,,,3.0,25
HYUNDAI IONIQ5,,,,4.0,11.0,22
VOLKSWAGEN ID.4,,,,6.0,3.0,19
TESLA MODEL 3,,,11.0,5.0,11.0,18
NISSAN LEAF,10.0,19.0,23.0,7.0,11.0,17
AUDI Q4,,,,,1.0,17
NISSAN ARIYA,,,,,,15
VOLVO XC40,,,,,11.0,14


# Private owners hybrid cars

In [33]:
# Keep private-customer rows whose engine type contains 'HYBRIID'.
# Vectorised literal substring match (regex disabled) instead of a per-row lambda.
df_private_h = df_private[df_private[COLUMN_ENGINE_TYPE].str.contains('HYBRIID', regex=False)]

sort_and_plot(df_private_h)

Esm reg aasta,2018,2019,2020,2021,2022,2023
short name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TOYOTA COROLLA,,207.0,213.0,276.0,187.0,427
TOYOTA RAV4,279.0,314.0,291.0,347.0,357.0,412
TOYOTA YARIS CROSS,,,,13.0,203.0,330
KIA SPORTAGE,,,1.0,29.0,222.0,247
TOYOTA C-HR,177.0,155.0,120.0,218.0,196.0,153
HONDA CR-V,,76.0,79.0,179.0,210.0,152
SKODA OCTAVIA,,,30.0,135.0,67.0,144
NISSAN QASHQAI,,,,78.0,156.0,143
HYUNDAI TUCSON,,,8.0,151.0,177.0,133
TOYOTA YARIS,,,18.0,69.0,66.0,76
