[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/jkanclerz/data-science-workshop-2024/blob/main/99--exercises/23--cars.ipynb)

## Cars

In [None]:
# Create the working directory for downloaded data; -p makes this a no-op if it already exists.
!mkdir -p var/

In [None]:
# Download the zipped dataset and store it as var/cars.zip.
!wget https://data.edu.jkanclerz.com/data-science/uek/datasets/cars.zip -O var/cars.zip

In [None]:
!unzip var/cars.zip -d var/

In [None]:
pip install pandas

In [None]:
import glob
import json
import pandas as pd 

In [None]:
# Collect every JSON file under var/cars/. glob returns paths in filesystem order,
# so sort them to get the same row order (and the same seeded split later) on every machine.
cars = sorted(glob.glob('./var/cars/**/*.json', recursive=True))

In [None]:
def _read_json(path):
    """Parse one JSON file and close its handle as soon as it has been read."""
    # JSON is UTF-8 by specification; being explicit avoids depending on the platform locale.
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


# A list rather than a one-shot generator, so the DataFrame cell can be re-run
# without silently producing an empty frame.
cars_as_json = [_read_json(path) for path in cars]

In [None]:
# Build the raw frame once; `data` keeps a handle on it and `df` is the working alias.
data = pd.DataFrame(cars_as_json)
df = data

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes of the loaded frame.
df.dtypes

In [None]:
# Rebind `df` to the frame held in `data` before the transformations below are applied.
df = data

In [None]:
# Map the Polish source headers to the short snake_case names used in the rest of the notebook.
column_names = {
    "Marka pojazdu": "marka",
    "Model pojazdu": "model",
    "Rok produkcji": "rok_produkcji",
    "Zarejestrowany w Polsce": "rejstracja_polska",
    "Przebieg": "przebieg",
    "Rodzaj paliwa": "paliwo",
    "Kraj pochodzenia": "kraj_pochodzenia",
    "Pierwszy właściciel": "prierwszy_wlasciciel",
}
df = df.rename(columns=column_names)

In [None]:
# List the column labels after renaming.
df.columns

In [None]:
# First rows of the renamed frame.
df.head()

## Understand the data by observing a few rows

In [None]:
# Remove the column display limit so wide frames are shown in full.
pd.set_option('display.max_columns', None)

In [None]:
# First rows again, now that the column display limit has been removed.
df.head()

In [None]:
# Last rows of the frame.
df.tail()

### dtypes, values, shape

In [None]:
# Column dtypes after renaming.
df.dtypes

In [None]:
# Summary statistics, transposed so that each column becomes a row.
df.describe().T

In [None]:
# Keep only the columns used in the rest of the notebook.
selected_columns = [
    'id',
    'marka',
    'model',
    'price',
    'rok_produkcji',
    'przebieg',
    "kraj_pochodzenia",
    "paliwo",
    "prierwszy_wlasciciel",
]
df = df[selected_columns]

In [None]:
# Dtypes of the selected columns.
df.dtypes

## Cleaning transformations

In [None]:
# Mark empty strings as missing so that isna()/dropna() can see them.
# pd.NA is used instead of None because older pandas releases treat `value=None`
# as "no value given" and forward-fill instead of inserting a missing value.
df = df.replace('', pd.NA)

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Drop every row that has at least one missing value (the dropna defaults, spelled out).
df = df.dropna(axis=0, how='any')

In [None]:
def clear_millage(millage_as_str):
    """
    Convert a mileage label such as '172 000 km' to an integer number of kilometres.

    millage_as_str: value to convert; it is passed through str() first, so plain
        integers are accepted as well.

    Returns the mileage as int.
    Raises ValueError when anything other than digits is left after removing
    the 'km' unit and all whitespace.
    """
    without_unit = str(millage_as_str).replace('km', '')
    # str.split() with no argument splits on any Unicode whitespace, so thousands
    # separators written as non-breaking or thin spaces are removed too.
    return int(''.join(without_unit.split()))

assert clear_millage('172 000 km') == 172000
assert clear_millage('172\u00a0000 km') == 172000

In [None]:
# Parse every mileage label into an integer number of kilometres.
df['przebieg'] = df['przebieg'].map(clear_millage)

In [None]:
# Convert price to float and production year to int, element by element.
df = df.assign(
    price=df['price'].apply(float),
    rok_produkcji=df['rok_produkcji'].apply(int),
)

In [None]:
# Add the production year as a datetime column, parsed with the '%Y' format.
production_year = df['rok_produkcji']
df['year_as_dt'] = pd.to_datetime(production_year, format='%Y')

In [None]:
# Summary statistics after the type conversions, one row per column.
df.describe().T

In [None]:
# 95th and 99th percentile of mileage, as a (q95, q99) tuple.
tuple(df['przebieg'].quantile(q=level) for level in (0.95, 0.99))

In [None]:
# 95th and 99th percentile of price, as a (q95, q99) tuple.
tuple(df['price'].quantile(q=level) for level in (0.95, 0.99))

In [None]:
# Keep only rows whose mileage is strictly below the 99th percentile.
mileage_cutoff = df['przebieg'].quantile(q=0.99)
df = df.loc[df['przebieg'] < mileage_cutoff]

In [None]:
# Dtypes of the cleaned frame, for comparison with the reloaded files below.
df.dtypes

## Save intermediate

In [None]:
# Write the cleaned frame to CSV; with the default settings the index is written as the first column.
df.to_csv("var/cars.csv")

In [None]:
# to_csv wrote the index as the first column; read it back as the index
# instead of letting it show up as a spurious "Unnamed: 0" data column.
loaded_csv = pd.read_csv("var/cars.csv", index_col=0)

In [None]:
# Dtypes after the CSV round trip.
loaded_csv.dtypes

In [None]:
pip install fastparquet pyarrow

In [None]:
# Write the cleaned frame as a gzip-compressed Parquet file in the current working directory.
df.to_parquet('cars.parquet.gzip', compression='gzip')

In [None]:
# Read the Parquet file back.
loaded_parquet = pd.read_parquet('cars.parquet.gzip')

In [None]:
# Dtypes after the Parquet round trip.
loaded_parquet.dtypes

In [None]:
# From here on the analysis uses the frame reloaded from Parquet, under the name `cars`.
cars = loaded_parquet

In [None]:
# Summary statistics of the reloaded frame, one row per column.
cars.describe().T

In [None]:
# describe() restricted to the brand ('marka') and model columns.
cars[['marka', 'model']].describe()

## Overview single feature - Univariate Analysis

In [None]:
pip install matplotlib seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
def histogram_boxplot(data, feature, figsize = (12, 7), kde = False, bins = None):
    """
    Boxplot and histogram combined, stacked vertically on a shared x-axis.

    data: dataframe
    feature: name of the dataframe column to plot
    figsize: size of figure (default (12,7))
    kde: whether to show density curve (default False)
    bins: bins for the histogram; None (default) leaves the choice to seaborn

    The histogram also gets two vertical lines: dashed green at the mean and
    solid red at the median of the column.
    """
    _, (ax_box, ax_hist) = plt.subplots(
        nrows = 2,
        sharex = True,
        gridspec_kw = {"height_ratios": (0.25, 0.75)},
        figsize = figsize,
    )

    # Boxplot on top; showmeans adds a marker for the mean.
    sns.boxplot(data = data, x = feature, ax = ax_box, showmeans = True, color = "blue")

    # Forward `bins` only when the caller supplied it, so seaborn's automatic
    # binning stays in effect otherwise. No palette is passed: without a `hue`
    # variable seaborn ignores it and emits a warning.
    hist_kwargs = {} if bins is None else {"bins": bins}
    sns.histplot(data = data, x = feature, kde = kde, ax = ax_hist, **hist_kwargs)

    values = data[feature]
    ax_hist.axvline(values.mean(), color = "green", linestyle = "--")   # mean
    ax_hist.axvline(values.median(), color = "red", linestyle = "-")    # median

In [None]:
# Distribution of the mileage column ('przebieg').
histogram_boxplot(cars, 'przebieg')

In [None]:
# Distribution of the price column.
histogram_boxplot(cars, 'price')

In [None]:
# Histogram of the natural log of price.
log_price = np.log(cars["price"])
sns.displot(log_price, kde=False, height=7, aspect=12/7)
plt.show()

### Transform price to log scale?

In [None]:
# Add the natural log of price as a new column.
cars['price_log'] = np.log(cars['price'])

In [None]:
# Display the frame, now including price_log.
cars

## Univariate analysis - Categorical Data

In [None]:
# Count listings per brand on `cars`, the same frame that is filtered below,
# rather than on the earlier working frame `df`.
category_counts = cars['marka'].value_counts()
# value_counts() sorts descending, so head(15) is the 15 most frequent brands.
top_brands = cars[cars['marka'].isin(category_counts.head(15).index)]

In [None]:
# Bar chart of listing counts for the top brands, one colour per brand.
fig, ax = plt.subplots(figsize=(12, 7))

sns.countplot(data=top_brands, x='marka', hue='marka', ax=ax)

plt.xticks(rotation=90)
ax.set_title('Distribution of Brand Variable')
ax.set_xlabel('Brand')
ax.set_ylabel('Count')

In [None]:
def perc_on_bar(data, z):
    """
    Count plot of a categorical column with each bar labelled by its share of all rows.

    data: dataframe
    z: name of the categorical column to plot

    Bars are ordered by descending frequency and coloured with the "Set2" palette.
    """
    total = len(data[z])
    order = data[z].value_counts().index
    palette = sns.color_palette("Set2", n_colors=data[z].nunique())

    plt.figure(figsize=(12, 5))
    ax = sns.countplot(data=data, x=z, hue=z, order=order, palette=palette)
    for p in ax.patches:
        height = p.get_height()
        # Skip empty or NaN-height patches (placeholders that are not real bars),
        # which would otherwise get a meaningless "0.0%" / "nan%" label.
        if not height > 0:
            continue
        percentage = '{:.1f}%'.format(100 * height / total)
        x = p.get_x() + p.get_width() / 2 - 0.05
        y = p.get_y() + height - 1

        ax.annotate(percentage, (x, y), size=12, ha='center')  # centred on the bar
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Share of listings per brand among the top brands.
perc_on_bar(top_brands, 'marka')

In [None]:
# Bar chart of listing counts per value of 'paliwo', one colour per value.
fig, ax = plt.subplots(figsize=(12, 7))

sns.countplot(data=cars, x='paliwo', hue="paliwo", ax=ax)

plt.xticks(rotation=90)

## Bivariate Analysis

In [None]:
# Display the frame used for the bivariate plots below.
cars

### Scatter plot

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 7))

# Left panel: log-transformed price. The labels now match the plotted column
# (they were previously swapped with the right panel).
cars.plot(x = 'price_log', y = 'year_as_dt', style = '.', ax=axes[0])
axes[0].set_title('Log(Price) vs Year')
axes[0].set_xlabel('Log(Price)')
axes[0].set_ylabel('Year')

# Right panel: raw price.
cars.plot(x = 'price', y = 'year_as_dt', style = '.', ax=axes[1])
axes[1].set_title('Price vs Year')
axes[1].set_xlabel('Price')
axes[1].set_ylabel('Year')




In [None]:
# Scatter of mileage ('przebieg') against production year.
cars.plot(x = 'przebieg', y = 'year_as_dt', style = '.')

In [None]:
# Scatter of mileage ('przebieg') against price.
cars.plot(x = 'przebieg', y = 'price', style = '.')

In [None]:
from pandas.plotting import scatter_matrix

In [None]:
# scatter_matrix creates its own figure sized by `figsize`; a preceding
# plt.figure() call would only leave an extra empty figure in the output.
scatter_matrix(cars, figsize=(12, 8), diagonal='hist', alpha=0.7)

## Heat map

In [None]:
# Pairwise correlations of the numeric columns, with model, brand and id dropped first.
correlations = cars.drop(columns = ['model', 'marka', 'id']).corr(numeric_only = True)

plt.figure(figsize = (12, 7))
# Fix the colour scale to the full correlation range [-1, 1] and print each value in its cell.
sns.heatmap(correlations, annot = True, vmin = -1, vmax = 1)
plt.show()

## Box plot

In [None]:
def boxplot(z):
    """
    Draw two boxplots of price from the notebook-level `cars` frame, grouped by `z`.

    z: the grouping values passed to seaborn as the x variable

    The first figure includes outliers and uses y ticks every 1,000,000;
    the second, titled 'No Outliers', hides them.
    """
    prices = cars['price']
    tick_step = 1000000

    # Figure 1: outliers included.
    plt.figure(figsize = (12, 5))
    sns.boxplot(x = z, y = prices)
    plt.yticks(range(0, int(prices.max()) + tick_step, tick_step))
    plt.show()

    # Figure 2: outliers hidden.
    plt.figure(figsize = (12, 5))
    plt.title('No Outliers')
    sns.boxplot(x = z, y = prices, showfliers = False)
    plt.show()

In [None]:
# Price boxplots grouped by the 'paliwo' column.
boxplot(cars['paliwo'])

### Building Various Models

In [None]:
# One example row, to see which columns are available as model inputs.
cars.head(1)

In [None]:
# Input columns for the model. Note that this rebinds `data`,
# which the loading section used for the raw frame.
feature_columns = ['marka', 'rok_produkcji', 'przebieg', 'paliwo']
data = cars[feature_columns]

In [None]:
# Summary statistics of the selected feature columns, one row per column.
data.describe().T

In [None]:
# describe() restricted to the 'paliwo' column.
data[['paliwo']].describe()

In [None]:
# Preview one row of the dummy encoding with the first level of each encoded column dropped.
pd.get_dummies(data, drop_first=True).head(1)

In [None]:
# The same preview with every level kept, for comparison.
pd.get_dummies(data).head(1)

In [None]:
# Feature matrix: dummy-encoded features with the first level of each encoded column dropped.
X = pd.get_dummies(data, drop_first=True)

In [None]:
# Targets: both the log price and the raw price are kept side by side.
y = cars[['price_log', 'price']]

In [None]:
pip install scikit-learn

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 1,
)

# Shapes of the train/test features and targets.
(X_train.shape, X_test.shape), (y_train.shape, y_test.shape)

In [None]:
from sklearn import metrics

def get_model_score(model, flag = True):
    """
    Score a fitted model on the raw price scale, for the train and test splits.

    model: fitted estimator; its predictions are exponentiated before scoring
    flag: when True (default), print the four scores

    Returns the list [train_r2, test_r2, train_rmse, test_rmse].
    Reads the notebook-level X_train, X_test, y_train and y_test.
    """
    # Predictions are exponentiated so they can be compared with the 'price' column.
    pred_train_ = np.exp(model.predict(X_train))
    pred_test_ = np.exp(model.predict(X_test))

    train_r2 = metrics.r2_score(y_train['price'], pred_train_)
    test_r2 = metrics.r2_score(y_test['price'], pred_test_)
    # mean_squared_error returns the MSE; take the square root so the returned
    # values really are RMSE and agree with the printed ones.
    train_rmse = np.sqrt(metrics.mean_squared_error(y_train['price'], pred_train_))
    test_rmse = np.sqrt(metrics.mean_squared_error(y_test['price'], pred_test_))

    if flag:
        print("R-square on training set : ", train_r2)
        print("R-square on test set : ", test_r2)
        print("RMSE on training set : ", train_rmse)
        print("RMSE on test set : ", test_rmse)

    # Returning the list with train and test scores
    return [train_r2, test_r2, train_rmse, test_rmse]

### LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression estimator with default settings.
lr = LinearRegression()

In [None]:
# Fit on the log-price target, so the model's predictions are log prices.
lr.fit(X_train, y_train['price_log'])

In [None]:
# Print and keep the train/test scores of the linear model.
LR_score = get_model_score(lr)

In [None]:
# Three randomly sampled feature rows (fixed seed).
X.sample(3, random_state=1)

In [None]:
# Pick one feature row (fixed seed) to run a prediction on.
to_be_predicted = X.sample(1, random_state=1)

In [None]:
# Show the selected row.
to_be_predicted

In [None]:
# Predict for the selected row and exponentiate the result.
list(np.exp(lr.predict(to_be_predicted)))

In [None]:
# Look up the targets of the row that was actually sampled, by index label,
# instead of a hard-coded position that is only valid for one particular run.
y.loc[to_be_predicted.index]

### Decision Tree
https://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression.html

### Random Forest
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html