In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno

from core.src.constants import CORE_FORMATTED_CSV, ROCAR_CSV

In [None]:
# Global plot styling: seaborn theme with a 10x10 inch default figure size.
# No figure is opened here on purpose: every plotting cell creates its own,
# and an empty figure in this cell would only render as a blank "0 Axes" output.
sns.set(rc={"figure.figsize": (10, 10)})

# Load the formatted listings with an explicit dtype per column instead of
# relying on pandas' type inference.
# NOTE(review): plain `int` / `bool` dtypes make read_csv raise if a column
# contains missing values -- presumably the upstream formatting step guarantees
# there are none; confirm, or switch to the nullable "Int64" / "boolean" dtypes.
df = pd.read_csv(
    CORE_FORMATTED_CSV,
    dtype={
        "unique_id": str,
        "price": int,
        "marca": str,
        "model": str,
        "anul producției": int,
        "km": int,
        "putere": int,
        "capacitate cilindrica": int,
        "combustibil": str,
        "tip caroserie": str,
        "is_automatic": bool,
        "firma": bool,
        "transmisie": str,
        "input": str,
    },
)

In [None]:
# Display the dtype pandas assigned to each column, to confirm the explicit
# `dtype` mapping passed to read_csv was applied.
df.dtypes

In [None]:
# missingno bar chart of the loaded frame, used to spot columns with missing
# values.
msno.bar(df)

In [None]:
# Columns that the plotting cells below treat as discrete categories. The
# production year ("anul producției") is loaded as an integer but is listed
# here as well.
CATEGORICAL_COLUMNS = [
    "marca", "combustibil", "tip caroserie",
    "is_automatic", "firma", "transmisie",
    "anul producției",
]

In [None]:
def plot_distribution_on_categorised_columns(df, columns=None):
    """Print the value counts and draw a count plot for each categorical column.

    Args:
        df: DataFrame that contains every column named in ``columns``.
        columns: Iterable of column names to plot. When omitted, the
            notebook-level ``CATEGORICAL_COLUMNS`` list is used, so existing
            one-argument calls keep working unchanged.

    Returns:
        None. One figure per column is shown with ``plt.show()``.
    """
    if columns is None:
        columns = CATEGORICAL_COLUMNS

    for column in columns:
        # dropna=False so missing values appear in the printed summary.
        print(f"{column}: {df[column].value_counts(dropna=False)}")
        plt.figure(figsize=(10, 8))
        # Bars are ordered from the most to the least frequent category.
        sns.countplot(x=column, data=df, order=df[column].value_counts().index)
        plt.xticks(rotation=90)
        plt.show()


# Run the per-column overview on the data exactly as loaded (no rows have been
# filtered yet at this point).
plot_distribution_on_categorised_columns(df)

In [None]:
# Coarse view of the price distribution using 20k-wide bins.
price_step = 20000
price_edges = np.arange(0, df["price"].max() + price_step, price_step)

# Every bin is labelled with its lower edge; taking the labels from the edges
# guarantees there is exactly one label per bin.
# include_lowest=True closes the first bin on the left, so a price of exactly 0
# is counted instead of becoming NaN and silently vanishing from the plot.
df["price_bin"] = pd.cut(
    df["price"],
    bins=price_edges,
    labels=price_edges[:-1],
    include_lowest=True,
)

plt.figure(figsize=(10, 8))
# Bars are sorted by frequency (most common bin first), not by price.
sns.countplot(x="price_bin", data=df, order=df["price_bin"].value_counts().index)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Remove outliers priced above 100k. `.copy()` turns the filtered result into
# an independent frame, so adding the temporary bin column below does not
# trigger pandas' SettingWithCopyWarning.
df = df[df["price"] <= 100000].copy()

# Plot the distribution again, this time with 5k-wide bins. Every bin is
# labelled with its lower edge; include_lowest=True keeps a price of exactly 0
# in the first bin instead of turning it into NaN.
price_step = 5000
price_edges = np.arange(0, df["price"].max() + price_step, price_step)
df["price_bin"] = pd.cut(
    df["price"],
    bins=price_edges,
    labels=price_edges[:-1],
    include_lowest=True,
)

plt.figure(figsize=(10, 8))
# Bars are sorted by frequency (most common bin first), not by price.
sns.countplot(x="price_bin", data=df, order=df["price_bin"].value_counts().index)
plt.xticks(rotation=90)
plt.show()

# The bin column only served the plot; drop it to keep it out of the frame
# that is exported afterwards.
df = df.drop(columns=["price_bin"])

In [None]:
# Write the price-filtered listings to ROCAR_CSV without the index column.
# The same path is written again further down, after the non-Romanian texts
# have been removed.
df.to_csv(ROCAR_CSV, index=False)

## Plot the price distribution for each categorical column

In [None]:
# One box plot of price per categorical feature, each drawn in its own figure.
for categorical_column in CATEGORICAL_COLUMNS:
    plt.figure(figsize=(10, 8))
    sns.boxplot(data=df, x=categorical_column, y="price")
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Price versus mileage, with km grouped into 40k-wide bins.
km_step = 40000
km_edges = np.arange(0, df["km"].max() + km_step, km_step)

# Every bin is labelled with its lower edge. include_lowest=True closes the
# first bin on the left: without it, rows with exactly 0 km fall outside
# (0, 40000], get NaN as their bin and are left out of the box plot.
df["km_bin"] = pd.cut(
    df["km"],
    bins=km_edges,
    labels=km_edges[:-1],
    include_lowest=True,
)

sns.boxplot(x="km_bin", y="price", data=df)
plt.xticks(rotation=90)
# Explicit show, matching the other plotting cells of this notebook.
plt.show()

# The bin column was only needed for the plot.
df = df.drop(columns=["km_bin"])

In [None]:
# Price versus "putere", grouped into bins of width 50.
putere_step = 50
putere_edges = np.arange(0, df["putere"].max() + putere_step, putere_step)

# Every bin is labelled with its lower edge. include_lowest=True closes the
# first bin on the left, so rows whose value is exactly 0 are kept in the
# first bin instead of getting NaN and dropping out of the box plot.
df["putere_bin"] = pd.cut(
    df["putere"],
    bins=putere_edges,
    labels=putere_edges[:-1],
    include_lowest=True,
)

sns.boxplot(x="putere_bin", y="price", data=df)
plt.xticks(rotation=90)
# Explicit show, matching the other plotting cells of this notebook.
plt.show()

# The bin column was only needed for the plot.
df = df.drop(columns=["putere_bin"])

In [None]:
# Price versus "capacitate cilindrica", grouped into bins of width 500.
capacitate_step = 500
capacitate_edges = np.arange(
    0, df["capacitate cilindrica"].max() + capacitate_step, capacitate_step
)

# Every bin is labelled with its lower edge. include_lowest=True closes the
# first bin on the left, so rows whose value is exactly 0 are kept in the
# first bin instead of getting NaN and dropping out of the box plot.
df["capacitate cilindrica_bin"] = pd.cut(
    df["capacitate cilindrica"],
    bins=capacitate_edges,
    labels=capacitate_edges[:-1],
    include_lowest=True,
)

sns.boxplot(x="capacitate cilindrica_bin", y="price", data=df)
plt.xticks(rotation=90)
# Explicit show, matching the other plotting cells of this notebook.
plt.show()

# The bin column was only needed for the plot.
df = df.drop(columns=["capacitate cilindrica_bin"])

In [None]:
# Same missingno bar chart as earlier, now on the frame that remains after the
# price filter.
msno.bar(df)

# Text analysis

## Preprocess the text

In [None]:
# Run every value of the "input" column through the project's
# `preprocess_text` helper and store the result back in the same column.
# NOTE(review): what the helper actually does is not visible from this
# notebook -- see utils.format for the exact cleaning steps.
from utils.format import preprocess_text

df["input"] = df["input"].apply(preprocess_text)

## Analyze the text

In [None]:
from collections import defaultdict
from tqdm import tqdm
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is randomised by default; fixing the seed makes repeated runs
# assign the same language to the same text.
DetectorFactory.seed = 0

texts = df["input"].astype(str)
# language code -> index labels of the rows detected as that language
language_indexes = defaultdict(list)

# Iterate over (index label, text) pairs rather than enumerate(): df was
# row-filtered earlier without resetting its index, so positions and labels
# differ, and the following cells look rows up by label
# (df["input"][...] and df.index.isin(...)).
for label, text in tqdm(texts.items(), total=len(texts)):
    try:
        detected = detect(text)
    except LangDetectException:
        # Raised when the text has no usable features (e.g. it is empty or
        # contains only digits/punctuation); bucket it instead of aborting.
        detected = "unknown"
    language_indexes[detected].append(label)

# language code -> number of texts, in order of first appearance
languages = {language: len(labels) for language, labels in language_indexes.items()}

print(languages)

In [None]:
# For every detected language other than Romanian, print its code followed by
# the first text recorded for it.
for language, row_ids in language_indexes.items():
    if language == "ro":
        continue
    print(language)
    # The slice keeps at most one entry per language.
    for row_id in row_ids[:1]:
        print(df["input"][row_id])

In [None]:
# Keep only the rows whose index label is recorded under "ro", then report how
# many rows were discarded.
initial_len = len(df)
is_romanian = df.index.isin(language_indexes["ro"])
df = df[is_romanian]
print(f"Removed {initial_len - len(df)} non-romanian texts")

In [None]:
from wordcloud import WordCloud

# Concatenate every remaining description into one space-separated string.
all_descriptions = " ".join(df["input"])

# Configure an 800x400 cloud on a white background, then build it from the
# concatenated text.
wordcloud = WordCloud(width=800, height=400, background_color="white")
wordcloud = wordcloud.generate(all_descriptions)

# Render the cloud as an image with the axes hidden.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Write the frame to ROCAR_CSV (without the index column) once more, now that
# the price filter, text preprocessing and language filter have been applied.
df.to_csv(ROCAR_CSV, index=False)