In [None]:
from constants import ROCAR_CSV
from langdetect import detect
import pandas as pd
from tqdm import tqdm

# Read the CSV located at ROCAR_CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=ROCAR_CSV)

In [None]:
from collections import defaultdict

# langdetect is non-deterministic by default; its documentation says to seed
# the detector factory so the same text yields the same language on every run.
from langdetect import DetectorFactory

DetectorFactory.seed = 0

# After the cast, missing descriptions are the literal string "nan"; those are
# skipped in the loop below.
texts = df["description"].astype(str)
failed_indexes = []  # positions in `texts` for which detect() raised
language_indexes = defaultdict(list)  # language code -> positions in `texts`

for i, text in enumerate(tqdm(texts)):
    if text == "nan":
        continue
    try:
        detected = detect(text)
    except Exception as e:
        # Best effort: a failed detection must not abort the scan. Report it,
        # remember the position, and move on to the next description.
        print(e)
        failed_indexes.append(i)
        continue
    language_indexes[detected].append(i)

# Per-language totals are derived from the index lists so the two structures
# cannot drift apart. Key order is the order in which languages were first seen.
languages = {code: len(indexes) for code, indexes in language_indexes.items()}

print(languages)
print(failed_indexes)
print(language_indexes)

# Output of `languages` from an earlier run (recorded without a fixed seed, so
# the counts of a seeded run may differ slightly):
# {'ro': 29605, 'tl': 1075, 'en': 619, 'it': 524, 'fr': 14, 'de': 5, 'ca': 27, 'hr': 1, 'pt': 3, 'af': 5, 'lt': 1, 'sl': 1, 'es': 3, 'sv': 1, 'tr': 1, 'id': 1, 'sq': 1, 'sk': 1, 'cy': 1, 'nl': 1}

In [None]:
# For every key of the language_indexes dictionary other than "ro", print the language code and the first description detected as that language

# Walk the detected languages, skipping "ro", and print each language code
# followed by the first description that was assigned to it.
for lang_code, row_positions in language_indexes.items():
    if lang_code == "ro":
        continue
    print(lang_code)
    for position in row_positions[:1]:
        print(df["description"][position])

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno

from core.src.constants import CORE_FORMATTED_CSV, ROCAR_CSV

# Apply seaborn's default theme and override the default figure size.
sns.set(rc={"figure.figsize": (10, 10)})
# NOTE(review): this opens a figure that nothing in this cell draws on, so it
# presumably only produces an empty output - confirm whether it is needed.
plt.figure(figsize=(10, 8))
# Reload the dataset from ROCAR_CSV, replacing whatever `df` held before.
df = pd.read_csv(ROCAR_CSV)

In [None]:
# Bar chart of how many non-missing values each column of `df` contains.
msno.bar(df=df)

In [None]:
# Mean number of characters per description. Missing values turn into the
# string "nan" when cast, so they are blanked first and count as length 0.
description_lengths = df["description"].astype(str).replace("nan", "").map(len)
print(description_lengths.mean())

In [None]:
# Append the free-text feature columns listed below to "description",
# separated by single spaces.
# NOTE: this cell mutates `df` in place, so running it twice appends the
# feature columns to "description" a second time.
extra_text_columns = [
    "audio si tehnologie",
    "confort si echipamente optionale",
    "electronice si sisteme de asistenta",
    "performanta",
    "siguranta",
    "culoare",
]

# First cast every involved column to str so that "+" concatenates; missing
# values become the string "nan" in the cast and are replaced by "".
for column in ["description", *extra_text_columns]:
    df[column] = df[column].astype(str).replace("nan", "")

combined_description = df["description"]
for column in extra_text_columns:
    combined_description = combined_description + " " + df[column]

df["description"] = combined_description

In [None]:
# Show the description stored under index label 0 as a quick sanity check.
print(df.loc[0, "description"])

# Histogram of description lengths in characters ("nan" entries count as 0).
description_lengths = df["description"].astype(str).replace("nan", "").map(len)
description_lengths.hist()
plt.show()

In [None]:
# Mean description length in characters, measured on the current contents of
# the "description" column ("nan" entries are treated as empty strings).
lengths_in_chars = df["description"].astype(str).replace("nan", "").map(len)
print(lengths_in_chars.mean())

In [None]:
# replace phone numbers with [TEL] and email addresses with [EMAIL], also replace html tags with [HTML]
import re

# From here on `df` is no longer the full table but only the "description"
# column as a Series of strings; the cells below operate on that Series.
# NOTE(review): because the name is rebound, re-running this cell without
# reloading the DataFrame first will fail.
df = df.loc[:, "description"].astype(str)


def replace_patterns(text: str) -> str:
    """Mask e-mail addresses, phone numbers and HTML tags in ``text``.

    The substitutions run in this order:

    1. e-mail addresses are replaced by ``[EMAIL]``;
    2. 10-digit numbers grouped 3-3-4, with an optional ``-`` or ``.``
       between the groups, are replaced by ``[TEL]``;
    3. anything from ``<`` up to the next ``>`` on the same line is replaced
       by ``[HTML]``.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` with every match replaced by its placeholder.
    """
    # The top-level domain is letters only. A "|" inside a character class is
    # a literal pipe, not alternation, so "[A-Z|a-z]" would let one match run
    # across "|" and swallow text such as "a@b.com|c" as a single address.
    email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    phone_pattern = r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b"
    # Non-greedy so that each tag is replaced on its own.
    html_pattern = r"<.*?>"
    text = re.sub(email_pattern, "[EMAIL]", text)
    text = re.sub(phone_pattern, "[TEL]", text)
    text = re.sub(html_pattern, "[HTML]", text)
    return text


# Run the placeholder substitution on every description in the Series.
df = df.map(replace_patterns)

In [None]:
# replace each emoji with its textual name wrapped in [ ], then collapse repeated whitespace into single spaces
import emoji


def replace_emojis(text: str) -> str:
    """Return ``text`` after passing it through ``emoji.demojize``.

    "[" and "]" are supplied as the opening and closing delimiters for the
    replacement text that ``demojize`` produces.
    """
    bracket_delimiters = ("[", "]")
    return emoji.demojize(text, delimiters=bracket_delimiters)


def replace_repeated_whitespace(text: str) -> str:
    """Replace every run of one or more whitespace characters with one space.

    Leading and trailing whitespace is collapsed to a single space as well;
    it is not stripped.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)


# Emoji replacement runs first, whitespace collapsing second, on every entry.
df = df.apply(replace_emojis).apply(replace_repeated_whitespace)

In [None]:
# Number of entries currently held in `df`.
print(df.shape[0])

In [None]:
# show me a word cloud of the description column

from wordcloud import WordCloud

# Merge every entry of `df` into one space-separated string.
all_descriptions = " ".join(text for text in df)

# Build the word cloud from the merged text on a white 800x400 canvas.
cloud_options = {"width": 800, "height": 400, "background_color": "white"}
wordcloud = WordCloud(**cloud_options).generate(all_descriptions)

# Render the cloud as an image without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# create me a histogram of word frequency in the description column

from collections import Counter

# Split every description into whitespace-separated words. Counting has to
# cover all entries of `df`: splitting a single entry such as df[2] would make
# the chart describe one description instead of the whole column.
words = [word for description in df for word in description.split()]

# Count the frequency of each word
word_freq = Counter(words)

# most_common() yields (word, count) pairs ordered from the highest count to
# the lowest, so the resulting frame is already sorted by frequency.
word_freq_df = pd.DataFrame(word_freq.most_common(), columns=["Word", "Frequency"])

# Plot the ten most frequent words
top_words = word_freq_df.head(10)
plt.figure(figsize=(10, 5))
plt.bar(top_words["Word"], top_words["Frequency"])
plt.xlabel("Word")
plt.ylabel("Frequency")
plt.title("Top 10 most frequent words in the description column")
plt.show()

In [None]:
# Write the current contents of `df` to "nlp.csv" in the working directory.
df.to_csv(path_or_buf="nlp.csv")