In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the CSV at 'dataset/EDOS 1M.csv' into a DataFrame.
df = pd.read_csv('dataset/EDOS 1M.csv')

In [None]:
# Preview the first 20 rows.
df.head(20)

# EXAMPLE

In [None]:
# Show the utterance, emotion label and label confidence of one example row.
example_row = 0
print(df.at[example_row, 'uttr'])
print('ASSOCIATED EMOTION:', df.at[example_row, 'eb+_emot'])
print('CONFIDENCE:', df.at[example_row, 'label_confidence'])

# NaN Values

In [None]:
# Number of missing values in each column.
df.isna().sum()

# SIMPLE STATS

In [None]:
# df.shape is (rows, columns): rows are observations, columns are features.
n_observations, n_features = df.shape
# The counts are passed in the same order as the placeholders in the message
# (observations first, then features).
print('DATASET includes {} observations and {} features'.format(n_observations, n_features))

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics produced by DataFrame.describe().
df.describe()

In [None]:
# Count utterances whose length is zero (empty strings).
utterance_lengths = df['uttr'].apply(len)
(utterance_lengths <= 0).sum()

In [None]:
# Emotion label column, used below to answer "how many emotions are there?"
emotions = df['eb+_emot']

In [None]:
# Distinct emotion labels, in order of first appearance, and how many there are.
unique_emotions = pd.unique(emotions)
n_emotions = len(unique_emotions)
print('DATASET includes {} emotions'.format(n_emotions))
print(unique_emotions)

In [None]:
# Sorted distinct emotion labels and the number of rows carrying each label.
emotions_, count_ = np.unique(emotions, return_counts=True)

In [None]:
# Bar chart of the number of rows per emotion.
fig = plt.figure(num=1, figsize=(12, 6))
ax = fig.gca()
ax.bar(emotions_, count_)
# Emotion names are long, so draw them vertically.
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
plt.show()

In [None]:
# Report the two extremes of the class distribution.
print(f'Least frequent emotion: {emotions_[np.argmin(count_)]} with {np.min(count_)} occurrences')
print(f'Most frequent emotion: {emotions_[np.argmax(count_)]} with {np.max(count_)} occurrences')

# Down-sample every emotion to the size of the rarest one.
least_freq = np.min(count_)
generator = np.random.default_rng(0)  # fixed seed so the same rows are dropped on every run
balance_df = []
for emo in emotions_:
    print('deleting for {}'.format(emo))
    # Renumber the rows 0..n-1 so the positions drawn below are valid index labels.
    group = df.loc[df['eb+_emot'] == emo].reset_index(drop=True)
    n_rows = len(group)
    if n_rows != least_freq:
        # Draw the surplus rows without replacement and remove them.
        surplus_rows = generator.choice(range(n_rows), size=n_rows - least_freq, replace=False)
        group = group.drop(index=surplus_rows)
        print(group.shape)
    balance_df.append(group)
print('DATASET is balanced')

In [None]:
# Stack the per-emotion groups into one DataFrame.
# NOTE(review): `balance_df` is rebound from a list of frames to a single DataFrame here,
# so the balancing cell has to be re-run before this cell can be executed again.
balance_df = pd.concat(balance_df)
balance_df.shape

In [None]:
# Inspect the first 1000 rows of the balanced frame.
balance_df.head(1000)

In [None]:
# Recompute the label statistics on the balanced frame and plot rows per emotion.
emotions = balance_df['eb+_emot']
unique_emotions = pd.unique(emotions)
emotions_, count_ = np.unique(emotions, return_counts=True)

fig = plt.figure(num=1, figsize=(12, 6))
ax = fig.gca()
ax.bar(emotions_, count_)
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
plt.show()

In [None]:
# From here on `df` refers to a copy of the balanced frame, not the CSV loaded at the top.
# reset_index() is called without drop=True, so the previous row labels are kept
# as an extra 'index' column in `df` and in the pickle written below.
df = balance_df.copy(deep=True).reset_index()
# Persist the balanced frame as a pickle file.
df.to_pickle('./dataset/EDOS_1M_balanced.pkl')

In [None]:
# Summary statistics of the balanced frame.
balance_df.describe()

In [None]:
# True if any column of `df` contains at least one missing value.
(df.isna().sum() > 0).any()

In [None]:
def parse_emotions():
    """Yield ``(emotion, confidences)`` for every entry of ``unique_emotions``.

    ``confidences`` is a float32 NumPy array holding the ``label_confidence``
    values of the rows of the global ``df`` labelled with that emotion.
    """
    emotion_column = df['eb+_emot']
    for emo in unique_emotions:
        confidences = df.loc[emotion_column == emo, 'label_confidence']
        yield emo, confidences.to_numpy(dtype=np.float32)


# Box plot of the label confidence distribution, one box per emotion.
fig = plt.figure(figsize=(24, 10))
e_values = [e[-1] for e in parse_emotions()]
plt.boxplot(e_values)
# plt.boxplot draws the boxes at x = 1..N, so the tick positions must start at 1;
# starting at 0 would put every emotion name one box to the left of its data.
plt.xticks(range(1, len(e_values) + 1), unique_emotions, rotation=90)
plt.show()


In [None]:
# Mean and standard deviation of the label confidence for each emotion,
# collected as plain rows and turned into one DataFrame indexed by emotion.
emotion_names = []
stat_rows = []
for emo, e in parse_emotions():
    emotion_names.append(emo)
    stat_rows.append([np.mean(e), np.std(e)])

df_mean_std = pd.DataFrame(stat_rows, columns=['mean', 'std'], index=emotion_names)

In [None]:
# Order the emotions by ascending mean confidence.
df_mean_std = df_mean_std.sort_values(by='mean')

In [None]:
# Show up to 41 rows of the sorted table.
# NOTE(review): 41 is presumably the number of emotions — confirm against n_emotions.
df_mean_std.head(41)

In [None]:
# Mean confidence per emotion, with a dashed reference line at 1 / n_emotions.
fig, ax = plt.subplots(figsize=(8, 8))
ax.bar(df_mean_std.index.to_numpy(), df_mean_std['mean'].to_numpy())
ax.set_title('Mean Confidence level per emotion')
ax.axhline(y=1 / n_emotions, color='red', linestyle='--', label=f'1/{n_emotions}')
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
ax.legend()
plt.show()

In [None]:
# Average number of characters per utterance over the whole frame.
utterance_lengths = df['uttr'].apply(len)
mean_length_u = utterance_lengths.mean()
print('DATASET: mean length of utterances', mean_length_u)

In [None]:
# Average utterance length broken down by emotion.
def parse_utterances():
    """Yield the mean character length of the utterances of each emotion.

    Values are produced in the order of ``unique_emotions`` and are computed
    from the global ``df``.
    """
    for emo in unique_emotions:
        in_emotion = df['eb+_emot'] == emo
        yield df.loc[in_emotion, 'uttr'].apply(len).mean()


# Bar chart of the mean utterance length per emotion.
u_values = list(parse_utterances())
fig, ax = plt.subplots(figsize=(8, 8))
ax.bar(unique_emotions, u_values)
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
plt.show()

In [None]:
# Working copy for the seaborn plots, plus lookup tables between emotion names
# and integer category codes (codes follow the order of `unique_emotions`).
df_sns = balance_df.copy()
dict_emotion_to_category = {emotion: code for code, emotion in enumerate(unique_emotions)}
dict_category_to_emotion = dict(enumerate(unique_emotions))
print(dict_emotion_to_category)

In [None]:
# Replace each emotion name with its integer category code.
# NOTE(review): this overwrites the column in place, so running the cell a second time
# looks up integers in a dict keyed by emotion names and fails.
df_sns['eb+_emot'] = df_sns['eb+_emot'].apply(lambda x: dict_emotion_to_category[x])

In [None]:
# Replace each utterance with its character length (also an in-place overwrite).
df_sns['uttr'] = df_sns['uttr'].apply(len)

In [None]:
# Display the encoded frame.
df_sns

In [None]:
import seaborn as sns

# Pairwise scatter/histogram grid of utterance length, emotion code and confidence.
pairplot_columns = ['uttr', 'eb+_emot', 'label_confidence']
sns.pairplot(df_sns[pairplot_columns])
plt.show()

In [None]:
# Side-by-side bars per emotion: mean label confidence vs. mean utterance length.
fig = plt.figure(figsize=(14, 8))
t = np.arange(len(unique_emotions))  # one x position per emotion

# Mean label confidence, re-ordered to follow `unique_emotions`.
mm = df_mean_std.loc[unique_emotions, 'mean']
plt.bar(t - 0.2, mm, width=0.4, label='Mean Confidence')

# Min-max scale the mean lengths to [0, 1] so they share an axis with the confidences.
u_array = np.asarray(u_values, dtype=float)
scaled_u_values = (u_array - u_array.min()) / (u_array.max() - u_array.min())

# The plotted values are min-max scaled, not expressed in millions, so the label says so.
plt.bar(t + 0.2, scaled_u_values, width=0.4, label='Mean Utterance length [min-max scaled]')

plt.title('Comparison of Mean Confidence and Mean Utterance length for Each Emotion')
plt.xticks(t, unique_emotions, rotation=90)  # emotion names on the x-axis

plt.legend()
plt.show()

In [None]:
# fig = plt.figure(figsize=(12, 8))
# X = np.random.choice([0,1], size=50)
# Y = np.random.randint(low=0, high=100, size=50)
# coeff = np.polyfit(X, Y, 1)
# poly = np.poly1d(coeff)
# plt.scatter(X, Y, color='red', label='Dati Osservati')
# plt.plot(X, poly(X), color='orange', label='Rigressione polinomiale')
# plt.xlabel('X (Variabile dicotomica)')
# plt.ylabel('Y (Variabile dipedente)')
# plt.legend()
# plt.show()

# CORRELATIONS

In [None]:
# Two-column frame: utterance length (column 'uttr') next to 'label_confidence'.
utterance_length = df['uttr'].apply(len)
df_length_to_coeff = pd.concat([utterance_length, df['label_confidence']], axis=1)

In [None]:
# Preview the length/confidence frame.
df_length_to_coeff.head()

In [None]:
# Spearman rank correlation between utterance length and label confidence, as a heatmap.
corr_length_to_coeff = df_length_to_coeff.corr(method='spearman')
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr_length_to_coeff,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=.5,
    vmax=1.0,
    vmin=-1.0,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix between length of utterance and confidence level')
plt.show()

# CLUSTER

In [None]:
import gensim.downloader as api
# Load the "glove-twitter-50" embedding model through the gensim downloader.
model_twitter = api.load("glove-twitter-50")
# model_wiki = api.load("glove-wiki-gigaword-50")

In [None]:
# len(model_twitter) is reported as the vocabulary size of the loaded model.
print(f"Vocabulary size twitter model:   {len(model_twitter)}")
# print(f"Vocabulary size wikipedia model: {len(model_wiki)}")

In [None]:
# Raw utterances and their emotion labels as NumPy arrays, in balance_df row order.
dataset = balance_df['uttr'].to_numpy()
labels = balance_df['eb+_emot'].to_numpy()

In [None]:
import re

# pattern1 matches line breaks; pattern2 matches e-mail addresses.
pattern1 = re.compile('\n')
# Raw string: the regex escape `\.` must reach `re` unchanged; in a normal string literal
# it is an invalid escape sequence that newer Python versions warn about.
pattern2 = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

# Replace line breaks with spaces, then strip e-mail addresses from every utterance.
# A plain comprehension does the same per-element work as np.vectorize without the
# round trip through a NumPy string array.
test = [pattern2.sub('', pattern1.sub(' ', utterance)) for utterance in dataset]
# Lower-case and split on single spaces; consecutive spaces produce empty tokens.
tokens = [x.strip().lower().split(' ') for x in test]


In [None]:
def vectorize(tokens_, use_sum=True, model=None):
    """Turn tokenised sentences into fixed-size vectors by pooling word embeddings.

    Parameters
    ----------
    tokens_ : sequence of sequences of str
        One token list per sentence.
    use_sum : bool, default True
        If True the embeddings of a sentence are summed, otherwise averaged.
    model : optional
        Embedding lookup supporting ``token in model``, ``model.get_vector(token)``
        and ``model.vector_size``. Defaults to the module-level ``model_twitter``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(tokens_), model.vector_size)``. Sentences without any
        in-vocabulary token are left as all-zero rows in both pooling modes
        (previously the mean of an empty list produced NaN rows).
    """
    if model is None:
        model = model_twitter
    # Take the width from the model instead of hard-coding 50.
    vectors = np.zeros((len(tokens_), model.vector_size))
    pool = np.sum if use_sum else np.mean
    for i, sentence in enumerate(tokens_):
        embs = [model.get_vector(token) for token in sentence if token in model]
        if embs:  # nothing to pool -> keep the zero row
            vectors[i] = pool(embs, axis=0)
    return vectors
        

In [None]:
# One pooled embedding vector per utterance (default use_sum=True, i.e. summed).
embeddings = vectorize(tokens)

In [None]:
# Sanity-check the shape of the embedding matrix.
embeddings.shape

In [None]:
import umap

In [None]:
# UMAP reducer projecting to 3 components with 5 neighbours.
# NOTE(review): no random_state is passed, so the projection (and everything computed
# from `to_plot` below) may differ between runs — consider fixing a seed.
reducer = umap.UMAP(n_components=3, n_neighbors=5)

In [None]:
# Fit the reducer on all embeddings and project them.
to_plot = reducer.fit_transform(embeddings)

In [None]:
# Display the projected coordinates.
to_plot

In [None]:
# Assuming 'labels' and 'to_plot' are defined
import plotly.express as px

limit = 30000  # maximum number of points to draw

# `labels` / `to_plot` follow balance_df, which was built emotion by emotion, so the first
# `limit` rows would cover only the first few emotions. Draw a seeded random sample instead.
sample_idx = np.sort(
    np.random.default_rng(0).choice(len(labels), size=min(limit, len(labels)), replace=False)
)

# Use a dedicated name for the plotting frame so the global `df` used by earlier cells
# (parse_emotions, parse_utterances, ...) is not overwritten.
df_plot = pd.DataFrame({
    'x': to_plot[sample_idx, 0],
    'y': to_plot[sample_idx, 1],
    'z': to_plot[sample_idx, 2],
    'labels': labels[sample_idx],
})

# 3D scatter plot coloured by emotion; Plotly assigns the colours itself.
fig = px.scatter_3d(df_plot, x='x', y='y', z='z', color='labels')

# Show the plot
fig.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

# Integer-encode the emotion labels, then make a stratified 70/30 split of the UMAP coordinates.
translated_labels = [dict_emotion_to_category[emotion] for emotion in labels]
split_options = dict(stratify=translated_labels, test_size=0.3, random_state=12, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(to_plot, translated_labels, **split_options)

In [None]:
from sklearn.cluster import HDBSCAN

# Density-based clustering of all projected points (train and test rows together).
cluster = HDBSCAN(min_cluster_size=600, n_jobs=-1).fit(to_plot)

In [None]:
# Pair every point's cluster id with its true (integer-encoded) emotion.
train_f = cluster.labels_
df_cluster = pd.DataFrame(dict(predicted_label=train_f, label=translated_labels))

In [None]:
# Display the cluster id assigned to each point.
train_f

In [None]:
# Map every cluster id to the emotion code that occurs most often inside that cluster.
cluster_table = {}
for name, group in df_cluster.groupby('predicted_label'):
    true_codes, frequencies = np.unique(group['label'].to_numpy(), return_counts=True)
    # argmax returns the first maximum, so ties go to the smallest emotion code.
    majority_code = true_codes[np.argmax(frequencies)]
    print(f'Label {name} assigned {dict_category_to_emotion[majority_code]}')
    cluster_table[name] = majority_code

In [None]:
# Display the cluster id -> emotion code mapping.
cluster_table

In [None]:
def compute_metrics(y_true, y_pred):
    """Print the accuracy and the weighted F1 score of ``y_pred`` against ``y_true``.

    The function only prints the two scores; it returns ``None``.
    """
    acc = accuracy_score(y_true, y_pred)
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    print(f'TEST SET: accuracy: {acc} ; f1 score: {weighted_f1}')

In [None]:
# cluster.labels_ follows the row order of `to_plot`, while train_test_split shuffled the
# rows, so the test samples are NOT the last len(X_test) rows. Recover the original row
# positions of the test samples by repeating the split on the row indices with the same
# stratification, test_size and random_state used for X_train / X_test.
all_rows = np.arange(len(translated_labels))
_train_rows, test_rows = train_test_split(
    all_rows,
    stratify=translated_labels,
    test_size=0.3,
    random_state=12,
    shuffle=True,
)
# Translate each test sample's cluster id into that cluster's majority emotion code.
y_pred = [cluster_table[cluster_id] for cluster_id in cluster.labels_[test_rows]]

In [None]:
# Score the cluster-derived predictions against the test labels.
compute_metrics(y_test, y_pred)

In [None]:
# How often each emotion is predicted by the cluster-majority mapping.
y_pred_translated = [dict_category_to_emotion[code] for code in y_pred]
y_pred_translated_unique, y_pred_translated_count = np.unique(y_pred_translated, return_counts=True)

fig = plt.figure(figsize=(5, 5))
plt.bar(y_pred_translated_unique, y_pred_translated_count)
tick_positions = range(len(y_pred_translated_unique))
plt.xticks(tick_positions, y_pred_translated_unique, rotation=90)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest on the UMAP coordinates; fixed seed for repeatable trees.
classifier = RandomForestClassifier(n_jobs=8, random_state=99).fit(X_train, y_train)

In [None]:
# Predict emotion codes for the held-out rows.
y_pred = classifier.predict(X_test)

In [None]:
# Print accuracy and weighted F1 for the current classifier.
compute_metrics(y_test, y_pred)

In [None]:
# How often each emotion is predicted by the current classifier.
# (The translation of y_pred was previously computed twice in this cell; once is enough.)
y_pred_translated = [dict_category_to_emotion[y] for y in y_pred]
y_pred_translated_unique, y_pred_translated_count = np.unique(y_pred_translated, return_counts=True)

fig = plt.figure(figsize=(10, 10))
plt.bar(y_pred_translated_unique, y_pred_translated_count)
plt.xticks(range(len(y_pred_translated_unique)), y_pred_translated_unique, rotation=90)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Same experiment with a histogram-based gradient boosting classifier.
classifier = HistGradientBoostingClassifier(random_state=99).fit(X_train, y_train)
y_pred = classifier.predict(X_test)
compute_metrics(y_test, y_pred)


In [None]:
# How often each emotion is predicted by the current classifier.
# (The translation of y_pred was previously computed twice in this cell; once is enough.)
y_pred_translated = [dict_category_to_emotion[y] for y in y_pred]
y_pred_translated_unique, y_pred_translated_count = np.unique(y_pred_translated, return_counts=True)

fig = plt.figure(figsize=(10, 10))
plt.bar(y_pred_translated_unique, y_pred_translated_count)
plt.xticks(range(len(y_pred_translated_unique)), y_pred_translated_unique, rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Emotion names (instead of integer codes) for the predictions and the ground truth.
y_pred_translated = [dict_category_to_emotion[code] for code in y_pred]
y_true_translated = [dict_category_to_emotion[code] for code in y_test]

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

# Confusion matrix of true vs. predicted emotion names on one large axes.
fig, ax = plt.subplots(figsize=(20, 20))
disp = ConfusionMatrixDisplay.from_predictions(
    y_true=y_true_translated,
    y_pred=y_pred_translated,
    ax=ax,
    xticks_rotation='vertical',
    cmap=plt.cm.Blues,
)

In [None]:
# Print scikit-learn's classification report for the translated predictions.
print(classification_report(y_true_translated,y_pred_translated))