In [None]:
import pickle
import os
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the pickled message table produced by an earlier step.
# NOTE: pickle.load can execute arbitrary code; only use it on files you trust.
MESSAGES_PICKLE_PATH = "../output/messages_df.pkl"
with open(MESSAGES_PICKLE_PATH, mode="rb") as pickle_file:
    df = pickle.load(pickle_file)

## 1. Find the sender who sent the most messages

In [None]:
# Delete malformed messages: rows whose sender field contains a newline or the
# literal marker '3.2α'.
# Both patterns are matched literally (regex=False): with the default regex
# matching, the '.' in '3.2α' would match any character.
# na=False makes a missing sender count as "not malformed", so the boolean mask
# never contains NaN (which would break the `~` negation); such rows are kept.
sender_column = df['sender']
has_newline = sender_column.str.contains('\n', regex=False, na=False)
has_version_marker = sender_column.str.contains('3.2α', regex=False, na=False)
df = df[~(has_newline | has_version_marker)]

In [None]:
# Distinct values found in the sender column.
all_senders = pd.unique(df["sender"])
all_senders

Number of messages for each sender

In [None]:
# Count messages per sender in a single pass over the column instead of
# re-filtering the whole DataFrame once per sender.
# The dict keeps the key order of `all_senders` and stores plain Python ints;
# a sender that never appears in the counts maps to 0.
messages_by_sender = df["sender"].value_counts()
sender_dict = {
    sender_name: int(messages_by_sender.get(sender_name, 0))
    for sender_name in all_senders
}

sender_dict

In [None]:
# Bar chart: one bar per sender, height = number of messages.
bar_labels = list(sender_dict.keys())
bar_heights = list(sender_dict.values())

fig, ax = plt.subplots(figsize=(6, 3))
ax.bar(bar_labels, bar_heights)
# Restyle the tick labels that bar() already placed rather than calling
# set_xticklabels(), which replaces the axis formatter without fixing the tick
# positions and may emit a warning on recent Matplotlib versions.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_ylabel("Numero di messaggi")
ax.set_title("Messaggi per mittente")
plt.tight_layout()

plt.show()

In [None]:
# Daily message volume: group every message by the calendar date of its
# timestamp and count the rows in each group.
message_dates = df['timestamp'].dt.date
msg_per_day = df.groupby(message_dates).size()

# Line chart of the daily counts on an explicitly created figure/axes pair.
day_fig, day_ax = plt.subplots(figsize=(12, 6))
msg_per_day.plot(kind='line', ax=day_ax)
day_ax.set_title("Numero di messaggi per giorno")
day_ax.set_xlabel("Data")
day_ax.set_ylabel("Numero di messaggi")
plt.setp(day_ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Busiest day(s): every date whose message count equals the overall maximum
# (ties are all kept).
max_nmessages = msg_per_day.max()
is_busiest_day = msg_per_day.eq(max_nmessages)
timestamp = msg_per_day.loc[is_busiest_day]
timestamp

In [None]:
from datetime import date
# Quietest day(s): every date whose message count equals the overall minimum
# (ties are all kept).
min_n_messages = msg_per_day.min()
is_quietest_day = msg_per_day.eq(min_n_messages)
timestamp = msg_per_day.loc[is_quietest_day]

timestamp

In [None]:
# Per-sender quietest and busiest days.
# Loop-local names are used so the notebook-level `msg_per_day` and
# `min_n_messages` are not overwritten, and the results are collected into a
# table instead of being discarded.
per_sender_rows = {}
for sender in sender_dict:
    sender_messages = df[df["sender"] == sender]
    sender_msg_per_day = sender_messages.groupby(sender_messages['timestamp'].dt.date).size()
    if sender_msg_per_day.empty:
        # No dated messages for this sender: min()/max() would be NaN.
        continue

    sender_min = sender_msg_per_day.min()
    sender_max = sender_msg_per_day.max()
    per_sender_rows[sender] = {
        "min_messages": int(sender_min),
        # All days tied for the minimum / maximum are kept.
        "min_days": list(sender_msg_per_day.index[sender_msg_per_day == sender_min]),
        "max_messages": int(sender_max),
        "max_days": list(sender_msg_per_day.index[sender_msg_per_day == sender_max]),
    }

# One row per sender, displayed as the cell output.
sender_extremes = pd.DataFrame.from_dict(per_sender_rows, orient="index")
sender_extremes
    

In [None]:
import nltk
import re
from collections import Counter
from nltk.corpus import stopwords

# Download the NLTK 'stopwords' corpus, then build a set of the Italian
# stopwords for fast membership tests.
nltk.download('stopwords')
stop_words = {*stopwords.words('italian')}

def preprocess(text, stopword_set=None):
    """Normalize a message and split it into non-stopword tokens.

    Steps: lowercase, remove digit runs, replace punctuation with spaces,
    split on whitespace, drop stopwords.

    Punctuation is replaced by a space rather than deleted so that elided
    forms such as "l'amico" become "l" + "amico" instead of the merged
    token "lamico".

    Args:
        text: Message text (a ``str``).
        stopword_set: Optional container of lowercase words to discard.
            Defaults to the notebook-level ``stop_words`` set.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    if stopword_set is None:
        stopword_set = stop_words
    text = text.lower()  # lowercase
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = re.sub(r'[^\w\s]', ' ', text)  # punctuation -> space
    # simple whitespace tokenization, then stopword removal
    return [word for word in text.split() if word not in stopword_set]

# Tokenize every non-null message, then flatten the per-message token lists
# into one list, keeping only tokens longer than 3 characters.
non_null_messages = df['message'].dropna()
tokens_series = non_null_messages.apply(preprocess)

all_tokens = []
for message_tokens in tokens_series:
    all_tokens.extend(tok for tok in message_tokens if len(tok) > 3)

In [None]:
# Tally how often each token occurs and keep the ten most frequent ones.
word_freq = Counter()
word_freq.update(all_tokens)
most_common_words = word_freq.most_common(10)

most_common_words

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build the word cloud from the token frequencies.
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate_from_frequencies(word_freq)

# Show the cloud as an image with the axes hidden.
cloud_fig, cloud_ax = plt.subplots(figsize=(12, 6))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
plt.show()
