In [2]:
# %%
import seaborn as sns
import pandas as pd
from pathlib import Path
from loguru import logger
import numpy as np
import warnings
import matplotlib.pyplot as plt
import json
import tomllib
import re
from sklearn.feature_extraction.text import CountVectorizer
from wa_analyzer.model import TextClustering

# Build the text-clustering helper; it is invoked further down in the
# "Run clustering" step.
clustering = TextClustering()

# From this point on, suppress FutureWarning messages so they do not clutter
# the notebook output.
warnings.filterwarnings(action="ignore", category=FutureWarning)

# %%
# --- Load configuration and data ---
# config.toml sits one directory above the notebook. Its "processed" entry is
# the folder holding preprocessed data and "current" is the file to analyse.
configfile = Path("../config.toml").resolve()
with configfile.open("rb") as f:
    config = tomllib.load(f)

datafile = (Path("..") / Path(config["processed"]) / config["current"]).resolve()
if not datafile.exists():
    # Fail fast with an actionable message instead of logging a warning and
    # then letting pd.read_parquet fail on the missing file anyway.
    missing_msg = (
        f"Datafile {datafile} does not exist. "
        "First run src/preprocess.py, and check the timestamp!"
    )
    logger.error(missing_msg)
    raise FileNotFoundError(missing_msg)

df = pd.read_parquet(datafile)

# --- Filter for the word 'camera' in the 'message' column ---
# Case-insensitive match; rows with a missing message count as non-matching.
camera_df = df[df['message'].str.contains('camera', case=False, na=False)]
print(f"Number of messages containing 'camera': {len(camera_df)}")



Number of messages containing 'camera': 118


In [None]:
# %%
# --- Count messages per author ---
# Number of most-active authors to keep (original note: default = 20 or 5).
top_author_count = 40

# value_counts() sorts by frequency, so the first rows are the busiest authors.
messages_per_author = df['author'].value_counts()
top_authors = messages_per_author.head(top_author_count).index

# Work on an independent copy that only holds messages from those authors.
is_top_author = df['author'].isin(top_authors)
df_top = df[is_top_author].copy()

# Sorted list of the distinct author names that remain.
authors = list(np.unique(df_top.author))
print(f"Number of top authors: {len(authors)}")

# %%
# --- Helper to remove URLs ---
def remove_url(text):
    """Return ``text`` with URL-like tokens deleted.

    Every occurrence of ``http`` or ``www`` that is followed by at least one
    non-whitespace character is removed, up to (not including) the next
    whitespace. Surrounding whitespace is left as it is, so a removed URL can
    leave two adjacent spaces behind.
    """
    url_pattern = r"http\S+|www\S+|https\S+"
    cleaned = re.sub(url_pattern, "", text)
    return cleaned

# %%
# --- Create corpus of text chunks per author ---
n = 3000       # chunk size, in characters
min_parts = 2  # an author is kept only with MORE than this many chunks

# Maps author -> list of cleaned text chunks of at most n characters each.
corpus = {}
for author in authors:
    # Missing messages are skipped because " ".join() only accepts strings.
    subset = df_top.loc[df_top.author == author, "message"].dropna()
    longseq = " ".join(subset)

    # Clean BEFORE chunking: slicing first can cut a URL in two, and the tail
    # that lands in the next chunk no longer starts with "http"/"www", so it
    # would survive the URL filter. Cleaning first also keeps every chunk
    # (except possibly the last) at exactly n characters.
    longseq = remove_url(longseq)
    longseq = re.sub(" +", " ", longseq)

    chunks = [longseq[start:start + n] for start in range(0, len(longseq), n)]
    if len(chunks) > min_parts:
        corpus[author] = chunks

print(f"\nFinal corpus authors: {list(corpus.keys())}")

# %%
# --- Feature extraction ---
# Represent each chunk by the counts of its character trigrams.
vectorizer = CountVectorizer(analyzer="char", ngram_range=(3, 3))

# Flatten the per-author chunk lists into a single list, in corpus order.
parts = [chunk for author_chunks in corpus.values() for chunk in author_chunks]

# fit_transform returns a sparse matrix; convert it to a plain dense ndarray.
sparse_counts = vectorizer.fit_transform(parts)
X = np.asarray(sparse_counts.todense())
print(f"Feature matrix shape: {X.shape}")

# %%
# --- Load author metadata ---
# JSON is UTF-8 by specification. Passing the encoding explicitly keeps
# non-ASCII author names intact on platforms whose default encoding differs.
with open("nested_users5.json", "r", encoding="utf-8") as f:
    nested_users = json.load(f)

# The top-level JSON keys become the columns of the raw frame; transposing
# turns each of them into a row, and the resulting index is exposed as the
# 'author' column.
# NOTE(review): assumes the top-level keys are author names matching
# df['author'] -- confirm against the file.
author_info_df = (
    pd.DataFrame(nested_users)
    .T
    .reset_index()
    .rename(columns={'index': 'author'})
)

# %%
# --- Prepare labels for clustering based on age ---
# One label per chunk: each author name is repeated once for every chunk they
# contributed, in the same order in which the chunks are flattened elsewhere.
wa_labels = [author for author, chunks in corpus.items() for _ in chunks]
age_map = author_info_df.set_index("author")["Older_then_50"].to_dict()

# Authors without a metadata entry fall through to the "younger" color below.
# Report them so that this fallback is not mistaken for real age information.
authors_without_age = sorted(set(wa_labels) - set(age_map))
if authors_without_age:
    logger.warning(
        f"No 'Older_then_50' metadata for {len(authors_without_age)} author(s): "
        f"{authors_without_age}. They are colored as 'Younger than 50'."
    )

# Encode labels and assign colors: orange when the flag is truthy, else green.
# NOTE(review): the flag is tested by truthiness, so a non-empty string such as
# "False" would count as older -- confirm the JSON stores real booleans.
colors = [
    "tab:orange" if age_map.get(author, None) else "tab:green"
    for author in wa_labels
]

# %%
# --- Run clustering ---
# Flatten the per-author chunk lists into one list, in corpus order.
flat_chunks = [chunk for author_chunks in corpus.values() for chunk in author_chunks]

# labels=None because the points are recolored manually afterwards.
# method can be switched to "PCA".
clustering(
    text=flat_chunks,
    k=500,
    labels=None,
    batch=False,
    method="tSNE",
)

# %%
# --- Overlay points with manual colors ---
# Extract the scatter points from the last clustering plot. This only works
# while that plot is still the current matplotlib axes (presumably drawn by
# the clustering call -- confirm), so check before indexing.
ax = plt.gca()
if not ax.collections:
    raise RuntimeError(
        "No scatter plot found on the current axes. Run the clustering step "
        "in the same cell so that its figure is still active."
    )
sc = ax.collections[0]
offsets = sc.get_offsets()

# Every plotted point needs exactly one color; a mismatch means the scatter
# and the color list were built from different data.
if len(offsets) != len(colors):
    raise ValueError(
        f"Found {len(offsets)} plotted points but {len(colors)} colors; "
        "re-run the corpus, label and clustering steps together."
    )

plt.clf()  # Clear previous plot
plt.scatter(offsets[:, 0], offsets[:, 1], c=colors, s=50, alpha=0.8)

# Legend: proxy markers, because the scatter is colored point by point and so
# has no per-group handles of its own.
legend_elements = [
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='tab:orange', markersize=10, label='Older than 50'),
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='tab:green', markersize=10, label='Younger than 50')
]
plt.legend(handles=legend_elements, title="Age", bbox_to_anchor=(1.05, 1), loc="upper left")

# The projected coordinates have no interpretable units, so hide the ticks.
plt.xticks([])
plt.yticks([])
plt.title("WhatsApp messages (highlighting Age)")

plt.show()