# WhatsApp Chat Analysis and Classification

This notebook demonstrates a workflow for analyzing and classifying messages from a WhatsApp chat file. The program extracts messages, preprocesses the data, and trains a machine learning model to predict which author wrote each message. The workflow includes the following steps:

1. **Data Extraction**: Parsing the WhatsApp chat file to extract messages, authors, and timestamps.
2. **Data Preprocessing**: Splitting the data into training and validation sets, and converting text and labels into a format suitable for model training.
3. **Tokenization**: Using a pre-trained tokenizer to prepare the text data for input into a BERT-based model.
4. **Model Training**: Fine-tuning a BERT model for sequence classification to predict the author of a given message.
5. **Evaluation**: Assessing the model's performance using accuracy as the evaluation metric.
6. **Prediction**: Classifying new messages to identify their likely authors.

This workflow leverages Hugging Face's Transformers library, along with other Python libraries such as pandas, scikit-learn, and PyTorch.

In [None]:
# Install required packages
!pip install transformers datasets torch scikit-learn pandas datasets evaluate

In [None]:
# Import libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import torch
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
import os
from datasets import Dataset
import numpy as np
import evaluate

In [None]:
# Function to extract data from Whatsapp chat file
def extract_df(chat_file):
    """Parse a WhatsApp chat export into a DataFrame of messages.

    A line starts a new message when it matches
    ``<d/m/y>, <HH:MM> - <author>: <text>``. Any other line is treated as a
    continuation of the message in progress and is joined to it with a single
    space. Lines that appear before the first message header are dropped.

    The first ten rows of the result are printed as a quick sanity check.

    Args:
        chat_file: Path to the UTF-8 encoded chat export.

    Returns:
        A DataFrame with the columns ``date`` (converted with
        ``pd.to_datetime``), ``author`` and ``message``, one row per message.
        When no line matches the message pattern, the frame is empty but
        still has these three columns.
    """
    # Header of a message: "<date>, <HH:MM> - <author>: <text>"
    pattern = re.compile(
        r'^(\d{1,2}/\d{1,2}/\d{2,4}, \d{2}:\d{2}) - ([^:]+): (.+)$'
    )

    # One entry per message: (date, author, [text parts]); the last entry is
    # the message currently being assembled.
    messages = []

    # Stream the file instead of loading every line into memory first.
    with open(chat_file, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()  # drop surrounding whitespace / newline
            match = pattern.match(line)
            if match:
                date, author, text = match.groups()
                messages.append((date, author, [text]))
            elif messages:
                # Continuation of a multi-line message
                messages[-1][2].append(line)

    data = [
        {'date': date, 'author': author, 'message': " ".join(parts)}
        for date, author, parts in messages
    ]

    # Passing the columns explicitly keeps the schema stable even when no
    # message was found; otherwise df['date'] below would raise KeyError.
    df = pd.DataFrame(data, columns=['date', 'author', 'message'])

    # Convert the date to datetime for later analysis
    df['date'] = pd.to_datetime(df['date'])

    # Show the first rows
    print(df[:10])

    return df

In [None]:
# Data preprocessing function
def split_data(data):
    """Split chat messages into training and validation datasets.

    Args:
        data: DataFrame with a ``message`` column (the text) and an
            ``author`` column (the class to predict).

    Returns:
        A tuple ``(train_dataset, val_dataset, id_to_author)`` where the two
        datasets are Hugging Face ``Dataset`` objects with the columns
        ``text`` and ``label`` (integer author id), and ``id_to_author`` maps
        each integer id back to the author name.
    """
    # 80/20 split with a fixed seed so the split is reproducible
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        data['message'], data['author'], test_size=0.2, random_state=42
    )

    # Assign each distinct author an integer id, in order of first appearance
    authors = data['author'].unique()
    author_to_id = {author: i for i, author in enumerate(authors)}
    id_to_author = {i: author for author, i in author_to_id.items()}

    def to_dataset(texts, labels):
        # Pair texts and labels by position rather than by index label:
        # label-based lookups return a Series instead of a scalar when the
        # input frame has duplicate index values.
        frame = pd.DataFrame({
            'text': texts.tolist(),
            'label': labels.map(author_to_id).tolist(),
        })
        return Dataset.from_pandas(frame)  # Convert to Hugging Face Dataset

    train_dataset = to_dataset(train_texts, train_labels)
    val_dataset = to_dataset(val_texts, val_labels)

    return train_dataset, val_dataset, id_to_author

In [None]:
# Pre-trained tokenizer matching the BERT checkpoint that is fine-tuned below
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")


def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the module-level tokenizer.

    The text is padded to the tokenizer's maximum length and truncated when
    it is longer; the tokenizer's output is returned unchanged.
    """
    text = examples["text"]
    encoded = tokenizer(text, padding="max_length", truncation=True)
    return encoded

In [None]:
# Create dataset
chat_file = 'chat.txt'    # CHANGE THIS TO YOUR CHAT FILE
if not os.path.exists(chat_file):
    raise FileNotFoundError(f"Chat file {chat_file} not found.")
data = extract_df(chat_file)
train_dataset, val_dataset, id_to_author = split_data(data)

# batched=True hands the tokenizer many rows per call, which is much faster
# than tokenizing one example at a time and yields the same columns.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Work on at most MAX_SAMPLES shuffled rows per split. The cap is clamped to
# the split size because select() raises when asked for indices beyond the
# end of the dataset (the validation split is only 20% of the chat).
MAX_SAMPLES = 5000
small_train_dataset = train_dataset.shuffle(seed=42).select(
    range(min(MAX_SAMPLES, len(train_dataset)))
)
small_val_dataset = val_dataset.shuffle(seed=42).select(
    range(min(MAX_SAMPLES, len(val_dataset)))
)

print("Train dataset size: ", len(train_dataset))

# Preview up to ten training examples (fewer if the dataset is smaller)
for i in range(min(10, len(small_train_dataset))):
    print(small_train_dataset[i])


In [None]:
# Accuracy metric used to score the model during evaluation
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute the accuracy metric for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the predicted classes against
        the reference labels.
    """
    raw_scores, true_labels = eval_pred
    # The index of the largest logit along the last axis is the predicted
    # class, which turns raw scores into discrete class predictions.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(
        predictions=predicted_classes,
        references=true_labels,
    )


In [None]:
# Training configuration: write outputs to "test_trainer", evaluate at the
# end of every epoch, and disable reporting integrations.
training_args = TrainingArguments(
    output_dir="test_trainer",
    eval_strategy="epoch",
    report_to="none",
)

# BERT with a classification head that has one output per known author
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-cased",
    num_labels=len(id_to_author),
)

# Wire model, data, and metric together, then fine-tune
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Messages whose author should be predicted
new_texts = [
    "Hey, how are you doing?",
    "I am not feeling well today.",
    "HAPPY FRIDAY EVERYONE!",
    # Add the texts that you want to classify here
]

# Tokenize exactly as during training and return PyTorch tensors
inputs = tokenizer(new_texts, padding="max_length", truncation=True, return_tensors="pt")

# Use the GPU when one is available; the model and its inputs must live on
# the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Switch the model to evaluation mode for inference
model.eval()

# Move every input tensor to the model's device
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Highest-scoring class per message, mapped back to the author name
predictions = logits.argmax(dim=-1)
labels = [id_to_author[class_id] for class_id in predictions.tolist()]

print(id_to_author)

# Print each message together with its predicted author
for text, author in zip(new_texts, labels):
    print(f"Review: {text}")
    print(f"Predicted class: {author}\n")