# EDA: Twitter US Airline Sentiment

This notebook summarizes data exploration steps to justify preprocessing and model choices for the sentiment-aware chatbot.

Authors: Paula Llanos López, Samuel Rivero, Sara López Marín (EAFIT)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Widen the column display limit to 200 characters so tweet text is readable in previews.
pd.set_option('display.max_colwidth', 200)

# Relative path to the raw tweets CSV.
DATA_PATH = '../data/Tweets.csv'

# Keep only the sentiment label and the tweet text, and discard rows where
# either of the two is missing.
df = (
    pd.read_csv(DATA_PATH)
    .loc[:, ['airline_sentiment', 'text']]
    .dropna()
)
df.head()


In [None]:
# Class distribution: share of tweets per sentiment label, in percent.
class_pct = df['airline_sentiment'].value_counts(normalize=True).mul(100)

# Colour every bar by its label instead of by position. value_counts() orders
# the classes by descending frequency, so a positional colour list would pair
# the wrong colour with a class as soon as the class ranking changes
# (e.g. on a filtered or resampled frame).
sentiment_colors = {
    'negative': 'tab:red',
    'neutral': 'tab:blue',
    'positive': 'tab:green',
}
# Labels not listed above fall back to a neutral grey.
bar_colors = [sentiment_colors.get(label, 'tab:gray') for label in class_pct.index]

ax = class_pct.plot(kind='bar', color=bar_colors)
ax.set_title('Class distribution (%)')
ax.set_ylabel('%')
ax.set_xlabel('airline_sentiment')
plt.show()

# Per-tweet size features: character count and whitespace-delimited token count.
tweet_text = df['text'].astype(str)
df['len'] = tweet_text.str.len()
df['tokens'] = tweet_text.str.split().str.len()

# One boxplot per feature, side by side, grouped by sentiment class.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
panels = [('len', 'Char length by class'), ('tokens', 'Token count by class')]
for panel_ax, (feature, panel_title) in zip(axes, panels):
    sns.boxplot(data=df, x='airline_sentiment', y=feature, ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()


In [None]:
# Text-cleaning preview: patterns are compiled once and reused for every tweet.
url_re = re.compile(r'http\S+')        # anything starting with "http" up to the next whitespace
mention_re = re.compile(r'@\w+')       # @user handles
hash_re = re.compile(r'#\w+')          # hashtags, including the tag word itself
newline_re = re.compile(r'[\r\n]+')    # runs of CR/LF characters
space_re = re.compile(r'\s+')          # runs of any whitespace

def clean_text(s):
    """Return a lower-cased copy of `s` with URLs, mentions and hashtags removed.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Line breaks become spaces, whitespace runs collapse to a single
    space, and leading/trailing whitespace is stripped.
    """
    # Ordered (pattern, replacement) steps; lower-casing happens first so the
    # URL pattern also catches upper-case schemes.
    steps = (
        (url_re, ''),
        (mention_re, ''),
        (hash_re, ''),
        (newline_re, ' '),
        (space_re, ' '),
    )
    cleaned = str(s).lower()
    for pattern, replacement in steps:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned.strip()

# Draw a fixed, reproducible sample of five rows and show each tweet next to
# its cleaned version.
sample = df.sample(n=5, random_state=1)
cleaned_preview = sample['text'].map(clean_text)
sample.assign(cleaned=cleaned_preview)


## Notes
- The dataset is imbalanced (negative dominates). We will use macro F1 and class_weight='balanced'.
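- Cleaning lowercases the text and removes URLs, @mentions, and hashtags (the whole tag, including its word). No stop words or punctuation are removed, so negations are preserved.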
- TF–IDF with n-grams (1,2) should capture short phrases relevant for sentiment.
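- The train/test split is stratified to preserve class proportions, and it is performed before fitting the vectorizer so that no test-set vocabulary or IDF statistics leak into training.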
