Exploratory Data Analysis (EDA)

In [None]:
# Names of the DataFrame columns this notebook reads: the article body and
# its reference summary.
CLEAN_TEXT_COLUMN = "article"
SUMMARY_COLUMN = "highlights"

In [None]:
import ssl
import certifi

# Replace the ssl module's (private) default-HTTPS-context factory with one
# that verifies against certifi's CA bundle.
# NOTE(review): presumably a workaround for certificate errors during the
# nltk.download() calls below -- confirm it is still needed; it relies on a
# private attribute of `ssl`.
ssl._create_default_https_context = lambda: ssl.create_default_context(cafile=certifi.where())

from collections import Counter

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud

# NOTE(review): EventExtractor comes from a package named `to_delete` --
# confirm this module is meant to stay before relying on it.
from to_delete.event_extraction import EventExtractor

# Download the NLTK resources used later in the notebook: the "stopwords"
# corpus is read via nltk.corpus.stopwords; "punkt"/"punkt_tab" are presumably
# the tokenizer data needed by word_tokenize -- confirm for the installed
# NLTK version.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

# Use seaborn's "whitegrid" style for the plots that follow.
sns.set(style="whitegrid")


In [None]:
# Load the processed training split and preview its first rows.
train_csv_path = "../data/processed/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Per-article length features: character count and whitespace-split word count.
article_text = df[CLEAN_TEXT_COLUMN]
df["char_len"] = article_text.str.len()
df["word_len"] = article_text.str.split().str.len()

# Histogram of the number of words per article.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df["word_len"], bins=40, color="purple", ax=ax)
ax.set_title("Distribution of Article Word Lengths")
ax.set_xlabel("Words per article")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Concatenate every article into a single corpus string (also reused by the
# word-frequency cell). The column is read through CLEAN_TEXT_COLUMN, like the
# length-statistics cell, instead of a hard-coded "clean_text" name; missing
# values are dropped and entries cast to str so that str.join cannot fail on NaN.
text = " ".join(df[CLEAN_TEXT_COLUMN].dropna().astype(str))
wordcloud = WordCloud(width=1200, height=600, background_color="white").generate(text)

# Render the word cloud as an image without axes.
plt.figure(figsize=(15, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Wordcloud of Corpus", fontsize=20)
plt.show()


In [None]:
from nltk.tokenize import word_tokenize

# Tokenize the lower-cased corpus string built in the word-cloud cell.
words = word_tokenize(text.lower())

stopwords = set(nltk.corpus.stopwords.words("english"))


def _is_content_word(token):
    """Return True for alphanumeric tokens that are not English stopwords."""
    return token.isalnum() and token not in stopwords


filtered = [token for token in words if _is_content_word(token)]

# The 30 most frequent remaining tokens, as a two-column table.
word_counts = Counter(filtered)
freq = word_counts.most_common(30)
freq_df = pd.DataFrame(freq, columns=["word", "count"])

# Horizontal bar chart: one bar per word, length = occurrence count.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=freq_df, x="count", y="word", palette="viridis", ax=ax)
ax.set_title("Top 30 Most Frequent Words")
plt.show()


In [None]:
from nltk.util import ngrams

# Count adjacent token pairs in the filtered token list and keep the 20 most
# frequent ones.
bigrams = Counter(ngrams(filtered, 2)).most_common(20)
bigram_df = pd.DataFrame(bigrams, columns=["bigram", "count"])

# Each bigram is a tuple of two tokens; join it into one label for the axis.
bigram_df["bigram"] = bigram_df["bigram"].apply(" ".join)

# Horizontal bar chart of bigram counts. The y axis must name the "bigram"
# column: bigram_df has no "hue" column, so y="hue" made seaborn raise.
plt.figure(figsize=(10, 6))
sns.barplot(data=bigram_df, x="count", y="bigram", palette="magma")
plt.title("Top 20 Bigrams")
plt.show()


In [None]:
extractor = EventExtractor()

# Run entity extraction on the first five articles as a quick sanity check.
# The text column is read through CLEAN_TEXT_COLUMN (consistent with the
# length-statistics cell) and .head(5) is used instead of label lookups
# 0..4, so this also works with fewer than five rows or a non-default index.
sample_entities = [
    extractor.extract_entities(article)
    for article in df[CLEAN_TEXT_COLUMN].head(5)
]

sample_entities


In [None]:
# Collect the detected event type for each of the first 200 articles
# (a subset, for speed). The loop variable is deliberately not called `text`:
# that name holds the whole-corpus string used by the word-frequency cell and
# would otherwise be overwritten when this cell runs.
events = []
for article in df[CLEAN_TEXT_COLUMN].head(200):
    # find_event_trigger returns a (trigger, event_type) pair; only the type
    # is needed here.
    _, event_type = extractor.find_event_trigger(article)
    events.append(event_type if event_type else "unknown")

event_counts = Counter(events)

# Bar chart of how often each event type (or "unknown") occurred.
plt.figure(figsize=(8, 5))
sns.barplot(x=list(event_counts.keys()), y=list(event_counts.values()), palette="coolwarm")
plt.title("Event Type Distribution")
plt.xlabel("Event Type")
plt.ylabel("Count")
plt.show()