In [None]:
import pandas as pd
from urllib.parse import urlparse, unquote
import re
from collections import defaultdict

# Load only the two columns this analysis uses, and only the first 50,000 rows
# while testing, so the rest of the (large) GDELT export is never parsed or
# held in memory. `nrows=50000` yields the same rows as reading everything and
# then calling `.head(50000)`; the trailing column selection keeps the original
# column order (`usecols` alone follows the order found in the file).
df = pd.read_csv(
    'gdelt-2024-11-20-000000000000.csv',
    usecols=['AvgTone', 'SOURCEURL'],
    nrows=50000,
)[['AvgTone', 'SOURCEURL']]

# Extract article names from URLs
def extract_article_name(url):
    """Return a human-readable article name taken from the last URL path segment.

    The path is percent-decoded, trailing slashes are ignored (so
    ``.../my-article/`` yields the same name as ``.../my-article``), and
    ``-`` / ``_`` separators are replaced with spaces. Query strings and
    fragments are not part of the path and are therefore dropped.

    Args:
        url: The article URL. Non-string values (e.g. ``None`` or the ``NaN``
            pandas uses for missing cells) are treated as "no URL".

    Returns:
        The decoded last path segment with separators turned into spaces, or
        an empty string when the URL is missing, malformed, or has no path.
    """
    if not isinstance(url, str):
        return ''
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. an unbalanced IPv6
        # bracket); one bad row should not abort a whole DataFrame.apply.
        return ''
    path = unquote(parsed.path)  # Decode URL-encoded characters
    # Strip trailing slashes first, otherwise the last segment of
    # "https://site/news/my-article/" would be the empty string.
    last_segment = path.rstrip('/').rsplit('/', 1)[-1]
    return re.sub(r'[-_]', ' ', last_segment)  # Replace '-' and '_' with spaces

df['article_name'] = df['SOURCEURL'].apply(extract_article_name)

# Tokenize article names into lowercase words
df['words'] = df['article_name'].apply(lambda x: re.findall(r'\b\w+\b', x.lower()))

# Build a sparse word-document COUNT matrix (one row per article, one column
# per vocabulary word). CountVectorizer() stores term counts, not 0/1 flags.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Join words for each article
df['text'] = df['words'].apply(lambda x: ' '.join(x))

vectorizer = CountVectorizer()
word_matrix = vectorizer.fit_transform(df['text'])


def _sparse_pearson(matrix, target):
    """Pearson correlation of every column of a sparse matrix with `target`.

    Only sparse sums and one sparse matrix-vector product are used, so memory
    stays proportional to the number of non-zero entries instead of
    rows x columns.

    Args:
        matrix: scipy sparse matrix of shape (n_rows, n_cols).
        target: 1-D numeric array-like of length n_rows.

    Returns:
        1-D float ndarray of length n_cols. Entries are NaN for columns with
        zero variance, or everywhere when fewer than two usable rows exist or
        `target` itself is constant.
    """
    target = np.asarray(target, dtype=float)
    # Ignore rows whose target is missing, as Series.corr does pairwise.
    usable_rows = np.flatnonzero(~np.isnan(target))
    matrix = matrix.tocsr()[usable_rows]
    target = target[usable_rows]

    n_rows, n_cols = matrix.shape
    if n_rows < 2:
        return np.full(n_cols, np.nan)

    centered = target - target.mean()
    col_sum = np.asarray(matrix.sum(axis=0), dtype=float).ravel()
    col_sq_sum = np.asarray(matrix.multiply(matrix).sum(axis=0), dtype=float).ravel()

    # sum((x - mean_x) * (y - mean_y)) == sum(x * (y - mean_y)) because the
    # centered target sums to zero, so the matrix never has to be centered
    # (which would destroy its sparsity).
    cross = np.asarray(matrix.T.dot(centered), dtype=float).ravel()
    col_ss = col_sq_sum - col_sum ** 2 / n_rows  # sum of squared deviations
    target_ss = float(centered.dot(centered))

    with np.errstate(divide='ignore', invalid='ignore'):
        result = cross / np.sqrt(col_ss * target_ss)
    # Zero-variance inputs have no defined correlation.
    result[col_ss <= 0] = np.nan
    # Guard against floating-point overshoot just outside [-1, 1].
    return np.clip(result, -1.0, 1.0)


# Calculate correlations between each word and AvgTone.
# NOTE: the previous approach densified the matrix with `.toarray()` into a
# `word_df` DataFrame (50000 x ~50000 int64, about 18.5 GiB) and raised
# MemoryError; the dense `word_df` is intentionally no longer created.
words = vectorizer.get_feature_names_out()
word_correlations = _sparse_pearson(word_matrix, df['AvgTone'].to_numpy(dtype=float))
correlations = dict(zip(words, word_correlations))

# Convert correlations to a sorted DataFrame
correlation_df = pd.DataFrame({'word': words, 'correlation': word_correlations}).sort_values(
    by='correlation', ascending=False
)

# Create an easy-to-read table for the top and bottom 20 words.
# Undefined (NaN) correlations sort last and would otherwise fill the
# "bottom" table, so they are excluded from the rankings only.
ranked = correlation_df.dropna(subset=['correlation'])
top_words = ranked.head(20)
bottom_words = ranked.tail(20)[::-1]

print(top_words)
print(bottom_words)


MemoryError: Unable to allocate 18.5 GiB for an array with shape (50000, 49784) and data type int64