In [11]:
# Builds more_info from selected columns and adds main_headline / print_headline (parsed from headline)
import pandas as pd
import ast
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

# Load the raw article export.
# NOTE(review): the path ends in .json but is parsed with read_csv — confirm the file is really CSV.
full_year = pd.read_csv("../../input-data/temp-data.json")

# Work on an independent copy restricted to the columns used below
more_info = full_year.loc[
    :,
    [
        "_id",
        "headline",
        "abstract",
        "lead_paragraph",
        "pub_date",
        "section_name",
        "subsection_name",
        "keywords",
    ],
].copy()

def extract_headlines(headline_entry):
    """Split one headline record into its main and print variants.

    Parameters
    ----------
    headline_entry : dict or object
        Either a dict that may carry ``'main'`` and ``'print_headline'`` keys,
        or any other value (typically the string form of such a dict), which
        is handed to ``ast.literal_eval``.

    Returns
    -------
    pd.Series
        Two entries, ``'main_headline'`` and ``'print_headline'``. When the
        entry cannot be turned into a dict, both hold ``str(headline_entry)``.
        Otherwise ``'print_headline'`` falls back to the main headline when it
        is missing, not a string, or blank.
    """
    raw_entry = headline_entry

    # If it's not a dict, try to convert it
    if not isinstance(headline_entry, dict):
        try:
            headline_entry = ast.literal_eval(headline_entry)
        except Exception:
            # Deliberate best-effort: unparseable input is handled by the fallback below
            headline_entry = None

    # literal_eval can succeed yet yield a non-dict (e.g. "123" -> 123); treat
    # that like a failed parse instead of crashing on ``.get``
    if not isinstance(headline_entry, dict):
        fallback = str(raw_entry)
        return pd.Series({'main_headline': fallback, 'print_headline': fallback})

    main = headline_entry.get('main', '')
    print_head = headline_entry.get('print_headline', '')
    # Use main when the print headline is absent, not text, or only whitespace
    if not isinstance(print_head, str) or not print_head.strip():
        print_head = main
    return pd.Series({'main_headline': main, 'print_headline': print_head})

# Create the new columns by applying the function (one Series per row -> two columns)
more_info[['main_headline', 'print_headline']] = more_info['headline'].apply(extract_headlines)

# Persist the enriched frame for later use
more_info.to_feather("../../data/more_info.feather")

# Preview the result; kept as the last expression of the cell so Jupyter
# actually renders it (a bare expression in the middle of a cell is discarded)
more_info.head()

In [3]:
import ast
import json
from collections import Counter
from textblob import TextBlob

# Function to robustly extract keyword values
def extract_keyword_values(keyword_entry):
    """Return the ``'value'`` of every keyword dict found in ``keyword_entry``.

    Parameters
    ----------
    keyword_entry : list, str or object
        A list of keyword dicts, or a string encoding one. Strings are parsed
        with ``json.loads`` first and ``ast.literal_eval`` second.

    Returns
    -------
    list
        The ``'value'`` entries of the dict items that have that key, in
        order. An empty list is returned for any other input type, for a
        string that cannot be parsed (an error line is printed), and for a
        string that parses to something other than a list or tuple.
    """
    if isinstance(keyword_entry, list):
        items = keyword_entry
    elif isinstance(keyword_entry, str):
        try:
            items = json.loads(keyword_entry)
        except Exception:
            # Not valid JSON; fall back to Python literal syntax (e.g. single quotes)
            try:
                items = ast.literal_eval(keyword_entry)
            except Exception as e:
                print("Error parsing keywords:", e)
                return []
        # Strings such as "null" or "5" parse to scalars that cannot be
        # iterated; other non-sequence results can never contain dicts
        if not isinstance(items, (list, tuple)):
            return []
    else:
        return []

    return [d['value'] for d in items if isinstance(d, dict) and 'value' in d]

# Flatten every row's keyword values into a single list, preserving row order
all_keywords = [
    value
    for entry in more_info['keywords']
    for value in extract_keyword_values(entry)
]

# Tally how many times each keyword occurs
keyword_counts = Counter(all_keywords)

# Keep only the keywords mentioned more than 5 times
filtered_keywords = [term for term in keyword_counts if keyword_counts[term] > 5]

# Score the text of each remaining keyword with TextBlob's polarity.
# NOTE(review): the recorded output gives 0.8 to many "Great ..." names, so the
# score appears to reflect individual words in the label — interpret with care.
filtered_keyword_sentiments = {
    term: TextBlob(term).sentiment.polarity for term in filtered_keywords
}

# Rank by polarity magnitude, strongest first (ties keep first-seen order)
sorted_keywords = sorted(
    filtered_keyword_sentiments.items(),
    key=lambda pair: abs(pair[1]),
    reverse=True,
)

# The 15 highest-magnitude entries
top_15 = sorted_keywords[:15]

print("Top 15 Most Polarizing Keywords (with more than 5 mentions):")
for term, score in top_15:
    print(f"{term} (mentions: {keyword_counts[term]}): {score}")

Top 15 Most Polarizing Keywords (with more than 5 mentions):
Drilling and Boring (mentions: 15): -1.0
Grief (Emotion) (mentions: 81): -0.8
Guantanamo Bay Naval Base (Cuba) (mentions: 87): -0.8
Proud Boys (mentions: 20): 0.8
Great Britain (mentions: 591): 0.8
Hate Crimes (mentions: 70): -0.8
Labour Party (Great Britain) (mentions: 128): 0.8
Great Britain Withdrawal from EU (Brexit) (mentions: 20): 0.8
Hate Speech (mentions: 27): -0.8
Elizabeth II, Queen of Great Britain (mentions: 12): 0.8
Conservative Party (Great Britain) (mentions: 104): 0.8
Make America Great Again (MAGA) Inc (mentions: 13): 0.8
Liberal Democrats (Great Britain) (mentions: 6): 0.8
Great Plains (US) (mentions: 13): 0.8
Great Lakes (mentions: 12): 0.8


In [None]:
from collections import Counter

# Gather the keyword values from every row of the DataFrame, in row order
all_keywords = [
    value
    for entry in more_info['keywords']
    for value in extract_keyword_values(entry)
]

# Tally occurrences per keyword
keyword_counts = Counter(all_keywords)

# List up to 500 of the most frequent keywords together with their counts
print("Most Common Keywords:")
for term, mentions in keyword_counts.most_common(500):
    print(f"{term}: {mentions}")