In [1]:
from transformers import pipeline

In [2]:
# Build the Hugging Face pipeline once; later cells call `extractor(text)` per row.
extractor = pipeline(model="yanekyuk/bert-keyword-extractor")

In [3]:
# Read the CSV file into a DataFrame; later cells read and extend it,
# starting from its 'body' column.
import pandas as pd

df = pd.read_csv("ClimateChange_UK.csv")

In [4]:
# Clean the data: replace missing 'body' values with the placeholder string '0'
# and cast the column to str so every row holds text.
# NOTE(review): the '0' placeholder is itself passed to the keyword extractor
# in a later cell — confirm that is intended.
df['body'] = df['body'].fillna('0').astype(str)

In [5]:
# Define a keyword extraction function
def extract_keywords(text):
    """Run the keyword-extraction pipeline on a single document.

    Args:
        text: The document text handed to the module-level ``extractor``.

    Returns:
        Whatever ``extractor(text)`` returns on success. If the extractor
        raises, the error is printed and an empty list is returned, so code
        that iterates over the result keeps working for the failed row.
    """
    try:
        return extractor(text)
    except Exception as e:
        print(f"Error extracting keywords for text: {text} with error {e}")
        # Previously the function fell through and returned None here, which
        # made any later iteration over the result raise TypeError.
        return []

In [6]:
# Use the function to extract keywords: call extract_keywords on every value
# of the 'body' column and store each result in a new 'keywords' column.
df['keywords'] = df['body'].apply(extract_keywords)

In [7]:
# Define a formatting function
def format_keywords(hf_output):
    """Reduce extractor output to ``(word, score)`` pairs.

    Args:
        hf_output: An iterable of mappings that each have ``'word'`` and
            ``'score'`` keys, or ``None`` for a row with no extractor result.

    Returns:
        A list of ``(word, score)`` tuples in the input order; an empty list
        when ``hf_output`` is ``None``.
    """
    # A row with no extractor result is stored as None; treat it as
    # "no keywords" instead of failing on iteration.
    if hf_output is None:
        return []
    return [(entry['word'], entry['score']) for entry in hf_output]

In [8]:
# Convert each row's raw extractor output into a list of (word, score) pairs.
df['formatted_keywords'] = df['keywords'].apply(format_keywords)

In [9]:
# Solve the problem caused by subword tokenization: pieces prefixed with "##"
# are continuations of the keyword that precedes them.
def merge_subwords(keywords):
    """Join "##"-prefixed pieces onto the keyword that precedes them.

    Args:
        keywords: Iterable of ``(word, score)`` pairs in extraction order.

    Returns:
        A list of ``(word, score)`` tuples in which every run of a word
        followed by "##" pieces has been concatenated into one word whose
        score is the highest score seen in that run. A "##" piece with no
        preceding word is discarded.
    """
    combined = []  # mutable [word, score] entries; the last one is still open

    for token, confidence in keywords:
        if not token.startswith('##'):
            # A fresh word starts a new entry
            combined.append([token, confidence])
            continue

        if not combined:
            # Continuation piece with nothing to attach to
            continue

        # Strip the "##" marker and extend the entry currently being built
        current = combined[-1]
        current[0] = current[0] + token.replace('##', '')
        current[1] = max(current[1], confidence)

    return [(text, best) for text, best in combined]

In [10]:
# Re-join "##" subword pieces in every row's (word, score) list.
df['merged_keywords'] = df['formatted_keywords'].apply(merge_subwords)

In [11]:
# Pull the per-row keyword lists out of the DataFrame as a plain Python list
all_keywords = list(df['merged_keywords'])

In [12]:
from itertools import chain

# Flatten the list of per-row keyword lists into a single list of pairs
flat_list = list(chain.from_iterable(all_keywords))

In [21]:


def remove_duplicates(keywords):
    """Collapse repeated words, keeping each word's highest score.

    Args:
        keywords: Iterable of ``(word, score)`` pairs.

    Returns:
        A list of ``(word, score)`` tuples, one per distinct word, ordered
        from highest to lowest score. Words with equal scores stay in the
        order in which they were first seen.
    """
    best_score = {}
    for term, value in keywords:
        # Skip a repeat that does not beat the score already recorded
        if term in best_score and not (best_score[term] < value):
            continue
        best_score[term] = value

    # Highest score first
    ranked = list(best_score.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

# Deduplicate the flattened keyword list (sorted by score, highest first),
# then keep the first 50 entries.
unique_keywords = remove_duplicates(flat_list)
top_50_keywords = unique_keywords[:50]


In [15]:
# Disabled alternative ranking, kept as a bare string literal so none of it
# executes. It would rank keywords by how often they occur instead of by score:
#   - stop_words: words to exclude (more can be added as needed)
#   - is_valid_word: keeps only words made of letters and digits that are not
#     in the stop-word list
#   - count_frequency: counts the valid words and sorts by count, descending
# NOTE(review): the stop-word check lower-cases the word, so the upper-case
# 'T' entry in stop_words can never match.
'''
import re
from collections import Counter

# 定义停用词列表，可以根据需要添加更多停用词
stop_words = set([
    'a', 'an', 'and', 'the', 'of', 'in', 'to', 'is', 'it', 'that', 'on', 'for', 
    'with', 'as', 'by', 'at', 'from', 'this', 'be', 'or', 'which', 'but', 'are', 
    'was', 'were', 'not', 'have', 'has', 'had', 'will', 'would', 'can', 'could', 
    'should', 'shall', 'may', 'might', 'must', 'do', 'does', 'did','w','T'
])

def is_valid_word(word):
    # 只保留包含字母和数字的单词且不在停用词列表中的单词
    return re.match(r'^[A-Za-z0-9]+$', word) is not None and word.lower() not in stop_words

def count_frequency(keywords):
    keyword_counter = Counter(word for word, _ in keywords if is_valid_word(word))
    return sorted(keyword_counter.items(), key=lambda x: x[1], reverse=True)

unique_keywords = count_frequency(flat_list)
top_50_keywords = unique_keywords[:50]
'''

In [22]:
# Display the selected (keyword, score) pairs as the cell output
top_50_keywords

[('Anonymous', 0.9999844),
 ('Judas', 0.99998367),
 ('Brexit', 0.9999825),
 ('Axel', 0.9999821),
 ('Cambridge', 0.9999819),
 ('EastEnders', 0.99998164),
 ('brexshit', 0.9999815),
 ('Iron', 0.9999815),
 ('granny', 0.9999814),
 ('Private', 0.99998116),
 ('Evil', 0.99998),
 ('Vodafone', 0.9999796),
 ('Euronews', 0.99997926),
 ('permission', 0.99997914),
 ('Butthole', 0.99997914),
 ('Shrek', 0.999979),
 ('Hydrogen', 0.9999784),
 ('breshit', 0.9999783),
 ('Reddit', 0.99997747),
 ('Vote', 0.99997735),
 ('Nigel', 0.9999769),
 ('Hide', 0.99997675),
 ('Firefox', 0.9999765),
 ('Eurotunnel', 0.99997616),
 ('Empire', 0.9999759),
 ('Glorious', 0.9999759),
 ('Brexitspeak', 0.9999759),
 ('Brawndo', 0.9999758),
 ('Politico', 0.9999757),
 ('McVey', 0.9999757),
 ('Fast', 0.99997556),
 ('Stockholm', 0.99997556),
 ('Next', 0.99997556),
 ('Sinn', 0.9999753),
 ('Verhofstadt', 0.9999753),
 ('Unilever', 0.9999753),
 ('fifth', 0.9999753),
 ('Crocodile', 0.9999751),
 ('Rafael', 0.9999751),
 ('Brexir', 0.9999749

In [23]:
import os

# Write the selected keywords to CSV, but never overwrite an existing file.
# NOTE(review): the input read earlier is "ClimateChange_UK.csv" while this
# output is named for Brexit — confirm the file name matches the data set.
output_path = 'Keywords_Brexit[Score].csv'
if not os.path.exists(output_path):
    df_keywords = pd.DataFrame(top_50_keywords, columns=['Keyword', 'Score'])
    df_keywords.to_csv(output_path, index=False)
else:
    # Report the skip so a stale file from an earlier run is not mistaken
    # for freshly written results.
    print(f"{output_path} already exists; keywords were not written.")