In [14]:
import os
import kagglehub
from rapidfuzz import fuzz, process
from spacy.matcher import PhraseMatcher
import spacy
import pandas as pd
import numpy as np
from datasets import Dataset, load_dataset
import re
from sklearn.model_selection import train_test_split

In [15]:
# Define a list of keywords related to climate change
keywords = [
    "climate", "sustainability", "global warming", "carbon", "greenhouse", "emissions",
    "renewable", "biodiversity", "ecology", "sustainable", "fossil fuels", "energy transition",
    "carbon footprint", "net zero", "solar power", "wind energy", "climate crisis",
    "carbon neutrality", "deforestation", "environmental", "pollution"
]

# load NLP model for phrase matching
nlp = spacy.blank("en")  # blank spaCy model
phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(keyword) for keyword in keywords]
phrase_matcher.add("ClimateKeywords", patterns)

model_dev_dict = {'train': pd.DataFrame(), 'test': pd.DataFrame()}

FUZZY_THRESHOLD = 80

def fuzzy_match(text, keywords, threshold=FUZZY_THRESHOLD):
    matched = process.extractOne(text, keywords, scorer=fuzz.partial_ratio)
    return matched[1] >= threshold if matched else False

# 1. Model Development Dataset 

In [16]:
# i) Twitter Misinformation (https://huggingface.co/datasets/roupenminassian/twitter-misinformation)
# (1: misinformation, 0: factual)

desired_train_ratio = 0.8

twitter_misinfo_dict = load_dataset("roupenminassian/twitter-misinformation")

model_dev_dict = {'train': pd.DataFrame(columns=['text', 'label']),
                  'test': pd.DataFrame(columns=['text', 'label'])}

for df_key in ['train', 'test']:
    df = twitter_misinfo_dict[df_key].to_pandas()
    
    # remove extra columns
    df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
    
    # normalize text to lowercase for consistent matching
    df['text'] = df['text'].str.lower()
    
    # apply fuzzy and phrase matching
    def is_relevant(text):
        # check for exact phrase match
        doc = nlp(text)
        matches = phrase_matcher(doc)
        if matches:
            return True
        # check for fuzzy match
        return fuzzy_match(text, keywords)

    # filter for climate relevance
    df_filtered = df[df['text'].apply(is_relevant)]
    
    # save the filtered DataFrame into the dictionary
    twitter_misinfo_dict[f'{df_key}_filtered'] = df_filtered

# combine for redistribution
combined_df = pd.concat([twitter_misinfo_dict['train_filtered'], twitter_misinfo_dict['test_filtered']], ignore_index=True)

# rebalance 80:20 split
train_split, test_split = train_test_split(combined_df, test_size=1 - desired_train_ratio, random_state=42)

# update global model_dev_dict
model_dev_dict['train'] = train_split
model_dev_dict['test'] = test_split

# twitter_misinfo_dict
twitter_misinfo_dict['train_filtered'] = train_split.copy()
twitter_misinfo_dict['test_filtered'] = test_split.copy()

print('Rebalanced train shape:', model_dev_dict['train'].shape)
print('Rebalanced test shape:', model_dev_dict['test'].shape)
print('Rebalanced train sample:')
print(model_dev_dict['train'].head(5))
print('Rebalanced test sample:')
print(model_dev_dict['test'].head(5))

for df_key in ['train', 'test']:
    if 'label' in model_dev_dict[df_key].columns:
        class_balance = model_dev_dict[df_key]['label'].value_counts()
        print(f"\nClass Balance ({df_key}):")
        print(class_balance)


Rebalanced train shape: (7598, 2)
Rebalanced test shape: (1900, 2)
Rebalanced train sample:
                                                   text  label
2413  patrick henningsen 21st century wiremuch was m...      1
5680  whole foods is a store that many people love. ...      1
1139  united nations (reuters) - u.s. secretary of s...      0
1897  somebody must have put some truth serum in lit...      1
5715                                                         1
Rebalanced test sample:
                                                   text  label
7788  the nyt allegedly wouldn t run alan dershowitz...      1
6856  washington (reuters) - president barack obama ...      0
5112  look at the #campfire. 88 now confirmed dead, ...      0
2944  climate change, rosenstein, migrant children: ...      0
6836  moscow (reuters) - the association of european...      0

Class Balance (train):
label
0    3932
1    3666
Name: count, dtype: int64

Class Balance (test):
label
0    983
1    917
Name:

In [17]:
# (ii) ‘Reddit Lies Tweets’ (https://www.kaggle.com/datasets/konradb/reddit-lies-tweets)

In [18]:
# (iii) ‘Fake News Classification’ (https://www.kaggle.com/datasets/saurabhshahane/fake-news-classification)
# (1 = misinfo and 0 = factual)
fake_news_classif_dir = kagglehub.dataset_download("saurabhshahane/fake-news-classification")
fake_news_classif_dict = load_dataset(fake_news_classif_dir)

df = fake_news_classif_dict['train'].to_pandas()
df = df.drop(columns=['Unnamed: 0', 'title'], errors='ignore')
df['label'] = df['label'].apply(lambda x: 1 - x)  # Reverse the encoding

df = df[df['text'].notnull()]
df['text'] = df['text'].astype(str).str.lower()

# Filter
df_filtered = df[df['text'].apply(is_relevant)]

# train and test sets
train_split, test_split = train_test_split(df_filtered, test_size=0.2, random_state=42)

# to global dict
model_dev_dict['train'] = pd.concat([model_dev_dict['train'], train_split[['text', 'label']]], ignore_index=True)
model_dev_dict['test'] = pd.concat([model_dev_dict['test'], test_split[['text', 'label']]], ignore_index=True)

#  to local dict
fake_news_classif_dict['train_filtered'] = train_split
fake_news_classif_dict['test_filtered'] = test_split

print('Unfiltered train shape:', fake_news_classif_dict['train'].shape)
print('Filtered train shape:', fake_news_classif_dict['train_filtered'].shape)
print(fake_news_classif_dict['train_filtered'].head(5))

if 'label' in fake_news_classif_dict['train_filtered'].columns:
    class_balance = fake_news_classif_dict['train_filtered']['label'].value_counts()
    print(f"\nClass Balance train:")
    print(class_balance)

Unfiltered train shape: (72134, 4)
Filtered train shape: (12908, 2)
                                                    text  label
44137  protesters, police still clashing over dispute...      0
11344  it is increasingly apparent that the u.s. war ...      1
50620  rights? in the new america you don’t get any r...      0
44705  back in february, analyzing donald trump’s app...      1
25607  bangkok (reuters) - thailand on wednesday mark...      1

Class Balance train:
label
1    6553
0    6355
Name: count, dtype: int64


In [19]:
# (iv) ‘Fake News Net) (https://www.kaggle.com/datasets/mdepak/fakenewsnet)
# 1 = misinfo; 0 = factual

fake_news_net_dir = kagglehub.dataset_download("mdepak/fakenewsnet")

BuzzFeed_dict = {}
Politifact_dict = {}

for source, source_dict in zip(['BuzzFeed', 'PolitiFact'], [BuzzFeed_dict, Politifact_dict]):
    source_df = []
    
    for label in ['fake', 'real']:
        file_path = os.path.join(fake_news_net_dir, f'{source}_{label}_news_content.csv')
        df = pd.read_csv(file_path)
        
        # Drop unnecessary columns and create combined text
        df = df.drop(columns=['id'], errors='ignore')
        df['text'] = df['title'] + ' ' + df['text']
        df['label'] = 1 if label == 'fake' else 0
        
        # Normalize text
        df['text'] = df['text'].str.lower()

        source_df.append(df[['text', 'label']])
    
    # Combine fake and real data
    source_df = pd.concat(source_df, ignore_index=True)
    
    # Filter the data using combined logic
    source_filtered_df = source_df[source_df['text'].apply(is_relevant)]

    # Split into train and test sets
    train_filtered, test_filtered = train_test_split(source_filtered_df, test_size=0.2, random_state=42)

    # Store train and test data in the respective dictionary
    source_dict['train'] = train_filtered
    source_dict['test'] = test_filtered

    # Add to global dict
    model_dev_dict['train'] = pd.concat([model_dev_dict['train'], train_filtered[['text', 'label']]], ignore_index=True)
    model_dev_dict['test'] = pd.concat([model_dev_dict['test'], test_filtered[['text', 'label']]], ignore_index=True)
        
    # Debugging output
    print(f'Unfiltered {source} train shape:', source_df.shape)
    print(f'Filtered {source} train shape:', source_dict['train'].shape)
    print(f'{source} train (filtered) sample:\n', source_dict['train'].head(5))

    if 'label' in source_dict['train'].columns:
        class_balance = source_dict['train']['label'].value_counts()
        print(f"\nClass Balance {source} train:")
        print(class_balance)


Unfiltered BuzzFeed train shape: (182, 2)
Filtered BuzzFeed train shape: (27, 2)
BuzzFeed train (filtered) sample:
                                                   text  label
77   the black sphere with kevin jackson chicago en...      1
172  trump’s puzzling pitch to black voters clevela...      0
60   trump supreme court pick sued by feds for raci...      1
1    charity: clinton foundation distributed “water...      1
15   trump just made a campaign promise so ridiculo...      1

Class Balance BuzzFeed train:
label
0    14
1    13
Name: count, dtype: int64
Unfiltered PolitiFact train shape: (240, 2)
Filtered PolitiFact train shape: (27, 2)
PolitiFact train (filtered) sample:
                                                   text  label
106  louisiana cop claims murdering a 6-year old ch...      1
235  donald trump, germany’s disfavored son – polit...      0
61   former miss universe sizes up melania trump: '...      1
4    monuments to the battle for the new south nine...      1
3

In [20]:
# combined datasets
model_dev_dict = {
    'train_df': model_dev_dict.pop('train'),
    'test_df': model_dev_dict.pop('test'),
}

print("Combined Train Dataset Shape:", model_dev_dict['train_df'].shape)
print("Combined Test Dataset Shape:", model_dev_dict['test_df'].shape)

model_dev_dict['train_df'].to_csv("./data/train_data.csv", index=False)
model_dev_dict['test_df'].to_csv("./data/test_data.csv", index=False)

# Twitter-only datasets
model_dev_TwitterOnly_dict = {}

model_dev_TwitterOnly_dict['train_df'] = twitter_misinfo_dict['train_filtered'][['text', 'label']].reset_index(drop=True)
model_dev_TwitterOnly_dict['test_df'] = twitter_misinfo_dict['test_filtered'][['text', 'label']].reset_index(drop=True)

print("\nTwitter-Only Train Dataset Shape:", model_dev_TwitterOnly_dict['train_df'].shape)
print("Twitter-Only Test Dataset Shape:", model_dev_TwitterOnly_dict['test_df'].shape)

model_dev_TwitterOnly_dict['train_df'].to_csv("./data/twitter_train_data.csv", index=False)
model_dev_TwitterOnly_dict['test_df'].to_csv("./data/twitter_test_data.csv", index=False)

Combined Train Dataset Shape: (20560, 2)
Combined Test Dataset Shape: (5141, 2)

Twitter-Only Train Dataset Shape: (7598, 2)
Twitter-Only Test Dataset Shape: (1900, 2)


# 2) Inference Dataset

In [37]:
# (i) ‘Climate Change Tweets’ (https://www.kaggle.com/datasets/die9origephit/climate-change-tweets)

dir_path = kagglehub.dataset_download("die9origephit/climate-change-tweets")
file_name = os.listdir(dir_path)[0]
path = os.path.join(dir_path, file_name)
df = pd.read_csv(path)
df = df[['Embedded_text']].rename(columns={'Embedded_text': 'text'})

def clean_tweet(text):
    # tweets starting with "Replying to" , "Quote Tweet"
    if re.search(r"^(Replying to|.*Quote Tweet.*)", text, re.IGNORECASE):
        return None
    # URLs, mentions, and other extraneous content after first tweet
    cleaned_text = re.split(r"(@\w+|https?://\S+)", text)[0]
    
    # trailing numbers
    cleaned_text = re.sub(r"(\n\d+|,\d+|\s\d+(\.\d+)?[Kk]?)$", "", cleaned_text).strip()
    
    return cleaned_text.strip()


def clean_tweet(text):
    # if tweet starts with this string at start, drop 
    if re.search(r"^(Replying to|.*Quote Tweet.*)", text, re.IGNORECASE):
        return None
    
    # remove all after the first mention or URL
    cleaned_text = re.split(r"(@\w+|https?://\S+)", text)[0].strip()
    
    # split into lines
    lines = cleaned_text.splitlines()
    
    # Regex for lines that are "only numbers" with commas, decimals, and optional 'K'/'k'
    numeric_pattern = re.compile(r'^\s*\d+(,\d+)*(\.\d+)?[Kk]?\s*$')
    
    # [op off trailing numeric lines
    while lines and numeric_pattern.match(lines[-1]):
        lines.pop()
    
    # reassemble
    return "\n".join(lines).strip()

print("Before cleaning:", df.shape)

df['text'] = df['text'].apply(clean_tweet)
df = df.dropna(subset=['text']).reset_index(drop=True)

inference_tweets_df = df

print(f"Shape of reddit twitter dataset: {inference_tweets_df.shape}")
inference_tweets_df.head(2)

Before cleaning: (9050, 1)
Shape of reddit twitter dataset: (7539, 1)


Unnamed: 0,text
0,The only solution I’ve ever heard the Left pro...
1,Climate change doesn’t cause volcanic eruptions.


In [38]:
# (ii) the ‘Reddit Climate Change' (https://www.kaggle.com/datasets/pavellexyr/the-reddit-climate-change-dataset)


In [39]:
# (ii) the ‘Reddit Climate Change' (https://www.kaggle.com/datasets/pavellexyr/the-reddit-climate-change-dataset)
dir_path = kagglehub.dataset_download("pavellexyr/the-reddit-climate-change-dataset")
file_name = os.listdir(dir_path)[0]
path = os.path.join(dir_path, file_name) 

# it's too big
total_rows = sum(1 for _ in open(path)) - 1
print("total rows:", total_rows)
num_rows_to_load = 10000
df_subset = pd.read_csv(path, nrows=num_rows_to_load)
print("loaded")

df_subset = df_subset[['body']].rename(columns={'body': 'text'})
inference_reddit_df = df_subset
print(f"Shape of reddit inference dataset: {inference_reddit_df.shape}")
inference_reddit_df.head(2)

total rows: 26718281
loaded
Shape of reddit inference dataset: (10000, 1)


Unnamed: 0,text
0,Yeah but what the above commenter is saying is...
1,Any comparison of efficiency between solar and...


In [40]:
# combined inference dataset
inference_df = pd.concat([inference_tweets_df, inference_reddit_df], ignore_index=True)

print(f"Combined inference dataset shape: {inference_df.shape}")
print(inference_df.head(5))

inference_df.to_csv("./data/inference_data.csv", index=False)

# Twitter-only inference dataset
inference_TwitterOnly_df = inference_tweets_df
print(f"\nTwitter-only inference dataset shape: {inference_TwitterOnly_df.shape}")
print(inference_TwitterOnly_df.head(5))

inference_TwitterOnly_df.to_csv("./data/twitter_inference_data.csv")

Combined inference dataset shape: (17539, 1)
                                                text
0  The only solution I’ve ever heard the Left pro...
1   Climate change doesn’t cause volcanic eruptions.
2  Vaccinated tennis ball boy collapses in the te...
3  North America has experienced an average winte...
4  They're gonna do the same with Climate Change ...

Twitter-only inference dataset shape: (7539, 1)
                                                text
0  The only solution I’ve ever heard the Left pro...
1   Climate change doesn’t cause volcanic eruptions.
2  Vaccinated tennis ball boy collapses in the te...
3  North America has experienced an average winte...
4  They're gonna do the same with Climate Change ...
