In [2]:
import os
import kagglehub
import pandas as pd
import numpy as np
from datasets import Dataset, load_dataset
import re
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Climate-related search terms; they are OR-ed into a single regex alternation below.
keywords = [
    "climate",
    "sustainability",
    "global warming",
    "carbon",
    "greenhouse",
    "emissions",
    "renewable",
    "biodiversity",
    "ecology",
    "sustainable",
]
pattern = "|".join(keywords)

# Accumulators for the combined model-development data: one empty frame per split,
# which the dataset cells below extend with their filtered rows.
model_dev_dict = {split_name: pd.DataFrame() for split_name in ('train', 'test')}

# 1. Model Development Dataset 

In [3]:
# i) Twitter Misinformation (https://huggingface.co/datasets/roupenminassian/twitter-misinformation)
# Label encoding: 1 = misinformation, 0 = factual.

twitter_misinfo_dict = load_dataset("roupenminassian/twitter-misinformation")

for df_key in ('train', 'test'):
    # Convert the split to pandas and discard the leftover index columns, if present.
    split_df = twitter_misinfo_dict[df_key].to_pandas().drop(
        columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore'
    )
    # Keep only rows whose text matches one of the climate keywords (case-insensitive).
    is_climate = split_df['text'].str.contains(pattern, flags=re.IGNORECASE, na=False, regex=True)
    climate_df = split_df[is_climate]
    # Store the filtered frame next to the raw split under '<split>_filtered' ...
    twitter_misinfo_dict[f'{df_key}_filtered'] = climate_df
    # ... and append its text/label columns to the combined model-development data.
    model_dev_dict[df_key] = pd.concat(
        [model_dev_dict[df_key], climate_df[['text', 'label']]],
        ignore_index=True,
    )

# Report how much each split shrank after filtering.
shape_report = [
    ('unfiltered train shape', 'train'),
    ('filtered train shape', 'train_filtered'),
    ('unfiltered test shape', 'test'),
    ('filtered test shape', 'test_filtered'),
]
for caption, dict_key in shape_report:
    print(caption, twitter_misinfo_dict[dict_key].shape)
print(twitter_misinfo_dict['train_filtered'].head(5))

# Label distribution of each filtered split.
for df_key in ('train', 'test'):
    filtered_split = twitter_misinfo_dict[f'{df_key}_filtered']
    if 'label' not in filtered_split.columns:
        continue
    class_balance = filtered_split['label'].value_counts()
    print(f"\nClass Balance {df_key}:")
    print(class_balance)


unfiltered train shape (92394, 4)
filtered train shape (2406, 2)
unfiltered test shape (10267, 4)
filtered test shape (285, 2)
                                                  text  label
14   WASHINGTON (Reuters) - U.S. President-elect Do...      0
26   Now it beings the government has changed who i...      1
65   As Hurricane Dorian Impacts East Coast, Food L...      0
117  Ben and Jerry, the  ber liberal Vermont ice cr...      1
123  BRASILIA (Reuters) - A congressional committee...      0

Class Balance train:
label
0    1601
1     805
Name: count, dtype: int64

Class Balance test:
label
0    180
1    105
Name: count, dtype: int64


In [4]:
# (ii) ‘Reddit Lies Tweets’ (https://www.kaggle.com/datasets/konradb/reddit-lies-tweets)

In [5]:
# (iii) ‘Fake News Classification’ (https://www.kaggle.com/datasets/saurabhshahane/fake-news-classification)
# (1 = misinfo and 0 = real)

fake_news_classif_dir = kagglehub.dataset_download("saurabhshahane/fake-news-classification")
fake_news_classif_dict = load_dataset(fake_news_classif_dir)

df = fake_news_classif_dict['train'].to_pandas()
df = df.drop(columns=['Unnamed: 0', 'title'], errors='ignore')
# Flip the binary label (0 <-> 1). Presumably the raw file uses the opposite
# convention to the one stated above -- TODO confirm against the dataset card.
df['label'] = 1 - df['label']
# Keep only rows whose text matches one of the climate keywords (case-insensitive).
df_filtered = df[df['text'].str.contains(pattern, flags=re.IGNORECASE, na=False, regex=True)]
# Split into train and test sets (80/20, fixed seed for reproducibility)
train_split, test_split = train_test_split(df_filtered, test_size=0.2, random_state=42)
# add to global dict
model_dev_dict['train'] = pd.concat([model_dev_dict['train'], train_split[['text', 'label']]], ignore_index=True)
model_dev_dict['test'] = pd.concat([model_dev_dict['test'], test_split[['text', 'label']]], ignore_index=True)
# add to local dict
train_split_filtered = train_split
test_split_filtered = test_split
fake_news_classif_dict['train_filtered'] = train_split_filtered
fake_news_classif_dict['test_filtered'] = test_split_filtered

print('unfiltered train shape', fake_news_classif_dict['train'].shape)
print('filtered train shape', fake_news_classif_dict['train_filtered'].shape)

print(fake_news_classif_dict['train_filtered'].head(5))

if 'label' in fake_news_classif_dict['train_filtered'].columns:
    class_balance = fake_news_classif_dict['train_filtered']['label'].value_counts()
    # The balance shown is for the train split. The heading used to interpolate
    # `df_key`, a loop variable left over from an earlier cell, which mislabelled
    # the output and made this cell depend on hidden kernel state.
    print("\nClass Balance train:")
    print(class_balance)


unfiltered train shape (72134, 4)
filtered train shape (2981, 2)
                                                    text  label
30053  Michelle Obama fed her husband s feud with Don...      0
51055  WASHINGTON  —   On the fourth floor of the Eis...      1
36478  President Obama, who as a candidate once urged...      0
3966   What an evil bunch of freaks! The agenda is so...      0
64588  President Barack Obama has officially hit the ...      1

Class Balance test:
label
1    1661
0    1320
Name: count, dtype: int64


In [6]:
# (iv) ‘Fake News Net’ (https://www.kaggle.com/datasets/mdepak/fakenewsnet)
# 1 = misinfo; 0 = real

fake_news_net_dir = kagglehub.dataset_download("mdepak/fakenewsnet")

BuzzFeed_dict = {}
Politifact_dict = {}

for source, source_dict in zip(['BuzzFeed', 'PolitiFact'], [BuzzFeed_dict, Politifact_dict]):
    label_frames = []

    for label in ['fake', 'real']:
        file_path = os.path.join(fake_news_net_dir, f'{source}_{label}_news_content.csv')
        df = pd.read_csv(file_path)

        # Drop unnecessary columns and create combined text.
        df = df.drop(columns=['id'], errors='ignore')
        # Fill missing title/body with '' first: string concatenation with NaN
        # yields NaN, which the keyword filter below (na=False) would silently drop.
        df['text'] = df['title'].fillna('') + ' ' + df['text'].fillna('')
        # The label comes from the file name: '*_fake_*' -> 1, '*_real_*' -> 0.
        df['label'] = 1 if label == 'fake' else 0

        label_frames.append(df[['text', 'label']])

    # Combine fake and real data
    source_df = pd.concat(label_frames, ignore_index=True)

    # Keep only rows whose text matches one of the climate keywords (case-insensitive)
    source_filtered_df = source_df[source_df['text'].str.contains(pattern, flags=re.IGNORECASE, na=False, regex=True)]

    # Split into train and test sets (80/20, fixed seed for reproducibility)
    train_filtered, test_filtered = train_test_split(source_filtered_df, test_size=0.2, random_state=42)

    # Store train and test data in the respective dictionary
    source_dict['train'] = train_filtered
    source_dict['test'] = test_filtered

    # store in global dict
    model_dev_dict['train'] = pd.concat([model_dev_dict['train'], train_filtered[['text', 'label']]], ignore_index=True)
    model_dev_dict['test'] = pd.concat([model_dev_dict['test'], test_filtered[['text', 'label']]], ignore_index=True)

    # Debugging output. `source_df` is the full (unsplit, unfiltered) frame, so it
    # is no longer labelled as "train".
    print(f'Unfiltered {source} shape:', source_df.shape)
    print(f'Filtered {source} train shape:', source_dict['train'].shape)
    print(f'{source} train (filtered) sample:\n', source_dict['train'].head(5))

    if 'label' in source_dict['train'].columns:
        class_balance = source_dict['train']['label'].value_counts()
        print(f"\nClass Balance {source} train:")
        print(class_balance)
  



Unfiltered BuzzFeed train shape: (182, 2)
Filtered BuzzFeed train shape: (2, 2)
BuzzFeed train (filtered) sample:
                                                   text  label
142  France becomes the first country to ban plasti...      0
146  Obama weighs in on the debate Obama weighs in ...      0

Class Balance BuzzFeed train:
label
0    2
Name: count, dtype: int64
Unfiltered PolitiFact train shape: (240, 2)
Filtered PolitiFact train shape: (4, 2)
PolitiFact train (filtered) sample:
                                                   text  label
232  Trump’s High-Energy War on American Politics T...      0
112  Trump’s High-Energy War on American Politics T...      1
226  Louisiana Cop Claims Murdering A 6-Year Old Ch...      0
184  BREAKING: North Carolina Drops Anti-LGBT 'Bath...      0

Class Balance PolitiFact train:
label
0    3
1    1
Name: count, dtype: int64


In [7]:
# Re-key the accumulated splits as 'train_df' / 'test_df'.
# Each entry falls back to the already-renamed key, so re-running this cell does
# not raise KeyError (the previous pop()-based version removed 'train'/'test' on
# the first run and failed on the second).
model_dev_dict = {
    f'{split_name}_df': (
        model_dev_dict[f'{split_name}_df']
        if f'{split_name}_df' in model_dev_dict
        else model_dev_dict[split_name]
    )
    for split_name in ('train', 'test')
}

print("Combined Train Dataset Shape:", model_dev_dict['train_df'].shape)
print("Combined Test Dataset Shape:", model_dev_dict['test_df'].shape)

# to_csv does not create missing parent directories, so make sure ./data exists.
os.makedirs("./data", exist_ok=True)
model_dev_dict['train_df'].to_csv("./data/train_data.csv", index=False)
model_dev_dict['test_df'].to_csv("./data/test_data.csv", index=False)

Combined Train Dataset Shape: (5393, 2)
Combined Test Dataset Shape: (1034, 2)


# 2. Inference Dataset

In [3]:
# (i) ‘Climate Change Tweets’ (https://www.kaggle.com/datasets/die9origephit/climate-change-tweets)

dir_path = kagglehub.dataset_download("die9origephit/climate-change-tweets")
# os.listdir returns entries in arbitrary order; sort so the file picked is the
# same on every run and platform.
file_name = sorted(os.listdir(dir_path))[0]
path = os.path.join(dir_path, file_name)
# Keep only the tweet body and rename it to the common 'text' column.
df = pd.read_csv(path)[['Embedded_text']].rename(columns={'Embedded_text': 'text'})
inference_tweets_df = df

# This is the tweets dataset, not the Reddit one, so the message says so.
print(f"Shape of twitter inference dataset: {inference_tweets_df.shape}")
inference_tweets_df.head(2)

Shape of reddit twitter dataset: (9050, 1)


Unnamed: 0,text
0,The only solution I’ve ever heard the Left pro...
1,Climate change doesn’t cause volcanic eruption...


In [9]:
# (ii) the ‘Reddit Climate Change' (https://www.kaggle.com/datasets/pavellexyr/the-reddit-climate-change-dataset)


In [10]:
# (ii) the ‘Reddit Climate Change' (https://www.kaggle.com/datasets/pavellexyr/the-reddit-climate-change-dataset)
dir_path = kagglehub.dataset_download("pavellexyr/the-reddit-climate-change-dataset")
# os.listdir returns entries in arbitrary order; sort so the file picked is the
# same on every run and platform.
# NOTE(review): the selected file must contain a 'body' column -- confirm that the
# first file in sorted order is the intended one.
file_name = sorted(os.listdir(dir_path))[0]
path = os.path.join(dir_path, file_name)

# The file is too large to load in full, so count its lines by streaming and then
# read only the first `num_rows_to_load` rows.
# The context manager closes the handle (the previous bare open() leaked it).
# This counts physical lines minus the header; it can exceed the number of CSV
# records if quoted fields contain embedded newlines.
with open(path) as csv_file:
    total_rows = sum(1 for _ in csv_file) - 1
print("total rows:", total_rows)
num_rows_to_load = 10000
df_subset = pd.read_csv(path, nrows=num_rows_to_load)
print("loaded")

# Keep only the comment body and rename it to the common 'text' column.
df_subset = df_subset[['body']].rename(columns={'body': 'text'})
inference_reddit_df = df_subset
print(f"Shape of reddit inference dataset: {inference_reddit_df.shape}")
inference_reddit_df.head(2)

total rows: 26718281
loaded
Shape of inference dataset: (10000, 1)


Unnamed: 0,text
0,Yeah but what the above commenter is saying is...
1,Any comparison of efficiency between solar and...


In [11]:
# combine
inference_df = pd.concat([inference_tweets_df, inference_reddit_df], ignore_index=True)

print(f"Combined inference dataset shape: {inference_df.shape}")
print(inference_df.head(5))

# Optional: Save the combined dataset to a CSV file
inference_df.to_csv("/data/inference_data.csv", index=False)

Combined inference dataset shape: (19050, 1)
                                                text
0  The only solution I’ve ever heard the Left pro...
1  Climate change doesn’t cause volcanic eruption...
2  Vaccinated tennis ball boy collapses in the te...
3  North America has experienced an average winte...
4  They're gonna do the same with Climate Change ...
