# Fake News Dataset Loader 📊

This notebook provides loading examples and structural overviews for various fake news and synthetic text datasets:

- LIAR
- ISOT
- COVID-19 Claim News
- GPT-2 Output Dataset
- PHEME (Twitter Rumor Threads)

Each section includes:
- A brief explanation of the dataset's structure
- Python code to load and explore the dataset


In [None]:
# 📘 LIAR Dataset
import csv

import pandas as pd

# The TSV is read without a header row, so the column names are supplied here.
# NOTE(review): the five columns from 'barely_true' to 'pants_on_fire' are
# presumably per-speaker verdict counts; confirm against the dataset README.
columns = [
    'id', 'label', 'statement', 'subject', 'speaker',
    'speaker_job', 'state', 'party',
    'barely_true', 'false', 'half_true',
    'mostly_true', 'pants_on_fire', 'context',
]

# quoting=csv.QUOTE_NONE: the file is plain tab-separated text, not quoted CSV.
# With the default quote handling, a statement containing an unbalanced `"`
# makes the parser swallow following tabs/newlines into a single field, which
# silently merges records. The trade-off is that literal quote characters
# stay in the text columns.
liar_train = pd.read_csv(
    'train.tsv',
    sep='\t',
    names=columns,
    header=None,  # first line is data, not column names
    quoting=csv.QUOTE_NONE,
)
liar_train.head()


In [None]:
# 📘 ISOT Fake News Dataset
# Each source file holds one class; tag the rows while loading.
# Label convention used below: 0 = Fake.csv rows, 1 = True.csv rows.
fake_df = pd.read_csv('Fake.csv').assign(label=0)
real_df = pd.read_csv('True.csv').assign(label=1)

# Stack both classes into one frame with a fresh 0..n-1 index.
isot_df = pd.concat(
    [fake_df, real_df],
    ignore_index=True,
)
isot_df.head()


In [None]:
# 📘 COVID-19 Claim Dataset
# Load the claims file and preview only the two columns of interest.
covid_df = pd.read_csv('covid19_claims.csv')
covid_df.loc[:, ['claim', 'label']].head()


In [None]:
# 📘 GPT-2 Output Dataset
gpt2_df = pd.read_csv('/kaggle/input/xl-1542M-k40.test.csv')
webtext_df = pd.read_csv('/kaggle/input/webtext.test.csv')

# Label convention in this cell: 1 = rows from the GPT-2 output file,
# 0 = rows from the WebText file.
# NOTE: this is the opposite polarity from the ISOT cell, where 1 marks the
# real (True.csv) articles. Remap before mixing the two datasets.
gpt2_df['label'] = 1
webtext_df['label'] = 0

gpt2_combined = pd.concat([gpt2_df, webtext_df], ignore_index=True)

# Fixed seed so the preview is identical on every Restart & Run All.
gpt2_combined.sample(3, random_state=42)


In [None]:
# 📘 PHEME Dataset (Simplified Loader)
import json
import os


def load_pheme_tweets(root, folds):
    """Collect tweet texts and author screen names from PHEME event folders.

    Walks ``<root>/<fold>`` recursively for every entry in ``folds`` and reads
    each ``*.json`` file found. Only JSON objects that carry a ``'text'`` key
    are treated as tweets; other JSON files in the tree (for example
    per-thread metadata) are skipped instead of being counted as empty tweets.

    Args:
        root: Directory that contains the event folders.
        folds: Iterable of event folder names located under ``root``.

    Returns:
        A ``(texts, usernames)`` tuple of equal-length lists. ``usernames``
        holds ``''`` where a tweet has no ``user.screen_name``.

    Raises:
        json.JSONDecodeError: If a non-hidden ``*.json`` file is not valid JSON.
    """
    texts, usernames = [], []
    for fold in folds:
        # os.walk yields nothing for a missing directory, so an absent fold
        # contributes zero tweets rather than raising.
        for dirpath, dirnames, filenames in os.walk(os.path.join(root, fold)):
            dirnames.sort()  # in-place sort makes the traversal order deterministic
            for fname in sorted(filenames):
                # Skip hidden files (e.g. macOS '._*.json' sidecar files,
                # which are not JSON) and anything that is not a .json file.
                if fname.startswith('.') or not fname.endswith('.json'):
                    continue
                # Explicit UTF-8: JSON text is UTF-8 and the platform default
                # encoding (e.g. cp1252 on Windows) fails on emoji/accents.
                with open(os.path.join(dirpath, fname), encoding='utf-8') as f:
                    payload = json.load(f)
                if not isinstance(payload, dict) or 'text' not in payload:
                    continue  # not a tweet object
                texts.append(payload['text'])
                # `or {}` also covers an explicit JSON null for 'user'.
                usernames.append((payload.get('user') or {}).get('screen_name', ''))
    return texts, usernames


folds = ['charliehebdo-all-rnr-threads', 'ottawashooting-all-rnr-threads']
texts, usernames = load_pheme_tweets('pheme_root', folds)

print(f"Loaded {len(texts)} tweets.")
if texts:
    print("Sample tweet:", texts[0])
else:
    # Avoid an IndexError on texts[0] when the dataset path is wrong or empty.
    print("No tweets found under 'pheme_root'; check the dataset path.")
