In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
file_path = '../data/raw/CoAID/05-01-2020/ClaimFakeCOVID-19.csv'


fake = pd.read_csv('../data/raw/CoAID/05-01-2020/ClaimFakeCOVID-19.csv')
real = pd.read_csv('../data/raw/CoAID/05-01-2020/ClaimRealCOVID-19.csv')

fake_titles = fake[['title']].copy()
fake_titles['label'] = 1  # Misinformation

real_titles = real[['title']].copy()
real_titles['label'] = 0  # Factual

dataset = pd.concat([fake_titles, real_titles], axis=0)
dataset = dataset.rename(columns={'title': 'text'})

min_samples = min(len(fake_titles), len(real_titles))
balanced = pd.concat([
    fake_titles.sample(min_samples),
    real_titles.sample(min_samples)
]).sample(frac=1).reset_index(drop=True)

train, temp = train_test_split(balanced, test_size=0.2, random_state=42)
val, test = train_test_split(temp, test_size=0.5, random_state=42)

train.to_csv('../data/processed/train.csv', index=False)
val.to_csv('../data/processed/val.csv', index=False)
test.to_csv('../data/processed/test.csv', index=False)

In [8]:
print(f"Total samples: {len(balanced)}")
print(f"Class balance:\n{balanced['label'].value_counts()}")
print(f"\nSample fake claim: {balanced[balanced['label'] == 1]['title'].iloc[0]}")
print(f"Sample real claim: {balanced[balanced['label'] == 0]['title'].iloc[0]}")

Total samples: 54
Class balance:
label
1    27
0    27
Name: count, dtype: int64

Sample fake claim: "Home remedies can cure and protect against COVID-19"
Sample real claim: "What are the International Health Regulations?"


In [None]:
import tweepy
from tqdm import tqdm

BEARER_TOKEN = '???????????????????????????????????????'

client = tweepy.Client(bearer_token=BEARER_TOKEN, wait_on_rate_limit=True)

fake_df = pd.read_csv('../data/raw/CoAID/05-01-2020/ClaimFakeCOVID-19_tweets.csv', header=None)
real_df = pd.read_csv('../data/raw/CoAID/05-01-2020/ClaimRealCOVID-19_tweets.csv', header=None)

fake_ids = fake_df[1].astype(str).tolist()
real_ids = real_df[1].astype(str).tolist()

def hydrate_tweets(tweet_ids):
    tweets = []
    for i in tqdm(range(0, len(tweet_ids), 100)):
        batch = tweet_ids[i:i+100]
        try:
            batch = [str(tid).strip() for tid in batch if str(tid).strip().isdigit()]
            if not batch:
                continue
            response = client.get_tweets(ids=batch, tweet_fields=['text'])
            if response.data:
                tweets.extend([tweet.text for tweet in response.data])
        except Exception as e:
            print(f"Error fetching batch {i}: {e}")
    return tweets

fake_tweets = hydrate_tweets(fake_ids)
real_tweets = hydrate_tweets(real_ids)

df_fake = pd.DataFrame({'text': fake_tweets, 'label': 1})
df_real = pd.DataFrame({'text': real_tweets, 'label': 0})

df_all = pd.concat([df_fake, df_real]).sample(frac=1).reset_index(drop=True)

df_all.to_csv("../data/processed/combined_raw_tweets.csv", index=False)

 40%|████      | 2/5 [00:00<00:00,  6.35it/s]

Error fetching batch 0: 401 Unauthorized
Unauthorized
Error fetching batch 100: 401 Unauthorized
Unauthorized


 80%|████████  | 4/5 [00:00<00:00,  6.66it/s]

Error fetching batch 200: 401 Unauthorized
Unauthorized
Error fetching batch 300: 401 Unauthorized
Unauthorized


100%|██████████| 5/5 [00:00<00:00,  6.57it/s]


Error fetching batch 400: 401 Unauthorized
Unauthorized


  2%|▏         | 1/64 [00:00<00:09,  6.60it/s]

Error fetching batch 0: 401 Unauthorized
Unauthorized


  3%|▎         | 2/64 [00:00<00:09,  6.22it/s]

Error fetching batch 100: 401 Unauthorized
Unauthorized


  5%|▍         | 3/64 [00:00<00:09,  6.56it/s]

Error fetching batch 200: 401 Unauthorized
Unauthorized


  6%|▋         | 4/64 [00:00<00:09,  6.27it/s]

Error fetching batch 300: 401 Unauthorized
Unauthorized


  8%|▊         | 5/64 [00:00<00:09,  6.46it/s]

Error fetching batch 400: 401 Unauthorized
Unauthorized


  9%|▉         | 6/64 [00:00<00:08,  6.82it/s]

Error fetching batch 500: 401 Unauthorized
Unauthorized


 11%|█         | 7/64 [00:01<00:08,  6.62it/s]

Error fetching batch 600: 401 Unauthorized
Unauthorized


 12%|█▎        | 8/64 [00:01<00:08,  6.85it/s]

Error fetching batch 700: 401 Unauthorized
Unauthorized


 14%|█▍        | 9/64 [00:01<00:08,  6.67it/s]

Error fetching batch 800: 401 Unauthorized
Unauthorized


 16%|█▌        | 10/64 [00:01<00:07,  6.87it/s]

Error fetching batch 900: 401 Unauthorized
Unauthorized


 17%|█▋        | 11/64 [00:01<00:07,  6.86it/s]

Error fetching batch 1000: 401 Unauthorized
Unauthorized


 19%|█▉        | 12/64 [00:01<00:07,  6.95it/s]

Error fetching batch 1100: 401 Unauthorized
Unauthorized


 20%|██        | 13/64 [00:01<00:07,  7.17it/s]

Error fetching batch 1200: 401 Unauthorized
Unauthorized


 22%|██▏       | 14/64 [00:02<00:06,  7.29it/s]

Error fetching batch 1300: 401 Unauthorized
Unauthorized


 23%|██▎       | 15/64 [00:02<00:06,  7.31it/s]

Error fetching batch 1400: 401 Unauthorized
Unauthorized


 25%|██▌       | 16/64 [00:02<00:06,  7.30it/s]

Error fetching batch 1500: 401 Unauthorized
Unauthorized


 27%|██▋       | 17/64 [00:02<00:06,  7.06it/s]

Error fetching batch 1600: 401 Unauthorized
Unauthorized


 28%|██▊       | 18/64 [00:02<00:06,  7.09it/s]

Error fetching batch 1700: 401 Unauthorized
Unauthorized


 30%|██▉       | 19/64 [00:02<00:06,  7.13it/s]

Error fetching batch 1800: 401 Unauthorized
Unauthorized


 31%|███▏      | 20/64 [00:02<00:06,  6.87it/s]

Error fetching batch 1900: 401 Unauthorized
Unauthorized


 33%|███▎      | 21/64 [00:03<00:06,  6.79it/s]

Error fetching batch 2000: 401 Unauthorized
Unauthorized


 34%|███▍      | 22/64 [00:03<00:06,  6.74it/s]

Error fetching batch 2100: 401 Unauthorized
Unauthorized


 36%|███▌      | 23/64 [00:03<00:06,  6.81it/s]

Error fetching batch 2200: 401 Unauthorized
Unauthorized


 38%|███▊      | 24/64 [00:03<00:05,  6.73it/s]

Error fetching batch 2300: 401 Unauthorized
Unauthorized


 39%|███▉      | 25/64 [00:03<00:05,  6.99it/s]

Error fetching batch 2400: 401 Unauthorized
Unauthorized


 41%|████      | 26/64 [00:03<00:05,  7.03it/s]

Error fetching batch 2500: 401 Unauthorized
Unauthorized


 42%|████▏     | 27/64 [00:03<00:05,  7.23it/s]

Error fetching batch 2600: 401 Unauthorized
Unauthorized


 45%|████▌     | 29/64 [00:04<00:05,  6.38it/s]

Error fetching batch 2700: 401 Unauthorized
Unauthorized
Error fetching batch 2800: 401 Unauthorized
Unauthorized


 48%|████▊     | 31/64 [00:04<00:04,  6.67it/s]

Error fetching batch 2900: 401 Unauthorized
Unauthorized
Error fetching batch 3000: 401 Unauthorized
Unauthorized


 52%|█████▏    | 33/64 [00:04<00:04,  6.81it/s]

Error fetching batch 3100: 401 Unauthorized
Unauthorized
Error fetching batch 3200: 401 Unauthorized
Unauthorized


 55%|█████▍    | 35/64 [00:05<00:04,  7.03it/s]

Error fetching batch 3300: 401 Unauthorized
Unauthorized
Error fetching batch 3400: 401 Unauthorized
Unauthorized


 58%|█████▊    | 37/64 [00:05<00:03,  6.98it/s]

Error fetching batch 3500: 401 Unauthorized
Unauthorized
Error fetching batch 3600: 401 Unauthorized
Unauthorized


 61%|██████    | 39/64 [00:05<00:03,  7.16it/s]

Error fetching batch 3700: 401 Unauthorized
Unauthorized
Error fetching batch 3800: 401 Unauthorized
Unauthorized


 64%|██████▍   | 41/64 [00:05<00:03,  7.17it/s]

Error fetching batch 3900: 401 Unauthorized
Unauthorized
Error fetching batch 4000: 401 Unauthorized
Unauthorized


 67%|██████▋   | 43/64 [00:06<00:02,  7.18it/s]

Error fetching batch 4100: 401 Unauthorized
Unauthorized
Error fetching batch 4200: 401 Unauthorized
Unauthorized


 70%|███████   | 45/64 [00:06<00:02,  7.28it/s]

Error fetching batch 4300: 401 Unauthorized
Unauthorized
Error fetching batch 4400: 401 Unauthorized
Unauthorized


 73%|███████▎  | 47/64 [00:06<00:02,  7.08it/s]

Error fetching batch 4500: 401 Unauthorized
Unauthorized
Error fetching batch 4600: 401 Unauthorized
Unauthorized


 77%|███████▋  | 49/64 [00:07<00:02,  7.01it/s]

Error fetching batch 4700: 401 Unauthorized
Unauthorized
Error fetching batch 4800: 401 Unauthorized
Unauthorized


 80%|███████▉  | 51/64 [00:07<00:01,  6.95it/s]

Error fetching batch 4900: 401 Unauthorized
Unauthorized
Error fetching batch 5000: 401 Unauthorized
Unauthorized


 83%|████████▎ | 53/64 [00:07<00:01,  7.18it/s]

Error fetching batch 5100: 401 Unauthorized
Unauthorized
Error fetching batch 5200: 401 Unauthorized
Unauthorized


 86%|████████▌ | 55/64 [00:07<00:01,  7.20it/s]

Error fetching batch 5300: 401 Unauthorized
Unauthorized
Error fetching batch 5400: 401 Unauthorized
Unauthorized


 89%|████████▉ | 57/64 [00:08<00:00,  7.05it/s]

Error fetching batch 5500: 401 Unauthorized
Unauthorized
Error fetching batch 5600: 401 Unauthorized
Unauthorized


 92%|█████████▏| 59/64 [00:08<00:00,  7.21it/s]

Error fetching batch 5700: 401 Unauthorized
Unauthorized
Error fetching batch 5800: 401 Unauthorized
Unauthorized


 95%|█████████▌| 61/64 [00:08<00:00,  6.80it/s]

Error fetching batch 5900: 401 Unauthorized
Unauthorized
Error fetching batch 6000: 401 Unauthorized
Unauthorized


 98%|█████████▊| 63/64 [00:09<00:00,  7.19it/s]

Error fetching batch 6100: 401 Unauthorized
Unauthorized
Error fetching batch 6200: 401 Unauthorized
Unauthorized


100%|██████████| 64/64 [00:09<00:00,  6.97it/s]

Error fetching batch 6300: 401 Unauthorized
Unauthorized



