In [129]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import shutil 
import os.path

In [4]:
# Read the four FakeNewsNet JSON files (GossipCop / PolitiFact, fake / real).
# Each path is loaded into the DataFrame named at the same position on the left.
(
    df_gossipcop_fake,
    df_gossipcop_real,
    df_politifact_fake,
    df_politifact_real,
) = (
    pd.read_json(json_path)
    for json_path in (
        '../FakeNewsNet/data/gossipcop_fake.json',
        '../FakeNewsNet/data/gossipcop_real.json',
        '../FakeNewsNet/data/politifact_fake.json',
        '../FakeNewsNet/data/politifact_real.json',
    )
)

## Master DF

In [24]:
# Merge Dataset
# Stack the GossipCop and PolitiFact rows of each class into a single frame;
# the per-file indices are discarded in favour of a fresh 0..n-1 index.
fake_df = pd.concat(
    objs=(df_gossipcop_fake, df_politifact_fake),
    axis=0,
    ignore_index=True,
)
real_df = pd.concat(
    objs=(df_gossipcop_real, df_politifact_real),
    axis=0,
    ignore_index=True,
)

In [35]:
# Tag every row with its class as a string label ('false' for the fake
# frames, 'true' for the real ones).
fake_df = fake_df.assign(label='false')
real_df = real_df.assign(label='true')

In [36]:
# Combine both classes into one frame (fake rows first, then real rows)
# with a fresh 0..n-1 index.
master_df = pd.concat(objs=(fake_df, real_df), axis=0, ignore_index=True)

In [38]:
# Persist the full labelled frame (all columns) so later runs can skip the JSON load.
pd.to_pickle(master_df, './master_fnn.pkl')

## Text Only

In [39]:
# Keep only the text and its label; every other column is dropped.
text_df = master_df.loc[:, ['text', 'label']]
text_df

Unnamed: 0,text,label
0,On Air with Ryan Seacrest is offering you a ch...,false
1,‘American Idol’ final: How to vote for the sea...,false
2,@ScottDisick @KrisJenner @khloekardashian — LA...,false
3,@foquinha Youngblood - 5 Seconds of Summer \nO...,false
4,Kylie Jenner ‘Open’ To Reconciliation With Tyg...,false
...,...,...
1434570,@NBCNewsPR @MeetThePress @chucktodd @RepAdamSc...,true
1434571,Have you seen our Spotlight report on Initial ...,true
1434572,President Trump’s First Address to a Joint Ses...,true
1434573,@GianMarcoMelosu @SkySportF1 @LewisHamilton @M...,true


In [41]:
# Persist the text/label-only frame.
pd.to_pickle(text_df, './text_fnn.pkl')

# Slice Dataset

| Size | Amount (max rows sampled per label) |
| ---- | ------ | 
| 3xs | 3000 |
| 2xs | 10000 |
| xs | 20000 |
| s | 80000 |
| m | 160000 |
| l | ~500000 |
| xl | all ~1430000 |

In [47]:
# Dataset size presets. Each value is the maximum number of rows drawn *per
# label* by split_data, so a balanced dataset holds up to twice that many rows.
# The last preset uses the full row count, i.e. every available row.
name = ['3xs', '2xs', 'xs', 's', 'm', 'l', 'xl']
size = [3000, 10000, 20000, 80000, 160000, 500000, text_df.shape[0]]
output_sizes = {preset: row_cap for preset, row_cap in zip(name, size)}
output_sizes

{'3xs': 3000,
 '2xs': 20000,
 'xs': 40000,
 's': 80000,
 'm': 160000,
 'l': 500000,
 'xl': 1434575}

In [137]:
def get_shape(X, y):
    """Summarise one split: the shape of ``X`` and the label counts in ``y``.

    Every entry of ``y`` that does not equal the string ``"true"`` is counted
    under ``"false"``.

    Returns a dict with the keys ``"shape"``, ``"true"`` and ``"false"``.
    """
    n_true = np.count_nonzero(y == "true")
    n_false = len(y) - n_true

    summary = {"shape": X.shape}
    summary["true"] = n_true
    summary["false"] = n_false
    return summary

In [145]:
# Number of rows labelled 'true' in the full text frame.
int((text_df["label"] == 'true').sum())

956506

In [150]:
def split_data(df, size, random_state=None):
    """Draw a shuffled sample of at most ``size`` rows from each label class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column holding the strings ``'true'`` and
        ``'false'``. Rows with any other label are ignored.
    size : int
        Maximum number of rows to draw per label. A class with fewer than
        ``size`` rows contributes all of its rows, so the result is only
        balanced when both classes have at least ``size`` rows.
    random_state : int or numpy random state, optional
        Seed forwarded to every ``DataFrame.sample`` call so the subset and
        its row order can be reproduced. The default ``None`` keeps the
        original unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of both classes in random order, re-indexed 0..n-1.
    """
    # Filter each class once; the original filtered every class twice.
    true_rows = df[df["label"] == 'true']
    false_rows = df[df["label"] == 'false']

    true_size = min(len(true_rows), size)
    false_size = min(len(false_rows), size)

    df_split = pd.concat(
        [
            true_rows.sample(true_size, random_state=random_state),
            false_rows.sample(false_size, random_state=random_state),
        ]
    )
    # frac=1 reshuffles all rows so the two classes are interleaved.
    return df_split.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [157]:
# Build, save and zip one train/validation/test DatasetDict per size preset.
# The loop variables are deliberately NOT called `name` / `size`: those names
# hold the preset lists defined earlier in the notebook and must not be clobbered.
for size_name, per_label_size in output_sizes.items():
    split_df = split_data(text_df, per_label_size)

    # Train test split: 80/20 first, then 80/20 of the training part again,
    # giving 64% train / 16% validation / 20% test.
    X_train, X_test, y_train, y_test = train_test_split(
        split_df['text'], split_df["label"], test_size=0.2, random_state=2023
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=2023
    )

    splits = {
        'train': (X_train, y_train),
        'validation': (X_val, y_val),
        'test': (X_test, y_test),
    }

    data_ds = DatasetDict()
    for split_name, (X_part, y_part) in splits.items():
        # Re-join text and label into one frame and cast both columns to str.
        part_df = pd.concat([X_part, y_part], axis=1).astype(str)
        # preserve_index=False stops the shuffled pandas index from being
        # stored as an extra `__index_level_0__` column, so nothing has to be
        # removed afterwards.
        # NOTE(review): labels are class-encoded per split; this presumes
        # every split contains both classes so the integer ids line up.
        data_ds[split_name] = Dataset.from_pandas(
            part_df, preserve_index=False
        ).class_encode_column("label")

    path = f'./transformer_datasets/fnn_{size_name}'
    data_ds.save_to_disk(path)

    # Zip the saved dataset directory next to it as `<path>.zip`.
    archived = shutil.make_archive(path, 'zip', path)

    print("File created", archived)
    print({
        "train": get_shape(X_train, y_train),
        "val": get_shape(X_val, y_val),
        "test": get_shape(X_test, y_test),
    })
    print("=================================")

    

                                                                                                

File created /home/kooler/dev/sw/sns-fake-content/dataset/processed_data/transformer_datasets/fnn_3xs.zip
{'train': {'shape': (3840,), 'true': 1917, 'false': 1923}, 'val': {'shape': (960,), 'true': 462, 'false': 498}, 'test': {'shape': (1200,), 'true': 621, 'false': 579}}


                                                                                                  

File created /home/kooler/dev/sw/sns-fake-content/dataset/processed_data/transformer_datasets/fnn_2xs.zip
{'train': {'shape': (25600,), 'true': 12834, 'false': 12766}, 'val': {'shape': (6400,), 'true': 3194, 'false': 3206}, 'test': {'shape': (8000,), 'true': 3972, 'false': 4028}}


                                                                                                  

File created /home/kooler/dev/sw/sns-fake-content/dataset/processed_data/transformer_datasets/fnn_xs.zip
{'train': {'shape': (51200,), 'true': 25635, 'false': 25565}, 'val': {'shape': (12800,), 'true': 6366, 'false': 6434}, 'test': {'shape': (16000,), 'true': 7999, 'false': 8001}}


                                                                                                    

File created /home/kooler/dev/sw/sns-fake-content/dataset/processed_data/transformer_datasets/fnn_s.zip
{'train': {'shape': (102400,), 'true': 51168, 'false': 51232}, 'val': {'shape': (25600,), 'true': 12784, 'false': 12816}, 'test': {'shape': (32000,), 'true': 16048, 'false': 15952}}


                                                                                                    

File created /home/kooler/dev/sw/sns-fake-content/dataset/processed_data/transformer_datasets/fnn_m.zip
{'train': {'shape': (204800,), 'true': 102120, 'false': 102680}, 'val': {'shape': (51200,), 'true': 25815, 'false': 25385}, 'test': {'shape': (64000,), 'true': 32065, 'false': 31935}}


                                                                                                    

File created /home/kooler/dev/sw/sns-fake-content/dataset/processed_data/transformer_datasets/fnn_l.zip
{'train': {'shape': (625964,), 'true': 320125, 'false': 305839}, 'val': {'shape': (156491,), 'true': 80097, 'false': 76394}, 'test': {'shape': (195614,), 'true': 99778, 'false': 95836}}


                                                                                                    

File created /home/kooler/dev/sw/sns-fake-content/dataset/processed_data/transformer_datasets/fnn_xl.zip
{'train': {'shape': (918128,), 'true': 612025, 'false': 306103}, 'val': {'shape': (229532,), 'true': 153330, 'false': 76202}, 'test': {'shape': (286915,), 'true': 191151, 'false': 95764}}
