In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import shutil 
import os.path

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Read the four FakeNewsNet JSON dumps (GossipCop / PolitiFact, fake / real)
# into one DataFrame each. Order of the paths matches the unpacking below.
_fnn_json_paths = (
    '../FakeNewsNet/data/gossipcop_fake.json',
    '../FakeNewsNet/data/gossipcop_real.json',
    '../FakeNewsNet/data/politifact_fake.json',
    '../FakeNewsNet/data/politifact_real.json',
)
(
    df_gossipcop_fake,
    df_gossipcop_real,
    df_politifact_fake,
    df_politifact_real,
) = map(pd.read_json, _fnn_json_paths)

## Master DF

In [24]:
# Merge the two sources per veracity class; ignore_index=True renumbers the
# rows 0..n-1 so the original per-file indices do not collide.
fake_sources = (df_gossipcop_fake, df_politifact_fake)
real_sources = (df_gossipcop_real, df_politifact_real)
fake_df = pd.concat(fake_sources, ignore_index=True)
real_df = pd.concat(real_sources, ignore_index=True)

In [35]:
# Tag every row with its class; labels are the strings 'false' / 'true'
# (not booleans), which is what the later filtering compares against.
for frame, tag in ((fake_df, 'false'), (real_df, 'true')):
    frame['label'] = tag

In [36]:
# Stack both classes into one frame with a fresh 0..n-1 index.
master_df = pd.concat((fake_df, real_df), axis=0, ignore_index=True)

In [38]:
# Cache the merged frame on disk; the "Text Only" section reloads it from here.
master_df.to_pickle(path='./master_fnn.pkl')

## Text Only

In [2]:
# Reload the cached merged frame. Unpickling executes arbitrary code, so only
# load a file this notebook produced itself.
master_df = pd.read_pickle(filepath_or_buffer='./master_fnn.pkl')

In [12]:
# Keep only English rows and the two columns needed downstream, both cast to
# str. Selecting rows and columns in one .loc call and casting with .astype
# yields an independent frame, so no column is ever assigned on a slice of
# master_df (the pattern that raises SettingWithCopyWarning).
text_df = (
    master_df
    .loc[master_df['lang'] == 'en', ['text', 'label']]
    .astype(str)
)

In [13]:
# Persist the English text/label frame next to the notebook.
text_df.to_pickle(path='./text_fnn.pkl')

# Slice Dataset

| Size | Amount (max rows per label) |
| ---- | ------ |
| 3xs | 3000 |
| 2xs | 10000 |
| xs | 20000 |
| s | 80000 |
| m | 160000 |
| l | ~500000 |
| xl | all ~1300000 |

In [14]:
# Dataset variants, smallest to largest. Each value is the per-label cap that
# split_data applies; 'xl' uses the full row count, so nothing is dropped.
name = ['3xs', '2xs', 'xs', 's', 'm', 'l', 'xl']
size = [3000, 10000, 20000, 80000, 160000, 500000, len(text_df)]
output_sizes = {label: cap for label, cap in zip(name, size)}
output_sizes

{'3xs': 3000,
 '2xs': 10000,
 'xs': 20000,
 's': 80000,
 'm': 160000,
 'l': 500000,
 'xl': 1368187}

In [15]:
def get_shape(X, y):
    """Summarise one split: feature shape plus per-class label counts.

    Args:
        X: Array-like exposing ``.shape`` (e.g. a pandas Series of texts).
        y: Array-like of string labels supporting element-wise ``==``.

    Returns:
        dict with keys ``"shape"`` (``X.shape``), ``"true"`` (number of labels
        equal to ``"true"``) and ``"false"`` (every remaining label, i.e.
        ``len(y)`` minus the ``"true"`` count).
    """
    is_true = y == "true"
    true_count = np.count_nonzero(is_true)

    summary = {"shape": X.shape}
    summary["true"] = true_count
    # Anything that is not "true" is counted as "false".
    summary["false"] = len(y) - true_count
    return summary

In [16]:
# Number of rows labelled 'true' (sanity check on class balance).
int(text_df["label"].eq('true').sum())

932369

In [17]:
def split_data(df, size, random_state=None):
    """Draw a shuffled sample holding at most ``size`` rows per label.

    Args:
        df: DataFrame with a ``"label"`` column containing the strings
            ``'true'`` and ``'false'``.
        size: Maximum number of rows to draw for each label. A label with
            fewer rows than ``size`` contributes all of its rows, so the
            result can be imbalanced for large ``size``.
        random_state: Optional seed (or NumPy random state) forwarded to every
            ``DataFrame.sample`` call. ``None`` (the default) keeps the
            previous non-deterministic behaviour; pass an int to make the
            sample reproducible.

    Returns:
        A new DataFrame with the sampled rows of both labels, shuffled
        together, with a fresh ``0..n-1`` index.
    """
    sampled = []
    for label in ('true', 'false'):
        # Filter once per label and reuse the subset for both the size cap
        # and the draw.
        subset = df[df["label"] == label]
        n_rows = min(len(subset), size)
        sampled.append(subset.sample(n_rows, random_state=random_state))

    # frac=1 shuffles all rows so the two labels are interleaved.
    return (
        pd.concat(sampled)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

In [18]:
# For every size variant: sample the data, split it 64/16/20 into
# train/validation/test, wrap the splits in a Hugging Face DatasetDict, save
# it to disk and zip the saved folder.
for name, size in output_sizes.items():
    split_df = split_data(text_df, size)

    # First hold out 20% as test, then 20% of the remaining 80% as validation.
    X_train, X_test, y_train, y_test = train_test_split(
        split_df['text'], split_df["label"], test_size=0.2, random_state=2023
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=2023
    )

    # Build all three splits the same way: text + label columns as str, with
    # "label" converted to a ClassLabel feature.
    data_ds = DatasetDict({
        split_name: Dataset.from_pandas(
            pd.concat([features, labels], axis=1).astype(str)
        ).class_encode_column("label")
        for split_name, (features, labels) in {
            'train': (X_train, y_train),
            'validation': (X_val, y_val),
            'test': (X_test, y_test),
        }.items()
    })

    # from_pandas keeps the shuffled pandas index as an extra column; drop it.
    data_ds = data_ds.remove_columns(['__index_level_0__'])

    path = f'./transformer_datasets/fnn_{name}'
    data_ds.save_to_disk(path)

    # make_archive appends ".zip" to the base name and returns the final path.
    archived = shutil.make_archive(path, 'zip', path)

    print("File created", archived)
    print({
        "train": get_shape(X_train, y_train),
        "val": get_shape(X_val, y_val),
        "test": get_shape(X_test, y_test),
    })
    print("=================================")

    

                                                                                                

File created /home/kooler/dev/sw/sns-fake-content/dataset/processed_data/transformer_datasets/fnn_3xs.zip
{'train': {'shape': (3840,), 'true': 1897, 'false': 1943}, 'val': {'shape': (960,), 'true': 484, 'false': 476}, 'test': {'shape': (1200,), 'true': 619, 'false': 581}}


                                                                                                  

File created /home/kooler/dev/sw/sns-fake-content/dataset/processed_data/transformer_datasets/fnn_2xs.zip
{'train': {'shape': (12800,), 'true': 6425, 'false': 6375}, 'val': {'shape': (3200,), 'true': 1576, 'false': 1624}, 'test': {'shape': (4000,), 'true': 1999, 'false': 2001}}


                                                                                                  

File created /home/kooler/dev/sw/sns-fake-content/dataset/processed_data/transformer_datasets/fnn_xs.zip
{'train': {'shape': (25600,), 'true': 12769, 'false': 12831}, 'val': {'shape': (6400,), 'true': 3227, 'false': 3173}, 'test': {'shape': (8000,), 'true': 4004, 'false': 3996}}


                                                                                                    

File created /home/kooler/dev/sw/sns-fake-content/dataset/processed_data/transformer_datasets/fnn_s.zip
{'train': {'shape': (102400,), 'true': 51105, 'false': 51295}, 'val': {'shape': (25600,), 'true': 12847, 'false': 12753}, 'test': {'shape': (32000,), 'true': 16048, 'false': 15952}}


                                                                                                    

File created /home/kooler/dev/sw/sns-fake-content/dataset/processed_data/transformer_datasets/fnn_m.zip
{'train': {'shape': (204800,), 'true': 102510, 'false': 102290}, 'val': {'shape': (51200,), 'true': 25503, 'false': 25697}, 'test': {'shape': (64000,), 'true': 31987, 'false': 32013}}


                                                                                                    

File created /home/kooler/dev/sw/sns-fake-content/dataset/processed_data/transformer_datasets/fnn_l.zip
{'train': {'shape': (598923,), 'true': 319753, 'false': 279170}, 'val': {'shape': (149731,), 'true': 80129, 'false': 69602}, 'test': {'shape': (187164,), 'true': 100118, 'false': 87046}}


                                                                                                    

File created /home/kooler/dev/sw/sns-fake-content/dataset/processed_data/transformer_datasets/fnn_xl.zip
{'train': {'shape': (875639,), 'true': 596502, 'false': 279137}, 'val': {'shape': (218910,), 'true': 149458, 'false': 69452}, 'test': {'shape': (273638,), 'true': 186409, 'false': 87229}}


In [22]:
# NOTE(review): this import belongs in the top import cell; it is kept here so
# the cell stays self-contained.
from preprocess import text_preprocess

# Add the preprocessed text as a new column. .assign returns a new frame
# instead of setting a column in place, so this works without a chained-
# assignment warning even when text_df was derived by filtering master_df.
text_df = text_df.assign(processed_text=text_preprocess(text_df['text']))

100%|██████████| 1368187/1368187 [08:47<00:00, 2594.58it/s]


In [23]:
# Export every size variant as CSV, Parquet and pickle under ./datasets.
# NOTE(review): split_data is called again here without a seed, so these files
# hold a different random sample than the transformer datasets built earlier.
os.makedirs('./datasets', exist_ok=True)  # the writers below do not create the folder

for name, size in output_sizes.items():
    split_df = split_data(text_df, size)

    split_df.to_csv(f'./datasets/fnn_{name}.csv', index=False)
    print("File created", f'./datasets/fnn_{name}.csv')

    # pandas compresses Parquet with snappy by default; request gzip
    # explicitly so the content matches the ".parquet.gzip" file name.
    split_df.to_parquet(
        f'./datasets/fnn_{name}.parquet.gzip', index=False, compression='gzip'
    )
    print("File created", f'./datasets/fnn_{name}.parquet.gzip')

    split_df.to_pickle(f'./datasets/fnn_{name}.pkl')
    print("File created", f'./datasets/fnn_{name}.pkl')

    print({'shape': split_df.shape})
    print("=================================")
    

File created ./datasets/fnn_3xs.csv
File created ./datasets/fnn_3xs.parquet.gzip
File created ./datasets/fnn_3xs.pkl
{'shape': (6000, 3)}
File created ./datasets/fnn_2xs.csv
File created ./datasets/fnn_2xs.parquet.gzip
File created ./datasets/fnn_2xs.pkl
{'shape': (20000, 3)}
File created ./datasets/fnn_xs.csv
File created ./datasets/fnn_xs.parquet.gzip
File created ./datasets/fnn_xs.pkl
{'shape': (40000, 3)}
File created ./datasets/fnn_s.csv
File created ./datasets/fnn_s.parquet.gzip
File created ./datasets/fnn_s.pkl
{'shape': (160000, 3)}
File created ./datasets/fnn_m.csv
File created ./datasets/fnn_m.parquet.gzip
File created ./datasets/fnn_m.pkl
{'shape': (320000, 3)}
File created ./datasets/fnn_l.csv
File created ./datasets/fnn_l.parquet.gzip
File created ./datasets/fnn_l.pkl
{'shape': (935818, 3)}
File created ./datasets/fnn_xl.csv
File created ./datasets/fnn_xl.parquet.gzip
File created ./datasets/fnn_xl.pkl
{'shape': (1368187, 3)}
