# Transformer model

ref: https://huggingface.co/course/chapter2/2?fw=pt

## Make Dataset

In [166]:
import pandas as pd
import numpy as np
import datasets
from datasets import Dataset, DatasetDict

In [168]:
data_df = pd.read_pickle('../../../dataset/FakeNewsNet/data/FakeNewsNet.pkl')
# data_df['label_raw'] = data_df['label']
# data_df['label_num'] = data_df['label'].map({'false': 0, 'true': 1})
data_df['label'] = data_df['label'].map({'false': False, 'true': True})
data_df.head(10)

Unnamed: 0,text,processed_text,label
0,On Air with Ryan Seacrest is offering you a ch...,"[air, ryan, seacrest, offer, chance, win, nigh...",False
1,‘American Idol’ final: How to vote for the sea...,"[american, idol, final, vote, season, winner, ...",False
2,@ScottDisick @KrisJenner @khloekardashian — LA...,"[latest, art, shame, revenge, prank, banksy, s...",False
3,@foquinha Youngblood - 5 Seconds of Summer \nO...,"[youngblood, seconds, summer, little, mix, del...",False
4,Kylie Jenner ‘Open’ To Reconciliation With Tyg...,"[kylie, jenner, open, reconciliation, tyga, pr...",False
5,@Khalais1 @ibpqueen @IstantheBadGuy @_luluomar...,"[yes, studio, album, album, consistent, let, k...",False
6,@realDonaldTrump Says the Jesuit of his brothe...,"[say, jesuit, brother, entrench, rome, payday,...",False
7,Kim Kardashian Recalls “Tough Conversation” Wi...,"[kim, kardashian, recalls, tough, conversation...",False
8,"RT @rihanna: RT @RyanSeacrest: ""Nobody really ...","[rt, rt, care, miserable, happy, cynthia, nelm]",False
9,Portia de Rossi: Ellen Divorce Rumors Make Us ...,"[portia, de, rossi, ellen, divorce, rumors, fe...",False


In [169]:
# Shuffle and random pick 10000 each label
data_df_smaller = pd.concat(
    [
        # data_df[data_df["label"] == True].sample(80000),
        # data_df[data_df["label"] == False].sample(80000),
        data_df[data_df["label"] == True],
        data_df[data_df["label"] == False],
    ]
)

# Shuffle the order of the dataset
data_df = data_df_smaller.sample(frac=1).reset_index(drop=True)
data_df

Unnamed: 0,text,processed_text,label
0,"When the Royal Family Really Has a Scandal, Th...","[royal, family, scandal, mess]",True
1,on the last episode of that 70s show idk if i ...,"[episode, idk, go, to, miss]",False
2,You're Doing It Wrong: Kim Kardashian's Clear ...,"[wrong, kim, kardashian, clear, skin, met, gala]",True
3,A Shot at Love With Tila Tequila Star Ashley M...,"[shot, love, tila, tequila, star, ashley, mcne...",True
4,5 Decor Tricks Inspired by Ellen DeGeneres and...,"[decor, tricks, inspire, ellen, degeneres, por...",True
...,...,...,...
1368182,The NEVER TRUMP ARMY is waging war against our...,"[trump, army, wage, war, campaign, mitch, mcco...",True
1368183,Eddie Redmayne and Wife Hannah Make First Publ...,"[eddie, redmayne, wife, hannah, public, appear...",True
1368184,.@anniekarni Lavergne vs US House of Represent...,"[lavergne, vs, house, representatives, demand,...",False
1368185,"TODAY 20TH NOVEMBER, 2018\n10:00am\n#MeetThePr...","[today, november, meetthepress, napospeaks]",True


In [158]:
# f = open('jfnn-xs', 'w')
# print(data_df.to_json(orient='records', lines=True),file=f, flush=False)
# f.close()

In [170]:
# Train test split
from sklearn.model_selection import train_test_split


def get_shape(X, y):
    return {
        "shape": X.shape,
        "true": np.count_nonzero(y),
        "false": len(y) - np.count_nonzero(y),
    }


X_train, X_test, y_train, y_test = train_test_split(
    data_df['text'], data_df["label"], test_size=0.2, random_state=2023
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=2023
)

print("train", get_shape(X_train, y_train))
print("val", get_shape(X_val, y_val))
print("test", get_shape(X_test, y_test))


train {'shape': (875639,), 'true': 597033, 'false': 278606}
val {'shape': (218910,), 'true': 148976, 'false': 69934}
test {'shape': (273638,), 'true': 186360, 'false': 87278}


In [171]:
pd.concat([X_train, y_train], axis=1)

Unnamed: 0,text,label
4294,What's Really Going on Between Scott Disick an...,True
189163,@IanBouillion ya but you're way more interesti...,False
49350,*kourtney kardashian voice* literally mom you’...,False
98271,Tonight we are awarding Sustainable Nation Ire...,False
1227617,7/23/18: White House Press Briefing - The Tim...,True
...,...,...
902720,"CNN: Trump asking Congress, not Mexico, to pay...",True
1187158,"New post (Sorry, The Office Fans! Splitting Up...",True
47495,Demi Lovato and Ex Wilmer Valderrama Reunite A...,True
619958,@HouseGOP as over 680 different group marched ...,True


In [172]:
data_ds = DatasetDict()

data_ds['train'] = Dataset.from_pandas(pd.concat([X_train, y_train], axis=1)).class_encode_column("label")
data_ds['validation'] = Dataset.from_pandas(pd.concat([X_val, y_val], axis=1)).class_encode_column("label")
data_ds['test'] = Dataset.from_pandas(pd.concat([X_test, y_test], axis=1)).class_encode_column("label")

data_ds

Stringifying the column:   0%|          | 0/876 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/876 [00:00<?, ?ba/s]

Stringifying the column:   0%|          | 0/219 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/219 [00:00<?, ?ba/s]

Stringifying the column:   0%|          | 0/274 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/274 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 875639
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 218910
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 273638
    })
})

In [173]:
data_ds['train'].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['False', 'True'], id=None),
 '__index_level_0__': Value(dtype='int64', id=None)}

In [174]:
# 3xs: 3000
# 2xs: 20000
# xs: 40000
# s: 80000
# m: 160000
# l: ~500000
# xl: all ~1300000


# output_name = "/sample"
output_name = "/fnn_xl"

data_ds.save_to_disk('data/' + output_name)

Saving the dataset (0/1 shards):   0%|          | 0/875639 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/218910 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/273638 [00:00<?, ? examples/s]

In [None]:
assert(True)

In [None]:
liar_df = pd.read_pickle("../../../dataset/LIAR/data/master.pkl")
liar_df['text'] = liar_df['statement']
liar_df['label_raw'] = liar_df['label']
liar_df['label'] = liar_df['label_raw'].apply(lambda x: 1 if x else 0)

# Train test split
from sklearn.model_selection import train_test_split


def get_shape(X, y):
    return {
        "shape": X.shape,
        "true": np.count_nonzero(y),
        "false": len(y) - np.count_nonzero(y),
    }


X_liar_train, X_liar_test, y_liar_train, y_liar_test = train_test_split(
    liar_df["text"], liar_df["label"], test_size=0.2, random_state=2023
)
X_liar_train, X_liar_val, y_liar_train, y_liar_val = train_test_split(
    X_liar_train, y_liar_train, test_size=0.2, random_state=2023
)

print("train", get_shape(X_liar_train, y_liar_train))
print("val", get_shape(X_liar_val, y_liar_val))
print("test", get_shape(X_liar_test, y_liar_test))

train {'shape': (8185,), 'true': 4560, 'false': 3625}
val {'shape': (2047,), 'true': 1149, 'false': 898}
test {'shape': (2559,), 'true': 1425, 'false': 1134}


In [None]:
liar_df.head()

Unnamed: 0,id,label,statement,subject,speaker,job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,statement_processed,text,label_raw
0,2635.json,0,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,say Annies List political group support trimes...,Says the Annies List political group supports ...,False
1,10540.json,1,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,decline coal start start natural gas take star...,When did the decline of coal start? It started...,True
2,324.json,1,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,Hillary Clinton agree John McCain vote George ...,"Hillary Clinton agrees with John McCain ""by vo...",True
3,1123.json,0,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,health care reform legislation likely mandate ...,Health care reform legislation is likely to ma...,False
4,9028.json,1,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,economic turnaround start end term,The economic turnaround started at the end of ...,True


In [None]:
liar_ds = DatasetDict()

liar_ds['train'] = Dataset.from_pandas(pd.concat([X_liar_train, y_liar_train], axis=1))
liar_ds['validation'] = Dataset.from_pandas(pd.concat([X_liar_val, y_liar_val], axis=1))
liar_ds['test'] = Dataset.from_pandas(pd.concat([X_liar_test, y_liar_test], axis=1))

liar_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 8185
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 2047
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 2559
    })
})

In [None]:
liar_ds.save_to_disk('liar_ds')

Saving the dataset (0/1 shards):   0%|          | 0/8185 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2047 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2559 [00:00<?, ? examples/s]

---