# FakeNewsNet Transformation

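Transform the raw FakeNewsNet tweet data into the target shape: the original tweet text, its preprocessed token list, and a `true`/`false` label.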

In [6]:
import pandas as pd

In [7]:
# Read the four raw tweet dumps (GossipCop / PolitiFact, fake / real), one
# DataFrame per JSON file, in the same order as the names on the left.
(
    gossipcop_fake_df_raw,
    gossipcop_real_df_raw,
    politifact_fake_df_raw,
    politifact_real_df_raw,
) = map(
    pd.read_json,
    (
        './data/gossipcop_fake.json',
        './data/gossipcop_real.json',
        './data/politifact_fake.json',
        './data/politifact_real.json',
    ),
)

In [8]:
# Stack the two sources of each class and tag every row with its class label,
# then merge both classes into a single master frame.
false_df_raw = pd.concat([gossipcop_fake_df_raw, politifact_fake_df_raw]).assign(label='false')
true_df_raw = pd.concat([gossipcop_real_df_raw, politifact_real_df_raw]).assign(label='true')

master_df_raw = pd.concat([false_df_raw, true_df_raw])
print(master_df_raw.shape)

(1434575, 17)


In [9]:
# Keep only English tweets.
# The frame's index is inherited from the concatenated source frames, so labels
# may repeat across sources, and this filter additionally leaves gaps. Resetting
# to a clean 0..n-1 index keeps later label-aligned assignments (e.g. attaching
# a freshly built Series of per-row results) matched to the correct rows.
master_df_raw = master_df_raw[master_df_raw['lang'] == 'en'].reset_index(drop=True)

print(master_df_raw.lang.unique())

master_df_raw.head()

['en']


Unnamed: 0,lang,id,entities,public_metrics,context_annotations,possibly_sensitive,created_at,author_id,text,conversation_id,edit_history_tweet_ids,reply_settings,in_reply_to_user_id,referenced_tweets,geo,withheld,label
0,en,1029123395739414528,"{'annotations': [{'start': 12, 'end': 24, 'pro...","{'retweet_count': 0, 'reply_count': 0, 'like_c...","[{'domain': {'id': '10', 'name': 'Person', 'de...",False,2018-08-13 21:51:52,1012203358512443392,On Air with Ryan Seacrest is offering you a ch...,1029123395739414528,[1029123395739414529],everyone,,,,,False
1,en,998353516434518016,"{'hashtags': [{'start': 110, 'end': 116, 'tag'...","{'retweet_count': 1, 'reply_count': 1, 'like_c...","[{'domain': {'id': '3', 'name': 'TV Shows', 'd...",False,2018-05-21 00:03:21,829904857305927680,‘American Idol’ final: How to vote for the sea...,998353516434518016,[998353516434518016],everyone,,,,,False
2,en,1051158211208736768,"{'annotations': [{'start': 82, 'end': 87, 'pro...","{'retweet_count': 0, 'reply_count': 0, 'like_c...","[{'domain': {'id': '3', 'name': 'TV Shows', 'd...",False,2018-10-13 17:10:21,31259532,@ScottDisick @KrisJenner @khloekardashian — LA...,1051158211208736768,[1051158211208736768],everyone,101928415.0,"[{'type': 'quoted', 'id': '1050443040668770304'}]",,,False
3,en,1011368336804937728,"{'annotations': [{'start': 10, 'end': 19, 'pro...","{'retweet_count': 0, 'reply_count': 0, 'like_c...","[{'domain': {'id': '10', 'name': 'Person', 'de...",False,2018-06-25 21:59:36,194346085,@foquinha Youngblood - 5 Seconds of Summer \nO...,1011289623119716352,[1011368336804937728],everyone,18141369.0,"[{'type': 'replied_to', 'id': '101128962311971...",,,False
4,en,954584822474838016,"{'annotations': [{'start': 0, 'end': 11, 'prob...","{'retweet_count': 0, 'reply_count': 0, 'like_c...","[{'domain': {'id': '10', 'name': 'Person', 'de...",True,2018-01-20 05:22:11,31469390,Kylie Jenner ‘Open’ To Reconciliation With Tyg...,954584822474838016,[954584822474838016],everyone,,,,,False


## Preprocessing

https://www.kaggle.com/code/sudalairajkumar/getting-started-with-text-preprocessing

In [10]:
# Remove URLs (http://..., https://..., and bare www....) from the tweet text.
import re

url_pattern = re.compile(r'https?://\S+|www\.\S+')

# Each match is replaced by the empty string; the rest of the text is untouched.
master_df_raw['text'] = master_df_raw['text'].map(lambda tweet: url_pattern.sub('', tweet))

In [11]:
# Remove emoji from the tweet text.
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
# NOTE(review): the last range (U+24C2..U+1F251) is very wide and also spans
# many non-emoji code points (e.g. CJK ideographs) — confirm this is intended.

_emoji_ranges = (
    "\U0001F600-\U0001F64F",  # emoticons
    "\U0001F300-\U0001F5FF",  # symbols & pictographs
    "\U0001F680-\U0001F6FF",  # transport & map symbols
    "\U0001F1E0-\U0001F1FF",  # flags (iOS)
    "\U00002702-\U000027B0",
    "\U000024C2-\U0001F251",
)
# One character class containing every range above, matched as a run.
emoji_pattern = re.compile("[" + "".join(_emoji_ranges) + "]+", flags=re.UNICODE)

master_df_raw["text"] = master_df_raw["text"].map(lambda tweet: emoji_pattern.sub("", tweet))


In [12]:
import spacy

# Load the "en_core_web_sm" pipeline with the 'parser' and 'ner' components
# disabled; the tokenization helper in this notebook only reads token.lemma_,
# token.is_stop and token.is_alpha.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

In [13]:
# Tokenization
def get_tokenized_text(doc):
    """Return the lower-cased lemmas of the usable tokens in ``doc``.

    A token is kept only if it is not a stop word and is purely alphabetic
    (``token.is_stop`` is false and ``token.is_alpha`` is true). Tokens whose
    lower-cased lemma is the empty string are dropped as well.

    Args:
        doc: An iterable of tokens exposing ``lemma_``, ``is_stop`` and
            ``is_alpha``.

    Returns:
        A list of lemma strings in document order.
    """
    lemmas = []
    for token in doc:
        # Skip stop words and anything that is not purely alphabetic.
        if token.is_stop or not token.is_alpha:
            continue
        lemma = token.lemma_.lower()
        if lemma != "":
            lemmas.append(lemma)
    return lemmas

In [14]:
from tqdm import tqdm

# Stream every tweet through the spaCy pipeline (n_process=-1) and reduce each
# resulting doc to its token list; tqdm reports progress against the row count.
doc_stream = nlp.pipe(master_df_raw["text"], n_process=-1)
texts = [
    get_tokenized_text(doc)
    for doc in tqdm(doc_stream, total=len(master_df_raw["text"]))
]


100%|██████████| 1368187/1368187 [07:11<00:00, 3171.98it/s]


In [15]:
# `texts` holds one token list per row, in row order, so attach it by position:
# build the Series on the frame's own index. A bare pd.Series(texts) would carry
# a fresh 0..n-1 index, and column assignment aligns on index labels — that
# pairs rows with the wrong token lists (or NaN) whenever the frame's index is
# not exactly 0..n-1, e.g. after concatenating sources and filtering rows.
# A length mismatch between `texts` and the frame raises ValueError here.
master_df_raw["processed_text"] = pd.Series(texts, index=master_df_raw.index)


In [16]:
# Final frame: the cleaned tweet text, its token list, and the class label.
master_df = master_df_raw.loc[:, ['text', 'processed_text', 'label']]
master_df.head()

Unnamed: 0,text,processed_text,label
0,On Air with Ryan Seacrest is offering you a ch...,"[air, ryan, seacrest, offer, chance, win, nigh...",False
1,‘American Idol’ final: How to vote for the sea...,"[american, idol, final, vote, season, winner, ...",False
2,@ScottDisick @KrisJenner @khloekardashian — LA...,"[latest, art, shame, revenge, prank, banksy, s...",False
3,@foquinha Youngblood - 5 Seconds of Summer \nO...,"[youngblood, seconds, summer, little, mix, del...",False
4,Kylie Jenner ‘Open’ To Reconciliation With Tyg...,"[kylie, jenner, open, reconciliation, tyga, pr...",False


In [17]:
# Persist the final frame (text, processed_text, label) as a pickle file.
# Note: pickle files should only be loaded from trusted sources.
master_df.to_pickle('./data/FakeNewsNet.pkl')