In [136]:
# Mount Google Drive inside the Colab runtime. The dataset paths used by this
# notebook live under this mount point, so this cell has to run first.
# force_remount=True requests a fresh mount on every run of the cell.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Data Preprocessing (Tweepfake)

In [137]:
import os
import re

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

# Fetch the NLTK resources requested by this notebook (no-op when already cached).
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)


# Project root on the mounted Drive, and the folder that receives the
# preprocessed CSV files.
ABSOLUTE_PATH = "/content/drive/My Drive/deepfake_tweets"
EXPORT_DIR_PATH = os.path.join(ABSOLUTE_PATH, "data/preprocessed/")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [138]:
# Locations of the pre-split Tweepfake CSV files
train_set_path = os.path.join(ABSOLUTE_PATH, "data/tweepfake_train.csv")
val_set_path = os.path.join(ABSOLUTE_PATH, "data/tweepfake_val.csv")
test_set_path = os.path.join(ABSOLUTE_PATH, "data/tweepfake_test.csv")

# Every file is semicolon-delimited; read them in train / val / test order
train_df, val_df, test_df = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in (train_set_path, val_set_path, test_set_path)
)

# Stack the three splits into one DataFrame with a fresh 0..n-1 index
df = pd.concat([train_df, val_df, test_df], ignore_index=True)
df.head()

Unnamed: 0,screen_name,text,account.type,class_type
0,imranyebot,YEA now that note GOOD,bot,others
1,zawvrk,Listen to This Charming Man by The Smiths htt...,human,human
2,zawarbot,wish i can i would be seeing other hoes on the...,bot,others
3,ahadsheriffbot,The decade in the significantly easier schedul...,bot,others
4,kevinhookebot,"""Theim class=\""alignnone size-full wp-image-60...",bot,rnn


## Data Cleaning

In [139]:
# English stop words as a lookup set. It is bound to its own name so that the
# imported `stopwords` corpus reader is not overwritten, which keeps this cell
# re-runnable (rebinding `stopwords` to a set makes a second run fail on `.words`).
ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))

def clean_data(text):
  """Replace tweet-specific entities with placeholder tags and drop stop words.

  Args:
    text: Tweet text as a str.

  Returns:
    str: `text` with links, hashtags and mentions replaced by `<URL>`,
    `<HASHTAG>` and `<MENTION>`, and with English stop words removed. The
    remaining tokens are re-joined with single spaces, so runs of whitespace
    in the input are collapsed.
  """
  # Links go first so that '#' or '@' inside a URL is not tagged separately.
  text = re.sub(r'https?://\S+', '<URL>', text)  # Replace links with <URL> tag
  text = re.sub(r'#\w+', '<HASHTAG>', text)  # Replace hashtags with <HASHTAG> tag
  text = re.sub(r'@\w+', '<MENTION>', text)  # Replace mentions with <MENTION> tag
  # Stop words are matched case-insensitively against whitespace-separated
  # tokens, so a word with punctuation attached (e.g. "the,") is kept.
  return " ".join(tkn for tkn in text.split()
                  if tkn.lower() not in ENGLISH_STOPWORDS)

# Clean the `text` column of every split in place
for split_df in (train_df, val_df, test_df):
  split_df['text'] = split_df['text'].apply(clean_data)

# Preview a random sample drawn from the three splits combined
df_tmp = pd.concat([train_df, val_df, test_df], ignore_index=True)
df_tmp.sample(10)

Unnamed: 0,screen_name,text,account.type,class_type
13638,narendramodi,"23rd January 1897, Janakinath Bose wrote diary...",human,human
12607,narendramodi,"Fulfilling promises, furthering people’s aspir...",human,human
19634,kevinhooke,"Every tech product discussion ever. Oh, logbun...",human,human
18760,kevinhooke,Awesome <U+0001F44D> <URL>,human,human
15170,zawarbot,WAIT could pull ur lawyer says ‘chi-polt-tee’ ...,bot,others
14088,JustinTrudeau,"Happy Canada Day! lot celebrate, it’s thanks C...",human,human
1436,zawarbot,u guys we’re pretty boy (korean) forgot abt me...,bot,others
23255,zawarbot,doctor gave like worst thing dont tag twitter ...,bot,others
18274,kevinhooke,<MENTION> <MENTION> think much computing power...,human,human
13192,kevinhookebot,"""The port=20:104250841\r\nand community app ne...",bot,rnn


## Tokenization

In [140]:
# Shared tweet tokenizer applied to every split.
tk = TweetTokenizer(
  reduce_len=True,      # caps runs of a repeated character at three: cooool -> coool
  strip_handles=True,   # drops @handles from the token stream
  preserve_case=False,  # lowercases tokens, placeholder tags included (<MENTION> -> <mention>)
)

def tokenize_data(text):
  """Tokenize `text` with the shared `tk` tokenizer and return the tokens joined by single spaces.

  Note: the `.apply` calls in this cell pass `tk.tokenize` directly (keeping
  token lists), so this helper is not used by them.
  """
  return ' '.join(tk.tokenize(text))

# Add a `text_tokens` column holding the token list of each tweet's `text`
for split_df in (train_df, val_df, test_df):
  split_df['text_tokens'] = split_df['text'].apply(tk.tokenize)

# Preview a random sample drawn from the three splits combined
df_tmp = pd.concat([train_df, val_df, test_df], ignore_index=True)
df_tmp.sample(10)

Unnamed: 0,screen_name,text,account.type,class_type,text_tokens
21132,realDonaldTrump,"“Triggered,” great book son, Don. number one <...",human,human,"[“, triggered, ,, ”, great, book, son, ,, don,..."
17516,zawvrk,<MENTION> Im coming hotbox apartment,human,human,"[<mention>, im, coming, hotbox, apartment]"
11369,imranyebot,somali girls alone cause ain’t got the…,bot,others,"[somali, girls, alone, cause, ain, ’, t, got, ..."
23732,dril,"may seem cops fucking dumb, bad IQ tests, etc,...",human,human,"[may, seem, cops, fucking, dumb, ,, bad, iq, t..."
20936,kevinhooke,useful advice probably start understanding ind...,human,human,"[useful, advice, probably, start, understandin..."
2006,imranyebot,talk tone stop travel life would y’all,bot,others,"[talk, tone, stop, travel, life, would, y, ’, ..."
17934,AINarendraModi,commitment trajector President <MENTION>,bot,rnn,"[commitment, trajector, president, <mention>]"
9941,kevinhookebot,"""The public install search install type trying...",bot,rnn,"["", the, public, install, search, install, typ..."
18536,dril,chose fighter <URL>,human,human,"[chose, fighter, <url>]"
1989,imranyebot,game advice,bot,others,"[game, advice]"


## Data Normalisation

In [141]:
# Porter stemmer and WordNet lemmatizer shared by every call to normalise_data.
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

def normalise_data(text):
  """Stem, then lemmatize, every token and return them joined by single spaces.

  Args:
    text: Iterable of token strings (such as a `text_tokens` list), or an
      already space-joined string of tokens.

  Returns:
    str: the normalised tokens separated by one space.
  """
  # The cell that calls this overwrites `text_tokens` with the joined string, so
  # on a re-run the input is a str. Split it back into tokens; iterating a str
  # directly would stem it character by character.
  if isinstance(text, str):
    text = text.split()
  return ' '.join(lemmatizer.lemmatize(stemmer.stem(tkn)) for tkn in text)

# Replace each entry of `text_tokens` with its normalised, space-joined string
for split_df in (train_df, val_df, test_df):
  split_df['text_tokens'] = split_df['text_tokens'].apply(normalise_data)

train_df.head()

Unnamed: 0,screen_name,text,account.type,class_type,text_tokens
0,imranyebot,YEA note GOOD,bot,others,yea note good
1,zawvrk,Listen Charming Man Smiths <URL>,human,human,listen charm man smith <url>
2,zawarbot,wish would seeing hoes worst part,bot,others,wish would see hoe worst part
3,ahadsheriffbot,decade significantly easier schedule like h…,bot,others,decad significantli easier schedul like h …
4,kevinhookebot,"""Theim class=\""alignnone size-full wp-image-60...",bot,rnn,""" theim class =\ "" alignnon size-ful wp-imag -..."


## Exporting preprocessed dataset

In [142]:
# Create the export folder when missing. exist_ok=True replaces the separate
# existence check, so there is no window between checking and creating.
os.makedirs(EXPORT_DIR_PATH, exist_ok=True)

# One CSV per split (tweepfake_train.csv / _val.csv / _test.csv), written
# without the DataFrame index.
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    split_df.to_csv(os.path.join(EXPORT_DIR_PATH, f"tweepfake_{split_name}.csv"), index=False)

print(f"Datasets saved to {EXPORT_DIR_PATH}")

Datasets saved to /content/drive/My Drive/deepfake_tweets/data/preprocessed/
