In [112]:
# Mount Google Drive into the Colab filesystem so the project folder under
# /content/drive is reachable; force_remount=True asks for a fresh mount even
# if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [113]:
# Root of the project folder on the mounted Google Drive.
ABSOLUTE_PATH = "/content/drive/My Drive/deepfake_tweets/"
# Directory the dataset CSVs live in.
DATASET_DIR_PATH = f"{ABSOLUTE_PATH}data/"
# Directory the preprocessed splits are written to (nested inside the data dir).
EXPORT_DIR_PATH = f"{ABSOLUTE_PATH}data/preprocessed/"

# Data Preprocessing (Tweepfake)

In [114]:
import os
import re
import html

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import TweetTokenizer


# Fetch the NLTK resources this notebook relies on; the log below shows these
# calls only report "already up-to-date" when the data is cached.
# NOTE(review): 'punkt' does not appear to be needed by the code in this
# notebook (TweetTokenizer is the only tokenizer used) — confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')  # word list read via stopwords.words("english")
nltk.download('wordnet')  # presumably the data behind WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [115]:
# Load the three pre-split TweepFake CSVs (semicolon-delimited) from the
# configured dataset directory instead of re-spelling the "data/" sub-path
# for every file.
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(DATASET_DIR_PATH, f"tweepfake_{split}.csv"), sep=";")
    for split in ("train", "val", "test")
)

# Stack the splits into a single frame (fresh 0..n-1 index) for a quick look;
# the per-split frames remain the ones processed further down.
df = pd.concat([train_df, val_df, test_df], ignore_index=True)
df.head()

Unnamed: 0,screen_name,text,account.type,class_type
0,imranyebot,YEA now that note GOOD,bot,others
1,zawvrk,Listen to This Charming Man by The Smiths htt...,human,human
2,zawarbot,wish i can i would be seeing other hoes on the...,bot,others
3,ahadsheriffbot,The decade in the significantly easier schedul...,bot,others
4,kevinhookebot,"""Theim class=\""alignnone size-full wp-image-60...",bot,rnn


## Data Cleaning

In [116]:
# Keep the stopword set under its own name. Assigning it to `stopwords` would
# overwrite the imported nltk corpus reader, so running this cell a second time
# would fail on `.words` (a set has no such attribute).
ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))

def clean_data(text):
  """Clean one tweet ahead of tokenization/normalisation.

  Steps, in order: decode HTML entities, drop literal ``<U+XXXX>`` markers,
  replace hashtags and mentions with placeholder tags, then remove
  whitespace-separated tokens that are English stopwords (compared
  case-insensitively).

  Args:
    text: raw tweet text (str).

  Returns:
    The cleaned text, with the surviving tokens joined by single spaces.
  """
  # URL replacement is currently disabled, so links pass through unchanged.
  # text = re.sub(r'https?://[^\s\n\r]+', '<URL>', text)

  # Decode entities first so that e.g. "&#39;" is not mistaken for a hashtag.
  text = html.unescape(text)
  # Strip literal "<U+...>" code-point markers left in the text.
  text = re.sub(r'<U\+\w+>', '', text)

  # Replace hashtags with <HASHTAG> tag
  text = re.sub(r'#\w+', '<HASHTAG>', text)

  # Replace mentions with <MENTION> tag
  text = re.sub(r'@\w+', '<MENTION>', text)

  # Remove stopwords. Matching is per whitespace-separated token, so a stopword
  # with punctuation attached (e.g. "the,") is kept.
  kept = [tkn for tkn in text.split() if tkn.lower() not in ENGLISH_STOPWORDS]

  return " ".join(kept)

# Run clean_data over the text column of every split, overwriting it in place.
for split_df in (train_df, val_df, test_df):
  split_df['text'] = split_df['text'].apply(clean_data)

# Peek at ten random rows drawn from all three cleaned splits.
df_tmp = pd.concat([train_df, val_df, test_df], ignore_index=True)
df_tmp.sample(10)

Unnamed: 0,screen_name,text,account.type,class_type
23824,realDonaldTrump,".....trial Senate, Country get back business. ...",human,human
16145,dril_gpt2,"(To Future Son-In-Law) thinking Archer thing, ...",bot,gpt2
2019,zawvrk,<MENTION> seeing hoes low kids moe,human,human
2672,AINarendraModi,film speeding posts people.,bot,rnn
11627,imranye,2020 i’m still simp,human,human
9263,kevinhooke,Choosing right tool job important choosing som...,human,human
8203,AINarendraModi,Dear first meeting India-Atara congratulations...,bot,rnn
15689,kevinhookebot,"""The technolog https://t.co/NhWmyRSoDZ systemd...",bot,rnn
20747,kevinhooke,ship Asteroids hidden plain sight GPS system h...,human,human
25099,zawarbot,claire HEAVEN,bot,others


In [117]:
# Preview the first rows of the training split after cleaning.
train_df.head()

Unnamed: 0,screen_name,text,account.type,class_type
0,imranyebot,YEA note GOOD,bot,others
1,zawvrk,Listen Charming Man Smiths https://t.co/r12OIX...,human,human
2,zawarbot,wish would seeing hoes worst part,bot,others
3,ahadsheriffbot,decade significantly easier schedule like h…,bot,others
4,kevinhookebot,"""Theim class=\""alignnone size-full wp-image-60...",bot,rnn


## Tokenization

In [118]:
# Shared tweet-aware tokenizer: lower-cases tokens, drops @handles, and caps
# runs of three or more repeated characters at three ("cooool" -> "coool").
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

def tokenize_data(text):
  """Tokenize `text` with the shared TweetTokenizer and re-join the tokens with single spaces."""
  return ' '.join(tokenizer.tokenize(text))

# train_df['text_tokens'] = train_df['text'].apply(tokenize_data)
# val_df['text_tokens'] = val_df['text'].apply(tokenize_data)
# test_df['text_tokens'] = test_df['text'].apply(tokenize_data)

# df_tmp = pd.concat([train_df, val_df, test_df], ignore_index=True)
# df_tmp.sample(10)

In [119]:
# Preview the training split again; the tokenization calls in the previous cell
# are commented out, so the text column is unchanged from the cleaning step.
train_df.head()

Unnamed: 0,screen_name,text,account.type,class_type
0,imranyebot,YEA note GOOD,bot,others
1,zawvrk,Listen Charming Man Smiths https://t.co/r12OIX...,human,human
2,zawarbot,wish would seeing hoes worst part,bot,others
3,ahadsheriffbot,decade significantly easier schedule like h…,bot,others
4,kevinhookebot,"""Theim class=\""alignnone size-full wp-image-60...",bot,rnn


## Data Normalisation

In [120]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def normalise_data(text):
  """Stem, then lemmatize, every whitespace-separated token of `text`.

  Each token is passed through the Porter stemmer and the resulting stem
  through the WordNet lemmatizer; the results are joined with single spaces.

  NOTE(review): the preview printed after this step shows tokens (URLs
  included) coming out lower-cased — presumably the stemmer's default
  behaviour; confirm that this is intended.
  """
  return ' '.join(lemmatizer.lemmatize(stemmer.stem(tkn)) for tkn in text.split())

# Normalise the text column of every split, overwriting it in place.
for split_df in (train_df, val_df, test_df):
  split_df['text'] = split_df['text'].apply(normalise_data)

# Show the first normalised training rows.
train_df.head()

Unnamed: 0,screen_name,text,account.type,class_type
0,imranyebot,yea note good,bot,others
1,zawvrk,listen charm man smith https://t.co/r12oixkfuo,human,human
2,zawarbot,wish would see hoe worst part,bot,others
3,ahadsheriffbot,decad significantli easier schedul like h…,bot,others
4,kevinhookebot,"""theim class=\""alignnon size-ful wp-image-6017...",bot,rnn


## Exporting preprocessed dataset

In [121]:
# Create the export directory (and any missing parents). exist_ok=True replaces
# the separate existence check and tolerates the directory already being there.
os.makedirs(EXPORT_DIR_PATH, exist_ok=True)

# Write each processed split under its original file name, without the pandas
# index. These files use to_csv's default comma separator, unlike the
# ';'-separated inputs read at the top of the notebook.
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    split_df.to_csv(os.path.join(EXPORT_DIR_PATH, f"tweepfake_{split_name}.csv"), index=False)

print(f"Datasets saved to {EXPORT_DIR_PATH}")

Datasets saved to /content/drive/My Drive/deepfake_tweets/data/preprocessed/
