In [100]:
from google.colab import drive
# Mount Google Drive at the path below; force_remount=True is passed to drive.mount.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


# Data Preprocessing (Tweepfake)

In [101]:
import os
import re

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

# Download the NLTK resources this notebook requests, in a fixed order
for nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)


# Project folder on the mounted Drive, and the sub-folder that receives the
# preprocessed CSV files
ABSOLUTE_PATH = "/content/drive/My Drive/deepfake_tweets"
EXPORT_DIR_PATH = os.path.join(ABSOLUTE_PATH, "data/preprocessed/")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [102]:
# Paths of the pre-split train / validation / test CSV files
train_set_path = os.path.join(ABSOLUTE_PATH, "data/tweepfake_train.csv")
val_set_path = os.path.join(ABSOLUTE_PATH, "data/tweepfake_val.csv")
test_set_path = os.path.join(ABSOLUTE_PATH, "data/tweepfake_test.csv")

# Read each split; the files are semicolon-separated
train_df, val_df, test_df = [
    pd.read_csv(csv_path, sep=";")
    for csv_path in (train_set_path, val_set_path, test_set_path)
]

# Stack the three splits into one DataFrame with a fresh 0..n-1 index
df = pd.concat([train_df, val_df, test_df], ignore_index=True)

## Data Cleaning

In [103]:
def clean_data(text):
  """Replace tweet-specific entities in `text` with placeholder tags.

  Links become <URL>, hashtags become <HASHTAG> and mentions become
  <MENTION>. The substitutions run in that order, so a URL (including any
  '#' or '@' inside it) is collapsed before the other two patterns are tried.

  Args:
    text: the raw tweet as a string.

  Returns:
    The tweet with every matched entity replaced by its tag.
  """
  substitutions = (
      (r'https?://[^\s\n\r]+', '<URL>'),  # links
      (r'#\w+', '<HASHTAG>'),             # hashtags
      (r'@\w+', '<MENTION>'),             # mentions
  )
  for pattern, tag in substitutions:
    text = re.sub(pattern, tag, text)
  return text

# Clean the raw tweet text of every split, overwriting the `text` column
for split_df in (train_df, val_df, test_df):
  split_df['text'] = split_df['text'].apply(clean_data)

## Tokenization

In [104]:
# Shared tokenizer instance, configured with preserve_case=False,
# strip_handles=True and reduce_len=True
tk = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)

def tokenize_data(text):
  """Tokenize `text` with the shared `tk` tokenizer and return the tokens joined by single spaces."""
  return ' '.join(tk.tokenize(text))

# Add a `text_tokens` column holding the token list of each cleaned tweet
for split_df in (train_df, val_df, test_df):
  split_df['text_tokens'] = split_df['text'].apply(tk.tokenize)

# df_tmp = pd.concat([train_df, val_df, test_df], ignore_index=True)
# df_tmp.sample(10)
train_df.head()

Unnamed: 0,screen_name,text,account.type,class_type,text_tokens
0,imranyebot,YEA now that note GOOD,bot,others,"[yea, now, that, note, good]"
1,zawvrk,Listen to This Charming Man by The Smiths <URL>,human,human,"[listen, to, this, charming, man, by, the, smi..."
2,zawarbot,wish i can i would be seeing other hoes on the...,bot,others,"[wish, i, can, i, would, be, seeing, other, ho..."
3,ahadsheriffbot,The decade in the significantly easier schedul...,bot,others,"[the, decade, in, the, significantly, easier, ..."
4,kevinhookebot,"""Theim class=\""alignnone size-full wp-image-60...",bot,rnn,"["", theim, class, =\, "", alignnone, size-full,..."


## Data Normalisation

In [105]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def normalise_data(text):
  """Stem, then lemmatize, each token and return them as one string.

  Args:
    text: an iterable of tokens (one tweet's token list).

  Returns:
    The normalised tokens joined by single spaces.
  """
  # Each token is stemmed first and the stem is then lemmatized
  normalised = (lemmatizer.lemmatize(stemmer.stem(token)) for token in text)
  return ' '.join(normalised)

# Normalise the token lists of every split. `text_tokens` is overwritten and
# changes from a list of tokens to a single space-joined string.
# NOTE: re-run the tokenization cell before re-running this one; otherwise
# normalise_data would iterate over the characters of the stored strings.
for split_df in (train_df, val_df, test_df):
  split_df['text_tokens'] = split_df['text_tokens'].apply(normalise_data)

train_df.head()

Unnamed: 0,screen_name,text,account.type,class_type,text_tokens
0,imranyebot,YEA now that note GOOD,bot,others,yea now that note good
1,zawvrk,Listen to This Charming Man by The Smiths <URL>,human,human,listen to thi charm man by the smith <url>
2,zawarbot,wish i can i would be seeing other hoes on the...,bot,others,wish i can i would be see other hoe on the wor...
3,ahadsheriffbot,The decade in the significantly easier schedul...,bot,others,the decad in the significantli easier schedul ...
4,kevinhookebot,"""Theim class=\""alignnone size-full wp-image-60...",bot,rnn,""" theim class =\ "" alignnon size-ful wp-imag -..."


## Exporting preprocessed dataset

In [106]:
# Create the export directory (and any missing parents). exist_ok=True makes
# this a no-op when the directory already exists and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(EXPORT_DIR_PATH, exist_ok=True)

# Write each split to its own CSV in the export directory, without the index
exports = (
    ("tweepfake_train.csv", train_df),
    ("tweepfake_val.csv", val_df),
    ("tweepfake_test.csv", test_df),
)
for file_name, split_df in exports:
    split_df.to_csv(os.path.join(EXPORT_DIR_PATH, file_name), index=False)

print(f"Datasets saved to {EXPORT_DIR_PATH}")

Datasets saved to /content/drive/My Drive/deepfake_tweets/data/preprocessed/
