In [16]:
# Colab-only setup: mount Google Drive at /content/drive so the dataset
# directories configured in the next cell resolve. force_remount=True is passed
# so the mount is requested again on every run of this cell.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [17]:
# Project root on the mounted Google Drive; every other location derives from it.
ABSOLUTE_PATH = "/content/drive/My Drive/deepfake_tweets"
# Directory the raw dataset CSV splits are read from.
RAW_DATASET_DIR = f"{ABSOLUTE_PATH}/data"
# Cleaned splits are written to a subfolder of the raw data directory.
EXPORT_DATASET_DIR = f"{RAW_DATASET_DIR}/preprocessed"

# Data Preprocessing (TweepFake)

In [18]:
import os
import re
import html
import string

import pandas as pd
# import nltk

In [19]:
# Load the train / validation / test splits, in that order.
# The raw files are semicolon-delimited, hence sep=";".
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(RAW_DATASET_DIR, csv_name), sep=";")
    for csv_name in (
        "tweepfake_train.csv",
        "tweepfake_val.csv",
        "tweepfake_test.csv",
    )
)

## Data Cleaning

In [20]:
def clean_data(text):
    """Normalise one tweet's text.

    Steps, in order:
      1. Decode HTML entities (e.g. ``&amp;`` -> ``&``).
      2. Drop ``<U+XXXX>`` placeholders.
      3. Replace links (``http://``, ``https://`` or ``www.``) with ``<URL>``.
      4. Replace ``@name`` mentions with ``<MENTION>``.
      5. Strip the leading ``#`` from hashtags, keeping the tag text.
      6. Collapse whitespace runs to a single space and trim both ends.

    Args:
        text: A single cell value from the text column. Normally a ``str``;
            a missing scalar (``None``, NaN, ``pd.NA``) yields ``""`` and any
            other non-string scalar is converted with ``str()`` first.

    Returns:
        The cleaned string.
    """
    # read_csv yields NaN (a float) for empty fields, which html.unescape
    # cannot handle; treat missing values as empty text instead of crashing.
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)

    # Decode HTML entities first so escaped placeholders such as
    # "&lt;U+1F600&gt;" become "<U+1F600>" and are caught by the next step.
    text = html.unescape(text)
    text = re.sub(r'<U\+\w+>', '', text)

    # Replace links with <URL> tag
    text = re.sub(r'(https?://\S+|www\.\S+)', '<URL>', text)

    # Replace mentions with <MENTION> tag
    text = re.sub(r'@\w+', '<MENTION>', text)

    # Remove the leading hashtag sign but keep the tag text
    text = re.sub(r'#([^\s]+)', r'\1', text)

    # Normalise whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Clean the text column of every split in place, then preview the training split.
for split_df in (train_df, val_df, test_df):
    split_df['text'] = split_df['text'].apply(clean_data)

train_df.head()

Unnamed: 0,screen_name,text,account.type,class_type
0,imranyebot,YEA now that note GOOD,bot,others
1,zawvrk,Listen to This Charming Man by The Smiths <URL>,human,human
2,zawarbot,wish i can i would be seeing other hoes on the...,bot,others
3,ahadsheriffbot,The decade in the significantly easier schedul...,bot,others
4,kevinhookebot,"""Theim class=\""alignnone size-full wp-image-60...",bot,rnn


In [21]:
# Stack all three splits into one frame for inspection; ignore_index=True
# renumbers the rows 0..n-1 instead of keeping each split's own index.
df = pd.concat([train_df, val_df, test_df], ignore_index=True)

# Fixed seed so the same 20 rows are previewed after Restart & Run All.
df.sample(20, random_state=42)

Unnamed: 0,screen_name,text,account.type,class_type
803,kevinhookebot,"""The can achs some to dep in the leary of the ...",bot,rnn
14243,zawarbot,oi bruv take pics of learning how some coochie...,bot,others
9032,zawvrk,this is golden why does nobody understand it,human,human
19123,dril,people keep replying to my posts accusing me o...,human,human
1070,imranyebot,"I walked 5 miles today instead of the world, h...",bot,others
6521,nsp_gpt2,ninja brian just murdered someone with his min...,bot,gpt2
18677,JustinTrudeau,"As Premier and Senator, John Buchanan served N...",human,human
15929,zawvrk,<MENTION> wow I was just wishing for a pair of...,human,human
22556,narendramodi,It was an honour to have visited the Sree Sidd...,human,human
5568,realDonaldTrump,Tremendous Spirit! <URL>,human,human


## Exporting preprocessed dataset

In [22]:
# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(EXPORT_DATASET_DIR, exist_ok=True)

# Write each cleaned split under its original file name. index=False keeps the
# DataFrame index out of the file. Note the output uses to_csv's default comma
# separator, whereas the raw files were read with sep=";".
for csv_name, split_df in (
    ("tweepfake_train.csv", train_df),
    ("tweepfake_val.csv", val_df),
    ("tweepfake_test.csv", test_df),
):
    split_df.to_csv(os.path.join(EXPORT_DATASET_DIR, csv_name), index=False)

print(f"Datasets saved to {EXPORT_DATASET_DIR}")

Datasets saved to /content/drive/My Drive/deepfake_tweets/data/preprocessed
