In [1]:
import pandas as pd
import numpy as np
import nlpaug.augmenter.word as nlpaw
from tqdm import tqdm



def augment_sentence(sentence, aug, num_threads):
    """""""""
    Constructs a new sentence via text augmentation.
    
    Input:
        - sentence:     A string of text
        - aug:          An augmentation object defined by the nlpaug library
        - num_threads:  Integer controlling the number of threads to use if
                        augmenting text via CPU
    Output:
        - A string of text that been augmented
    """""""""
    return aug.augment(sentence, num_thread=num_threads)
    


def augment_text(df, aug, num_threads, num_times):
    """""""""
    Takes a pandas DataFrame and augments its text data.
    
    Input:
        - df:            A pandas DataFrame containing the columns:
                                - 'comment_text' containing strings of text to augment.
                                - 'isToxic' binary target variable containing 0's and 1's.
        - aug:           Augmentation object defined by the nlpaug library.
        - num_threads:   Integer controlling number of threads to use if augmenting
                         text via CPU
        - num_times:     Integer representing the number of times to augment text.
    Output:
        - df:            Copy of the same pandas DataFrame with augmented data 
                         appended to it and with rows randomly shuffled.
    """""""""
    
    # Get rows of data to augment
    to_augment = df[df['is_evidence']==1]
    to_augmentX = to_augment['rumor'] + " [SEP] " + to_augment['timeline']
    to_augmentY = np.ones(len(to_augmentX.index) * num_times, dtype=np.int8)
    
    # Build up dictionary containing augmented data
    aug_dict = {'comment_text':[], 'is_evidence':to_augmentY}
    for i in tqdm(range(num_times)):
        augX = [augment_sentence(x, aug, num_threads) for x in to_augmentX]
        aug_dict['comment_text'].extend(augX)
    
    # Build DataFrame containing augmented data
    aug_df = pd.DataFrame.from_dict(aug_dict)
    
    return aug_df
    

    
# Define nlpaug augmentation object 
aug10p = nlpaw.ContextualWordEmbsAug(model_path='bert-base-uncased', aug_min=1, aug_p=0.1, action="substitute")

downsampled_df = pd.read_csv("train_clean.csv")
downsample = False

if downsample:
    t0 = downsampled_df[downsampled_df['is_evidence']==0][0:20]
    t1 = downsampled_df[downsampled_df['is_evidence']==1][0:3]
    downsampled_df = pd.concat([t0, t1]).reset_index(drop=True)

num_times = int(downsampled_df['is_evidence'].value_counts()[0]/downsampled_df['is_evidence'].value_counts()[1])
print("Orignal Data: ", downsampled_df['is_evidence'].value_counts())
print("Num of Times Data Augmentation is to be Done: ", num_times)

# Upsample minority class ('isToxic' == 1) to create a roughly 50-50 class distribution
balanced_df = augment_text(downsampled_df, aug10p, num_threads=8, num_times=num_times)

Orignal Data:  is_evidence
0    22872
1      276
Name: count, dtype: int64
Num of Times Data Augmentation is to be Done:  82


100%|██████████████████████████████████████████████████████████████████████████████| 82/82 [11:52:09<00:00, 521.10s/it]


In [2]:
def df_format_rumor(df):
    if " [SEP] " in df[0]:
        return df[0].split(" [SEP]")[0]
def df_format_timeline(df):
    if " [SEP] " in df[0]:
        return df[0].split(" [SEP]")[0]
balanced_df['rumor'] = balanced_df['comment_text'].apply(df_format_rumor)
balanced_df['timeline'] = balanced_df['comment_text'].apply(df_format_timeline)

balanced_df.drop(['comment_text'], axis=1, inplace=True)
balanced_df.dropna(inplace=True)
new_train_df = pd.concat([downsampled_df, balanced_df]).reset_index(drop=True)

In [3]:
downsampled_df

Unnamed: 0,rumor,is_evidence,timeline
0,“#Urgent / Ramallah Ministry of Health spokesm...,0,"""Qalqilya (5), Bethlehem (10), Nablus (24), Je..."
1,“#Urgent / Ramallah Ministry of Health spokesm...,0,"Salfit (49), Jerusalem suburbs (61), Tubas (18..."
2,“#Urgent / Ramallah Ministry of Health spokesm...,0,“While the percentage of active infections rea...
3,“#Urgent / Ramallah Ministry of Health spokesm...,0,Epidemiological report on the Corona virus in ...
4,“#Urgent / Ramallah Ministry of Health spokesm...,0,"“Salfit (4), Jericho and Al-Aghwar (5), Nablus..."
...,...,...,...
23143,"Officially, Mortada Mansour returns as preside...",0,"Dr. Ashraf Sobhi, Minister of Youth and Sports..."
23144,"Officially, Mortada Mansour returns as preside...",0,"Dr. Ashraf Sobhi, Minister of Youth and Sports..."
23145,"Officially, Mortada Mansour returns as preside...",0,The Ministry of Youth and Sports conducts phys...
23146,"Officially, Mortada Mansour returns as preside...",0,Egypt wins for the second time in a row organi...


In [4]:
balanced_df

Unnamed: 0,is_evidence,rumor,timeline
0,1,“ # urgent / ramallah ministry of health spoke...,“ # urgent / ramallah ministry of health spoke...
2,1,“ # urgent / ramallah ministry of health spoke...,“ # urgent / ramallah ministry of health spoke...
3,1,... # urgent / ramallah ministry of health spo...,... # urgent / ramallah ministry of health spo...
4,1,macron via sky news : after my visit for mrs. ...,macron via sky news : after my visit for mrs. ...
5,1,macron to sky news : after a visit to mrs. fai...,macron to sky news : after a visit to mrs. fai...
...,...,...,...
22626,1,# urgent # [UNK].. the young # yemeni # americ...,# urgent # [UNK].. the young # yemeni # americ...
22627,1,by presidential order.. canceling the eliminat...,by presidential order.. canceling the eliminat...
22628,1,two residents working for one of the companies...,two residents working for one of the companies...
22630,1,saudi arabia cancels annual health restriction...,saudi arabia cancels annual health restriction...


In [5]:
new_train_df

Unnamed: 0,rumor,is_evidence,timeline
0,“#Urgent / Ramallah Ministry of Health spokesm...,0,"""Qalqilya (5), Bethlehem (10), Nablus (24), Je..."
1,“#Urgent / Ramallah Ministry of Health spokesm...,0,"Salfit (49), Jerusalem suburbs (61), Tubas (18..."
2,“#Urgent / Ramallah Ministry of Health spokesm...,0,“While the percentage of active infections rea...
3,“#Urgent / Ramallah Ministry of Health spokesm...,0,Epidemiological report on the Corona virus in ...
4,“#Urgent / Ramallah Ministry of Health spokesm...,0,"“Salfit (4), Jericho and Al-Aghwar (5), Nablus..."
...,...,...,...
42705,# urgent # [UNK].. the young # yemeni # americ...,1,# urgent # [UNK].. the young # yemeni # americ...
42706,by presidential order.. canceling the eliminat...,1,by presidential order.. canceling the eliminat...
42707,two residents working for one of the companies...,1,two residents working for one of the companies...
42708,saudi arabia cancels annual health restriction...,1,saudi arabia cancels annual health restriction...


In [6]:
print("Orignal Data: ", downsampled_df['is_evidence'].value_counts())
print("Enhanced Data: ", new_train_df['is_evidence'].value_counts())

Orignal Data:  is_evidence
0    22872
1      276
Name: count, dtype: int64
Enhanced Data:  is_evidence
0    22872
1    19838
Name: count, dtype: int64


In [12]:
new_train_df = new_train_df.sample(frac=1).reset_index(drop=True)
new_train_df.to_csv("./data/new_train_clean_augmented.csv", index=False)

In [13]:
new_train_df

Unnamed: 0,rumor,is_evidence,timeline
0,🚨- Okaz: Qatar Airways is an official sponsor ...,0,ISSUE: couldn't translate
1,Macron to Sky News: After my visit to Mrs. Fai...,0,Beirut night searching for a pulse.. https://t...
2,The official spokesman for the Football Associ...,0,Watch Hussein Al-Shahat meet his friends with ...
3,"🛑The Guardian"" warns of the outbreak of a dead...",0,UNEP opens nominations for prestigious Champio...
4,The World Health Organization calls for stoppi...,0,"#Tigray UN agencies indicate that 100,000 chil..."
...,...,...,...
42705,[UNK] : storming of the headquarters of channe...,1,[UNK] : storming of the headquarters of channe...
42706,The Mufti of Turkey: The Arabs’ attacks on Erd...,0,ISSUE: couldn't translate
42707,🚨- Okaz: Qatar Airways is an official sponsor ...,0,ISSUE: couldn't translate
42708,#Erdogan: We will teach the glorious June 30 r...,0,The Cassation Prosecution recommends - in its ...
