In [15]:
import pandas as pd
import random
import csv
from nltk.corpus import wordnet, stopwords
import time


In [16]:
# English stop words from the NLTK corpus, kept in a set for fast membership
# checks; synonym_replacement() skips these when choosing words to replace.
stop_words = {*stopwords.words('english')}
def synonym_replacement(words, n):
    """Replace up to ``n`` distinct non-stop-words in ``words`` with synonyms.

    Every occurrence of a chosen word is replaced by the same randomly
    picked synonym. Words for which ``get_synonyms`` returns nothing are
    skipped and do not count towards ``n``.

    Args:
        words: List of string tokens. The list itself is not modified.
        n: Maximum number of distinct words to replace. ``n <= 0`` performs
            no replacement.

    Returns:
        A new list of tokens. Multi-word synonyms (e.g. "come out") are split
        into separate tokens, so the result may be longer than ``words``.
    """
    new_words = words.copy()
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # give an order that depends on string hashing and so varies between
    # interpreter runs even when the random module is seeded.
    candidates = list(dict.fromkeys(word for word in words if word not in stop_words))
    random.shuffle(candidates)
    num_replaced = 0
    for random_word in candidates:
        # Check the budget *before* replacing so that n <= 0 changes nothing.
        if num_replaced >= n:
            break
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1

    # Re-tokenise on single spaces so multi-word synonyms become separate tokens.
    sentence = ' '.join(new_words)
    new_words = sentence.split(' ')

    return new_words

def get_synonyms(word):
    """Return the cleaned WordNet synonyms of ``word`` as a sorted list.

    Each lemma name from every synset of ``word`` is lower-cased, has
    underscores and hyphens turned into spaces, and is stripped of every
    character other than a-z and space. Runs of whitespace are collapsed.

    Args:
        word: The word to look up in WordNet.

    Returns:
        A sorted list of unique, non-empty synonym strings that excludes
        ``word`` itself (compared both as given and lower-cased). The list is
        empty when WordNet yields no usable lemma.
    """
    allowed_chars = set(' qwertyuiopasdfghjklzxcvbnm')
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace("_", " ").replace("-", " ").lower()
            candidate = "".join(char for char in candidate if char in allowed_chars)
            # Collapse whitespace left behind by removed characters.
            candidate = " ".join(candidate.split())
            # Lemmas made only of digits/punctuation clean down to "", which
            # would otherwise be offered as a "synonym" and blank out a word.
            if candidate:
                synonyms.add(candidate)
    synonyms.discard(word)
    # Cleaned lemmas are lower-case, so also drop the lower-cased query word.
    synonyms.discard(word.lower())
    # Sorted so the result order does not depend on set/hash ordering.
    return sorted(synonyms)

In [17]:
# Read the semi-cleaned LIAR dataset and discard the 'Unnamed: 0' column
# (presumably an index that was written into the CSV -- confirm).
file_path = "../../Augmentation Data/semi_cleaned/LIAR.csv"
df_input = pd.read_csv(file_path).drop(columns=['Unnamed: 0'])
df_input.head()

Unnamed: 0,clean_text,clean_title,label
0,says the annies list political group supports ...,says the annies list political group supports ...,0
1,when did the decline of coal start? it started...,when did the decline of coal start? it started...,1
2,"hillary clinton agrees with john mccain ""by vo...","hillary clinton agrees with john mccain ""by vo...",1
3,health care reform legislation is likely to ma...,health care reform legislation is likely to ma...,0
4,the economic turnaround started at the end of ...,the economic turnaround started at the end of ...,1


In [18]:
def calculate_stats(beginning, reset, now, i, trans_count, unchanged, num):
    """Print progress statistics and a remaining-time estimate for a loop.

    Args:
        beginning: Timestamp (seconds) at which the whole run started.
        reset: Timestamp (seconds) of the previous statistics report.
        now: Current timestamp (seconds).
        i: Zero-based index of the item that was just processed.
        trans_count: Number of items processed successfully so far.
        unchanged: Number of items left unchanged so far.
        num: Total number of items in the run.

    Returns:
        None. The statistics are written to standard output.
    """
    processed = i + 1  # i is zero-based, so i + 1 items are finished
    elapsed = now - beginning
    per = elapsed / processed
    # Only the items that are still outstanding contribute to the estimate.
    mega_time = per * (num - processed)
    print("Stats after {} of {} items".format(processed, num))
    print("Total translated = {}".format(trans_count))
    print("Total unchanged = {}".format(unchanged))
    print("Overall time elapsed = {} seconds".format(elapsed))
    print("Overall time since last calc = {} seconds".format(now - reset))
    print("estimated time left = {} seconds".format(mega_time))
    print()

In [21]:
# Tunable settings for this augmentation run.
SR_FRACTION = 0.3   # share of each text's tokens targeted for replacement
STATS_EVERY = 500   # print progress statistics after this many rows
RANDOM_SEED = 42

# Seed the RNG so the random draws are repeatable from run to run. (Full
# reproducibility also needs the helper functions to iterate in a
# deterministic order.)
random.seed(RANDOM_SEED)

df_output = df_input.copy()
num = df_input.shape[0]
unchanged = 0
sr_count = 0
augmented_texts = []   # one entry per row, in positional order
failed_rows = []       # (row position, error repr) for rows left unchanged

beginning = time.time()
reset = beginning
for i in range(num):
    original_text = df_input['clean_text'].iloc[i]
    try:
        # Inside the try so that a non-string value (e.g. NaN) is counted as
        # "unchanged" instead of aborting the whole run.
        og_text = original_text.split(' ')
        # Always ask for at least one replacement, even on very short texts.
        n = max(1, int(len(og_text) * SR_FRACTION))
        sr = ' '.join(synonym_replacement(og_text, n))
        augmented_texts.append(sr)
        sr_count += 1
    except Exception as exc:  # not a bare except: Ctrl-C must still interrupt
        augmented_texts.append(original_text)
        failed_rows.append((i, repr(exc)))
        unchanged += 1
    if (i + 1) % STATS_EVERY == 0:
        calculate_stats(beginning, reset, time.time(), i, sr_count, unchanged, num)
        reset = time.time()

# Assign the whole column at once: positional, independent of the index
# labels, and leaves clean_title / label exactly as copied from df_input.
df_output['clean_text'] = augmented_texts

end = time.time()

Stats every 100 items
Total translated = 500
Total unchanged = 0
Overall time elapsed = 1.8068389892578125 seconds
Overall time since last calc = 1.8068389892578125 seconds
estimated time left = 44.41932971191406 seconds

Stats every 100 items
Total translated = 1000
Total unchanged = 0
Overall time elapsed = 3.82951283454895 seconds
Overall time since last calc = 2.022477626800537 seconds
estimated time left = 45.15761534500122 seconds

Stats every 100 items
Total translated = 1500
Total unchanged = 0
Overall time elapsed = 5.749829053878784 seconds
Overall time since last calc = 1.9201159477233887 seconds
estimated time left = 43.28471311759949 seconds

Stats every 100 items
Total translated = 2000
Total unchanged = 0
Overall time elapsed = 7.718814134597778 seconds
Overall time since last calc = 1.9687809944152832 seconds
estimated time left = 41.65072107028961 seconds

Stats every 100 items
Total translated = 2500
Total unchanged = 0
Overall time elapsed = 9.85198712348938 seconds


In [23]:
# Show the first five texts before and after synonym replacement.
for i in range(5):
    before = df_input['clean_text'].iloc[i]
    after = df_output['clean_text'].iloc[i]
    # Trailing "" yields the blank separator line between samples.
    print("Original", before, "Synonym Replaced", after, "", sep="\n")

Original
says the annies list political group supports third-trimester abortions on demand.
Synonym Replaced
enunciate the annies listing political group supports third-trimester miscarriage on demand.

Original
when did the decline of coal start? it started when natural gas took off that started to begin in (president george w.) bushs administration.
Synonym Replaced
when did the decline of char start? it come out when instinctive brag subscribe off that come out to commence in (president george ii w.) bushs administration.

Original
hillary clinton agrees with john mccain "by voting to give george bush the benefit of the doubt on iran."
Synonym Replaced
sir edmund percival hillary dewitt clinton concord with king john mccain "by vote to give george bush the benefit of the doubt on iran."

Original
health care reform legislation is likely to mandate free sex change surgeries.
Synonym Replaced
health care reclaim legislation is likely to mandate innocent sex transfer surgeries.

Origin

In [24]:
# Persist the augmented dataset; the DataFrame index is written as the first
# (unnamed) CSV column because to_csv is called with its default index=True.
output_path = "../../Augmentation Data/LIAR_SR.csv"
df_output.to_csv(output_path)