In [25]:
import pandas as pd
import random
import csv
from nltk.corpus import wordnet, stopwords
import time


In [26]:
# English stop words from NLTK's `stopwords` corpus, held in a set so that
# synonym_replacement() can cheaply exclude them from the replacement candidates.
stop_words = set(stopwords.words('english'))
def synonym_replacement(words, n):
    """Replace up to ``n`` distinct non-stop-words of ``words`` with a synonym.

    Every occurrence of a chosen word is swapped for the same randomly picked
    synonym. Words for which ``get_synonyms`` returns nothing are skipped and
    do not count towards ``n``.

    Args:
        words: list of string tokens; it is not modified.
        n: maximum number of distinct words to replace. ``n <= 0`` replaces
            nothing.

    Returns:
        A new list of tokens. Multi-word synonyms are split on single spaces,
        so the result can be longer than the input.
    """
    new_words = words.copy()
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # shuffle below starts from a deterministic sequence (a set's order varies
    # with string hashing, which would defeat random.seed()).
    candidates = list(dict.fromkeys(word for word in words if word not in stop_words))
    random.shuffle(candidates)
    num_replaced = 0
    for candidate in candidates:
        # Check the budget *before* replacing so that n == 0 leaves the text untouched.
        if num_replaced >= n:
            break
        synonyms = get_synonyms(candidate)
        if not synonyms:
            continue
        synonym = random.choice(list(synonyms))
        new_words = [synonym if word == candidate else word for word in new_words]
        num_replaced += 1

    # A synonym may be a phrase ("very large"); re-tokenise so that every
    # element of the returned list is a single space-free token.
    return ' '.join(new_words).split(' ')

def get_synonyms(word):
    """Return the cleaned WordNet synonyms of ``word``.

    Each lemma name of each synset of ``word`` is lower-cased, has ``_`` and
    ``-`` turned into spaces, and is stripped of every character other than
    ``a``-``z`` and the space. Whitespace is then normalised, and candidates
    that end up empty (e.g. purely numeric lemmas such as "15") are dropped so
    that a caller can never replace a word with an empty string.

    Args:
        word: the token to look up.

    Returns:
        A sorted list of unique synonym strings, never containing ``word``
        itself or the empty string. Sorting keeps the order stable between
        runs, so a seeded ``random.choice`` over the result is reproducible.
    """
    allowed_chars = set(' qwertyuiopasdfghjklzxcvbnm')
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace("_", " ").replace("-", " ").lower()
            candidate = "".join(char for char in candidate if char in allowed_chars)
            # Removing characters can leave leading/trailing/double spaces behind.
            candidate = " ".join(candidate.split())
            if candidate:
                synonyms.add(candidate)
    synonyms.discard(word)
    return sorted(synonyms)

In [27]:
# Source data: the semi-cleaned Kaggle set that will be augmented.
file_path = "../../Augmentation Data/semi_cleaned/Kaggle2_Mixed.csv"
# The file carries an 'Unnamed: 0' column (presumably an index written by an
# earlier to_csv call -- TODO confirm); drop it so only the data columns remain.
df_input = pd.read_csv(file_path).drop(columns=['Unnamed: 0'])
df_input.head()

Unnamed: 0,clean_text,clean_title,label
0,house dem aide: we didnt even see comeys lette...,house dem aide: we didnt even see comeys lette...,0
1,ever get the feeling your life circles the rou...,"flynn: hillary clinton, big woman on campus - ...",1
2,"why the truth might get you fired october 29, ...",why the truth might get you fired,0
3,videos 15 civilians killed in single us airstr...,15 civilians killed in single us airstrike hav...,0
4,print \nan iranian woman has been sentenced to...,iranian woman jailed for fictional unpublished...,0


In [28]:
def calculate_stats(beginning, reset, now, i, trans_count, unchanged, num):
    """Print a progress report for the augmentation loop.

    Args:
        beginning: timestamp (seconds) at which the whole run started.
        reset: timestamp (seconds) of the previous report.
        now: current timestamp (seconds).
        i: zero-based index of the item that was just finished.
        trans_count: number of items augmented successfully so far.
        unchanged: number of items left unchanged so far.
        num: total number of items in the run.

    The remaining time is extrapolated from the average time per finished
    item. ``i + 1`` items are finished when this is called, so ``num - (i + 1)``
    items remain.
    """
    done = i + 1
    elapsed = now - beginning
    per = elapsed / done
    mega_time = per * (num - done)
    # Report the actual position instead of a hard-coded interval, which
    # went stale as soon as the caller changed how often it reports.
    print("Stats after {} items".format(done))
    print("Total translated = {}".format(trans_count))
    print("Total unchanged = {}".format(unchanged))
    print("Overall time elapsed = {} seconds".format(elapsed))
    print("Overall time since last calc = {} seconds".format(now - reset))
    print("estimated time left = {} seconds".format(mega_time))
    print()

In [29]:
# Sanity check: (rows, columns) of the loaded frame after dropping 'Unnamed: 0'.
df_input.shape

(20127, 3)

In [30]:
# Fraction of each text's tokens used as the replacement budget `n`.
REPLACE_FRACTION = 0.3
# Print a progress report after every STATS_EVERY processed rows.
STATS_EVERY = 500

df_output = df_input.copy()
num = df_input.shape[0]
unchanged = 0
sr_count = 0
# Collect the new texts positionally and assign the column once at the end;
# this avoids rewriting whole rows through label-based .loc inside the loop.
augmented_texts = []

beginning = time.time()
reset = beginning
for i in range(num):
    original_text = df_input['clean_text'].iloc[i]
    try:
        og_text = original_text.split(' ')
        n = int(len(og_text) * REPLACE_FRACTION)
        augmented_texts.append(' '.join(synonym_replacement(og_text, n)))
        sr_count += 1
    except Exception:
        # Best effort: keep the original text for rows that cannot be augmented
        # (e.g. a non-string value). Catching Exception rather than using a bare
        # `except:` lets KeyboardInterrupt stop this long-running loop.
        augmented_texts.append(original_text)
        unchanged += 1
    if (i + 1) % STATS_EVERY == 0:
        calculate_stats(beginning, reset, time.time(), i, sr_count, unchanged, num)
        reset = time.time()

df_output['clean_text'] = augmented_texts
end = time.time()

Stats every 100 items
Total translated = 500
Total unchanged = 0
Overall time elapsed = 24.316727876663208 seconds
Overall time since last calc = 24.316727876663208 seconds
estimated time left = 954.5774695262909 seconds

Stats every 100 items
Total translated = 1000
Total unchanged = 0
Overall time elapsed = 45.27107501029968 seconds
Overall time since last calc = 20.95409393310547 seconds
estimated time left = 865.9451227970123 seconds

Stats every 100 items
Total translated = 1500
Total unchanged = 0
Overall time elapsed = 63.76449394226074 seconds
Overall time since last calc = 18.49316668510437 seconds
estimated time left = 791.869995437622 seconds

Stats every 100 items
Total translated = 2000
Total unchanged = 0
Overall time elapsed = 84.59725403785706 seconds
Overall time since last calc = 20.832524061203003 seconds
estimated time left = 766.7895105991364 seconds

Stats every 100 items
Total translated = 2500
Total unchanged = 0
Overall time elapsed = 105.28888511657715 seconds

Stats every 100 items
Total translated = 19000
Total unchanged = 0
Overall time elapsed = 752.5265791416168 seconds
Overall time since last calc = 17.587223052978516 seconds
estimated time left = 44.67631480377599 seconds

Stats every 100 items
Total translated = 19500
Total unchanged = 0
Overall time elapsed = 769.0314540863037 seconds
Overall time since last calc = 16.50464916229248 seconds
estimated time left = 24.766756572625575 seconds

Stats every 100 items
Total translated = 20000
Total unchanged = 0
Overall time elapsed = 787.6201961040497 seconds
Overall time since last calc = 18.588496923446655 seconds
estimated time left = 5.040769255065918 seconds



In [33]:
# Show one row before and after synonym replacement for a visual spot check.
i = 0
print(
    "Original",
    df_input['clean_text'].iloc[i],
    "-------------------------------",
    "Synonym Replaced",
    df_output['clean_text'].iloc[i],
    "",
    sep="\n",
)

Original
house dem aide: we didnt even see comeys letter until jason chaffetz tweeted it by darrell lucus on october 30, 2016 subscribe jason chaffetz on the stump in american fork, utah ( image courtesy michael jolley, available under a creative commons-by license) 
with apologies to keith olbermann, there is no doubt who the worst person in the world is this weekfbi director james comey. but according to a house democratic aide, it looks like we also know who the second-worst person is as well. it turns out that when comey sent his now-infamous letter announcing that the fbi was looking into emails that may be related to hillary clintons email server, the ranking democrats on the relevant committees didnt hear about it from comey. they found out via a tweet from one of the republican committee chairmen. 
as we now know, comey notified the republican chairmen and democratic ranking members of the house intelligence, judiciary, and oversight committees that his agency was reviewing ema

In [37]:
# Cut the augmented frame into two halves (the first half gets the smaller
# share when the row count is odd), each re-indexed from 0.
total = df_output.shape[0] // 2
print(total)
df1 = df_output.iloc[:total].reset_index(drop=True)
df2 = df_output.iloc[total:].reset_index(drop=True)
print(df1.shape)
df1.head()

# df_output.to_csv("../../Augmentation Data/Kaggle2_Mixed_SR.csv")

10063
(10063, 3)


Unnamed: 0,clean_text,clean_title,label
0,firm dem aide: we didnt tied learn comeys lett...,house dem aide: we didnt even see comeys lette...,0
1,eer perplex the tactile sensation your livelin...,"flynn: hillary clinton, big woman on campus - ...",1
2,why the accuracy power dumbfound you provoke o...,why the truth might get you fired,0
3,tv civilian wipe out in unmarried the states ...,15 civilians killed in single us airstrike hav...,0
4,print \nan irani womanhood has been sentenced ...,iranian woman jailed for fictional unpublished...,0


In [38]:
# Sanity check: (rows, columns) of the second half produced by the split.
df2.shape

(10064, 3)

In [39]:
# Write each half to its own CSV. to_csv is called with its defaults, so the
# row index is written as the first (unnamed) column of each file.
for frame, destination in (
    (df1, "../../Augmentation Data/Kaggle2_Mixed_SR_a.csv"),
    (df2, "../../Augmentation Data/Kaggle2_Mixed_SR_b.csv"),
):
    frame.to_csv(destination)