### Mount Google Drive to runtime

In [None]:
from google.colab import drive
from os.path import join

# Directory inside the Colab runtime where Google Drive gets mounted
ROOT = '/content/drive'

# Project workspace folder on Google Drive (path below the mount point)
PROJECT_PATH = 'My Drive/Github'

# Mount Google Drive on the runtime; on first use this prompts for an
# authorization code before the Drive contents become visible under ROOT
drive.mount(ROOT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


### Install dependencies

In [None]:
# Install the third-party packages this notebook relies on.
# NOTE(review): versions are not pinned; the recorded output shows
# googletrans 3.0.0, textaugment 1.3.2 and langdetect 1.0.8 being installed,
# consider pinning to those so a re-run resolves the same versions.
!pip install nltk gensim textblob googletrans textaugment langdetect

!pip install wget
# Disabled: download of the GoogleNews word2vec vectors archive
#!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

Collecting googletrans
  Downloading https://files.pythonhosted.org/packages/71/3a/3b19effdd4c03958b90f40fe01c93de6d5280e03843cc5adf6956bfc9512/googletrans-3.0.0.tar.gz
Collecting textaugment
  Downloading https://files.pythonhosted.org/packages/94/50/92c0ee2fd17132709a4bce114d49cc68ecc75697596fa1bce04f047258bc/textaugment-1.3.2-py3-none-any.whl
Collecting langdetect
[?25l  Downloading https://files.pythonhosted.org/packages/56/a3/8407c1e62d5980188b4acc45ef3d94b933d14a2ebc9ef3505f22cf772570/langdetect-1.0.8.tar.gz (981kB)
[K     |████████████████████████████████| 983kB 5.0MB/s 
Collecting httpx==0.13.3
[?25l  Downloading https://files.pythonhosted.org/packages/54/b4/698b284c6aed4d7c2b4fe3ba5df1fcf6093612423797e76fbb24890dd22f/httpx-0.13.3-py3-none-any.whl (55kB)
[K     |████████████████████████████████| 61kB 6.4MB/s 
Collecting hstspreload
[?25l  Downloading https://files.pythonhosted.org/packages/d5/b9/a183078ac6eef7c65ff97ee3477616504bb377a2939613af595b97cbaac3/hstspreload-2020.

### Import packages

In [None]:
#import nltk
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

#import gensim
#model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

#from textaugment import Translate
from googletrans import Translator
from langdetect import detect

import pandas as pd
import re
import random
import time

# Pandas display configuration, applied in one pass from a lookup table
_displayOptions = {
    'display.max_rows': None,            # no limit on the number of rows shown
    'display.max_columns': None,         # no limit on the number of columns shown
    'display.max_colwidth': 120,         # cap each cell at 120 characters
    'display.width': None,
    'display.expand_frame_repr': False,  # Disable wrapping
}
for _optionName, _optionValue in _displayOptions.items():
    pd.set_option(_optionName, _optionValue)

### Main

In [None]:
# Pool of intermediate languages that round-trip translation picks from.
# 'zh-CN' and 'zh-TW' are left out on purpose: for some reason they do not work.
languages = (
    'af sq ar az eu bn be bg ca hr '
    'cs da nl eo et tl fi fr gl ka de '
    'el gu ht iw hi hu is id ga it ja kn ko la '
    'lv lt mk ms mt no fa pl pt ro ru sr sk sl '
    'es sw sv ta te th tr uk ur vi cy yi'
).split()

# Single googletrans client shared by all translation calls in augmentText
translator = Translator()
                
def cleanText(text):
    """Return ``text`` as a string without hyperlinks, newlines, tabs and @'s.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.
            Missing values (``None`` or a float NaN) are treated as empty.

    Returns:
        The cleaned string. Hyperlinks (anything starting with ``http`` up to
        the next whitespace) are removed, newlines and tabs are replaced by a
        single space each, and ``@`` characters are dropped. Missing values
        yield ``''``.
    """
    # str(None) / str(nan) would otherwise inject the literal words 'None' /
    # 'nan' into the dataset; NaN is the only float that is unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    newText = re.sub(r"http\S+", "", str(text))

    # Handle both the escaped two-character spellings (backslash + n / t) and
    # the real control characters.
    for unwanted in (r'\n', r'\t', '\n', '\t'):
        newText = newText.replace(unwanted, ' ')

    return newText.replace('@', '')


def augmentText(textList, translationDepth=3, maxTokens=250, maxRounds=60):
    """Augment a list of texts via repeated round-trip translation (RTT).

    Every round translates the current texts into a randomly chosen language
    from ``languages`` and back into English using the shared ``translator``.
    A round is discarded when language detection reports a non-English result
    for any text after the first one, or when any error occurs.

    Args:
        textList: List of strings to augment. The list itself is not modified.
        translationDepth: Number of successful rounds to apply.
        maxTokens: Texts with more whitespace-separated tokens than this are
            truncated to their first ``maxTokens`` tokens before translation.
        maxRounds: Upper bound on attempted rounds (successful or not).

    Returns:
        A list of strings with the same length as ``textList``. If no round
        succeeds, the (truncated) input texts are returned.
    """
    # Truncate into a new list so the caller's list is never mutated
    truncatedList = []
    for text in textList:
        textTokens = text.split()
        if len(textTokens) > maxTokens:
            truncatedList.append(' '.join(textTokens[:maxTokens]))
        else:
            truncatedList.append(text)

    # Nothing to translate: skip the rounds instead of issuing empty requests
    if not truncatedList:
        return truncatedList

    augmentedList = truncatedList

    # Continue to apply RTT until:
    # the translation depth is reached and the text is different from the original
    # or if maxRounds rounds of RTT have been attempted
    roundCounter = 0
    depthCounter = 0
    while ((depthCounter < translationDepth or augmentedList == truncatedList)
           and roundCounter < maxRounds):
        destLanguage = random.choice(languages)
        roundCounter += 1

        try:
            forwardList = translator.translate(augmentedList, dest=destLanguage)
            backwardList = translator.translate(
                [translated.text for translated in forwardList], dest='en')

            candidateList = [translated.text for translated in backwardList]
            print(candidateList)
            print(depthCounter)

            # Discard translation if the leading and reply comments failed to
            # translate back to English (the first entry, the title, is detected
            # too but its language is not checked)
            detectedLanguages = [detect(candidate) for candidate in candidateList]
            if any(lang != 'en' for lang in detectedLanguages[1:]):
                print(destLanguage)
                continue

        # Discard translation if an error occurs. Catch Exception rather than
        # everything so that interrupting the kernel still stops the loop.
        except Exception as error:
            print('ERROR ENCOUNTERED')
            print(repr(error))
            print(augmentedList)
            print(destLanguage)
            continue

        # Update augmentedList if translation is successful
        depthCounter += 1
        augmentedList = candidateList

    print(truncatedList)
    print(augmentedList)
    print()
    return augmentedList


def augmentDataframe(df, runs):
    """Build a dataframe of augmented copies of every row in ``df``.

    For each of ``runs`` passes, every row's title, leading comment and (when
    non-empty) reply comments are passed through ``augmentText``; the forum,
    category and post author are copied unchanged.

    Args:
        df: Dataframe with the columns 'Title', 'Category', 'Post Author',
            'Leading Comment', 'Reply Comments' and 'Forum'.
        runs: Number of augmented copies to produce per row.

    Returns:
        A new dataframe with ``runs * len(df)`` rows and the six columns above.
        The original rows are not included.
    """
    dataDict = {'Title': [], 'Category': [], 'Post Author': [], 'Leading Comment': [], 'Reply Comments': [], 'Forum': []}

    # Determine translation depth based on the number of runs required.
    # The middle tier covers 6..50 inclusive; previously runs == 50 fell
    # through to the shallowest depth.
    if runs > 50:
        translationDepth = 5
    elif runs > 5:
        translationDepth = 4
    else:
        translationDepth = 3

    rowCount = df.shape[0]

    # Augment all samples for the required number of runs
    for i in range(runs):
        for j in range(rowCount):
            row = df.iloc[j]

            print('Run: ', i, ' of ', runs)
            print('Topic: ', j, ' of ', rowCount)
            print('Depth: ', translationDepth, ' Augmenting: ', row['Category'])

            # Copy the fields that are not augmented
            dataDict['Forum'].append(row['Forum'])
            dataDict['Category'].append(row['Category'])
            dataDict['Post Author'].append(row['Post Author'])

            # Augment the title, leading comment and reply comments via RTT;
            # reply comments are only sent for translation when present
            hasReplies = row['Reply Comments'] != ''
            textList = [row['Title'], row['Leading Comment']]
            if hasReplies:
                textList.append(row['Reply Comments'])

            augmentedList = augmentText(textList, translationDepth)

            # Append augmented data
            dataDict['Title'].append(augmentedList[0])
            dataDict['Leading Comment'].append(augmentedList[1])
            dataDict['Reply Comments'].append(augmentedList[2] if hasReplies else '')

    return pd.DataFrame(dataDict)



# Folder on the mounted Drive that holds the input and output CSV files
DATASET_DIR = '/content/drive/My Drive/Github/mlteam4/datasets'

# Categories with fewer samples than this get augmented
TARGET_SAMPLES = 198

# Read in Flowster and Amazon datasets as Pandas dataframes
flowsterDF = pd.read_csv(DATASET_DIR + '/Flowster_Topic_Attributes_20200609181520.csv')
amazonDF = pd.read_csv(DATASET_DIR + '/amazon_scraped_data.csv')

# Extract desired Amazon columns. The explicit copy makes this an independent
# frame, so adding the 'Forum' column below no longer triggers pandas'
# SettingWithCopyWarning.
newAmazonDF = amazonDF[['Title', 'Category', 'Post Author', 'Leading Comment', 'Reply Comments']].copy()

# Extract desired Flowster columns and rename them to match the Amazon frame
newFlowsterDF = flowsterDF[['Topic Title', 'Category', 'Author', 'Leading Comment', 'Other Comments']]
newFlowsterDF = newFlowsterDF.rename(columns={'Topic Title':'Title', 'Author':'Post Author', 'Other Comments':'Reply Comments'})

# Remove the last row since the category only has one topic
#newFlowsterDF = newFlowsterDF.drop(index=260)

# Add new 'Forum' column to dataframes
newFlowsterDF['Forum'] = 'Flowster'
newAmazonDF['Forum'] = 'Amazon'

# Merge new dataframes (pd.concat replaces the deprecated DataFrame.append)
mergedDF = pd.concat([newFlowsterDF, newAmazonDF], ignore_index=True)

# Reply comments are stored as the text of a Python list of quoted strings:
# join the items with spaces, turn remaining straight quotes into ’ and strip
# the surrounding brackets/quotes. Non-string cells (e.g. NaN) become ''.
mergedDF['Reply Comments'] = mergedDF['Reply Comments'].apply(
    lambda x : ' '.join(x.split("', '")).replace("'", "’").strip('[]’') if isinstance(x, str) else '')

# Clean all text
for column in ['Title', 'Category', 'Post Author', 'Leading Comment', 'Reply Comments']:
    mergedDF[column] = mergedDF[column].apply(cleanText)

print(mergedDF['Category'].value_counts())

# Augment categories with fewer than TARGET_SAMPLES samples.
# NOTE(review): every run augments the whole category, so a category with n
# samples ends up with n + (TARGET_SAMPLES // n) * n rows, which can exceed
# TARGET_SAMPLES; confirm this is the intended balance.
augmentedFrames = []
for _, categoryDF in mergedDF.groupby('Category'):
    sampleCount = categoryDF.shape[0]
    if sampleCount < TARGET_SAMPLES:
        runs = TARGET_SAMPLES // sampleCount
        augmentedFrames.append(augmentDataframe(categoryDF, runs))

# Concatenate once at the end instead of growing the frame inside the loop
mergedDF = pd.concat([mergedDF] + augmentedFrames, ignore_index=True)

# Save final dataframe to CSV
mergedDF.to_csv(DATASET_DIR + '/augmented.csv')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2
['The best tools for sending email to Amazon customers?', 'We all know how powerful it can be to send email to our Amazon customers, for reasons like: Ask for reviews Offering promotions for new products Managing various customer support problems Just curious about the tools you use to automate this?', "We are switching to Helium 10. Since we are already using all their other great tools, this is a great value. But we have used many other devices that have only been emailed over the past, most recently Feedback Five. We used to link Amazon Seller Central to MailChimp and I found this feature really easy. We also tried to connect eSputnik with Zapier and Amazon Seller Central. This arrangement is more complicated but also flexible. In addition, I like the editor's email to eSputnik, this builder is easy to use and intuitive."]
3
['Best tools for emailing Amazon customers?', 'We all know how powerful it can be to be able 