# Data Processing

## Imports

In [1]:
import operator
import string
from collections import Counter

import nltk
import pandas as pd
from nltk.corpus import stopwords

## Raw Data

In [2]:
def load(tsv_file):
    """Read a tab-separated file into a DataFrame.

    The first line of the file is used as the column header and no column
    is promoted to the index.
    """
    read_options = {"sep": "\t", "header": 0, "index_col": False}
    frame = pd.read_csv(tsv_file, **read_options)
    return frame

# Load the three raw review sets, in order: Amazon, IMDb, Yelp.
amazon, imdb, yelp = [
    load(path)
    for path in ("../Raw/amazon.tsv", "../Raw/imdb.tsv", "../Raw/yelp.tsv")
]

## Word Cloud Data

In [3]:
# NOTE(review): this rebinds the builtin name `dict` for the rest of the
# session, so any later `dict(...)` call would hit this mapping instead.
# NOTE(review): the key "imbd" does not match the "imdb" spelling used for the
# dataset elsewhere in this notebook -- likely a typo; confirm before relying
# on it. The mapping is not read by any code visible in this section.
dict = {"amazon":{}, "imbd":{}, "yelp":{}}
# Translation table for str.translate: the third argument lists characters to
# delete (all of string.punctuation); the first two map a space to itself,
# which leaves spaces unchanged.
punctuation_removal = str.maketrans(' ', ' ', string.punctuation)

def write(data, name, csv_file, limit=400):
    """Write the most frequent words of a review set to a CSV file.

    Each value in ``data["review"]`` is lower-cased, stripped of the
    characters in ``string.punctuation`` (via the module-level
    ``punctuation_removal`` table) and tokenized with
    ``nltk.word_tokenize``. Stopwords and all-digit tokens are dropped and
    the remaining tokens are counted.

    The output starts with a ``source,word,freq`` header followed by one row
    per word, most frequent first. Words with equal counts keep the order in
    which they were first seen.

    Args:
        data: DataFrame with a ``review`` column of text. Missing values
            (NaN) are skipped.
        name: Source label written in the first column of every row.
        csv_file: Path of the CSV file to create; an existing file is
            overwritten.
        limit: Maximum number of word rows to write. Defaults to 400;
            ``None`` writes every counted word.
    """
    # Build the stopword lookup once per call. Evaluating stopwords.words()
    # for every token re-creates the list each time and scans it linearly.
    # NOTE(review): words() is called without a language, which presumably
    # returns the stopwords of every language in the corpus -- confirm
    # whether only English was intended.
    stop_words = set(stopwords.words())

    counts = Counter()
    for review in data["review"].dropna():
        cleaned = review.lower().translate(punctuation_removal)
        counts.update(
            token
            for token in nltk.word_tokenize(cleaned)
            if token not in stop_words and not token.isdigit()
        )

    # sorted() is stable, so ties keep first-seen (insertion) order.
    ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)

    # Explicit UTF-8 so non-ASCII tokens do not depend on the platform's
    # default encoding.
    with open(csv_file, "w", encoding="utf-8") as file:
        file.write("source,word,freq\n")
        file.writelines(
            "{},{},{}\n".format(name, word, freq)
            for word, freq in ranked[:limit]
        )

# Emit one word-frequency CSV per review source.
for cloud_args in (
    (amazon, "amazon", "amazon_cloud.csv"),
    (imdb, "imdb", "imdb_cloud.csv"),
    (yelp, "yelp", "yelp_cloud.csv"),
):
    write(*cloud_args)