# Data Processing

## Imports

In [1]:
import json
import pandas as pd

import nltk
from pycorenlp import StanfordCoreNLP

## Raw Data

In [2]:
def load(tsv_file):
    """Read a headerless, tab-separated file into a DataFrame.

    ``DataFrame.from_csv`` was deprecated and later removed from pandas, so
    the file is read with ``pd.read_csv`` using the same arguments.

    Args:
        tsv_file: Path (or file-like object) of the tab-separated file.

    Returns:
        A DataFrame whose columns are labelled 0, 1, ... in file order, with
        a default integer row index.
    """
    # index_col=False keeps the first column as data instead of as the index.
    return pd.read_csv(tsv_file, sep="\t", header=None, index_col=False)

# Read the three labelled review files, in the same order as the names on the left.
amazon, imdb, yelp = (
    load(path)
    for path in (
        "Data/amazon_cells_labelled.tsv",
        "Data/imdb_labelled.tsv",
        "Data/yelp_labelled.tsv",
    )
)

## Subject and Opinion

In [3]:
# Extract (subject, opinion, sentiment) triplets from review sentences.

# The client below talks to a Stanford CoreNLP server that must already be
# listening on localhost:9000. Start it first, and remember to shut it down
# when done:
# java -mx5g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -timeout 10000
nlp = StanfordCoreNLP('http://localhost:9000')

def get_subject_opinion_sentiment(sentence, sentiment):
    """Extract [subject, opinion, sentiment] triplets from a piece of review text.

    The text is dependency-parsed by the CoreNLP server behind the
    module-level ``nlp`` client. A noun/adjective pair becomes a triplet when
    either

    * the adjective depends on the noun through a relation whose name
      contains ``mod`` or ``dep``, or
    * the noun is the ``nsubj`` dependent of the adjective.

    When a negation relation is attached to an opinion word, the negating
    word is prefixed to that opinion (e.g. "not good").

    Args:
        sentence: Raw text to analyse.
        sentiment: Label copied unchanged into every returned triplet.

    Returns:
        A list of ``[subject, opinion, sentiment]`` lists. The list is empty
        when nothing matches or when the server reply cannot be used.
    """
    # Commas and periods are blanked out and the text is lower-cased before
    # it is sent to the parser.
    cleaned = sentence.translate(str.maketrans(',.', '  ')).lower()
    res = nlp.annotate(cleaned, properties={
        'annotators': 'depparse',
        'outputFormat': 'json',
        'timeout': 100000,
    })

    # annotate() can hand back the raw response text instead of a dict when
    # the server reply is not valid JSON (e.g. an error or timeout message);
    # indexing that string with "sentences" would raise a TypeError.
    if not isinstance(res, dict):
        return []

    def pos(word):
        # Part-of-speech tag of a single word, tagged in isolation.
        return nltk.pos_tag([word])[0][1]

    triplets = []

    # Walk every parsed sentence: "!" and "?" still split the text, and
    # looking only at the first sentence would drop everything after them.
    for parsed in res.get("sentences") or []:
        found = []
        negations = []

        for edge in parsed.get("basicDependencies") or []:
            relation = edge["dep"]
            governor = edge["governorGloss"]
            dependent = edge["dependentGloss"]

            if 'neg' in relation:
                # Remember (negated word, negating word) for the pass below.
                negations.append((governor, dependent))
            elif 'mod' in relation or 'dep' in relation:
                # Governor is a noun and dependent is an adjective.
                if 'NN' in pos(governor) and 'JJ' in pos(dependent):
                    found.append([governor, dependent, sentiment])
            elif 'nsubj' in relation:
                # Governor is an adjective and dependent is a noun.
                if 'JJ' in pos(governor) and 'NN' in pos(dependent):
                    found.append([dependent, governor, sentiment])

        # Prefix the negating word to every opinion it governs, matching by
        # the opinion's surface form within the same sentence.
        for negated_word, negator in negations:
            for entry in found:
                if entry[1] == negated_word:
                    entry[1] = negator + " " + entry[1]

        triplets.extend(found)

    return triplets

## Bubble Chart Data

In [4]:
def write(data, json_file, N = 40):
    """Count positive/negative mentions per subject and dump them as JSON.

    Args:
        data: Frame whose column 0 holds the sentences and column 1 the
            sentiment labels; each label is used as an index into a
            two-slot [negative, positive] counter.
        json_file: Path of the JSON file to (over)write.
        N: Keep only the N most frequent subjects; values <= 0 keep all.
    """
    # Convert both columns up front so a bad value fails before any parsing.
    texts  = list(map(str, data[0].values.tolist()))
    labels = list(map(int, data[1].values.tolist()))

    # subject -> [negative count, positive count]
    tally = {}
    for text, label in zip(texts, labels):
        for triplet in get_subject_opinion_sentiment(text, label):
            tally.setdefault(triplet[0], [0, 0])[label] += 1

    bubbles = [
        {
            "subject"    : subject,
            "count"      : counts[0] + counts[1],
            "sentiments" : [
                {"sentiment" : "negative", "count" : counts[0]},
                {"sentiment" : "positive", "count" : counts[1]}
            ]
        }
        for subject, counts in tally.items()
    ]

    # Most-mentioned subjects first; the sort is stable, so ties keep the
    # order in which the subjects were first seen.
    bubbles.sort(key=lambda entry: entry["count"], reverse=True)

    if N > 0:
        limit = min(len(bubbles), N)
        bubbles = bubbles[0:limit]

    with open(json_file, 'w') as handle:
        json.dump(bubbles, handle)

# Export the 40 most-mentioned subjects of each corpus for the bubble charts.
for frame, target in (
    (amazon, "Data/amazon_bubble.json"),
    (imdb,   "Data/imdb_bubble.json"),
    (yelp,   "Data/yelp_bubble.json"),
):
    write(frame, target, 40)