# Data Processing

## Imports

In [1]:
import json
import pandas as pd

import nltk
from pycorenlp import StanfordCoreNLP

## Raw Data

In [2]:
def load(tsv_file):
    """Read a tab-separated file into a pandas DataFrame.

    The first row of the file is used as the column header, and no column
    is promoted to the index (``index_col=False``).

    Args:
        tsv_file: Path (or any source ``pd.read_csv`` accepts) of the
            tab-separated file.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    read_options = {
        "sep": "\t",
        "header": 0,
        "index_col": False,
    }
    frame = pd.read_csv(tsv_file, **read_options)
    return frame

# Raw review tables, one DataFrame per tab-separated file under ../Raw.
amazon = load(tsv_file="../Raw/amazon.tsv")
imdb = load(tsv_file="../Raw/imdb.tsv")
yelp = load(tsv_file="../Raw/yelp.tsv")

## Subject and Opinion

In [3]:
# Extract (subject, opinion, sentiment) triplets.

# A Stanford CoreNLP server has to be listening on localhost:9000 before this
# cell is used. Start it first, and remember to shut it down when done:
# java -mx5g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -timeout 10000
# Client handle through which get_subject_opinion_sentiment sends its parse requests.
nlp = StanfordCoreNLP('http://localhost:9000')

def get_subject_opinion_sentiment(sentence, sentiment):
    """Extract [subject, opinion, sentiment] triplets from a review text.

    The text is lower-cased, has its commas and periods replaced by spaces,
    and is dependency-parsed through the module-level ``nlp`` client. Every
    sentence the parser reports is scanned (the text can still be split on
    punctuation that is not stripped, such as ``!`` or ``?``):

    * a ``*mod*`` / ``*dep*`` relation whose governor is tagged as a noun
      and whose dependent is tagged as an adjective yields
      ``[governor, dependent, sentiment]``;
    * an ``nsubj`` relation whose governor is tagged as an adjective and
      whose dependent is tagged as a noun yields
      ``[dependent, governor, sentiment]``;
    * a ``neg`` relation whose governor equals the opinion word of a
      triplet from the same sentence prefixes that opinion with the
      negation word (e.g. ``"not good"``).

    Args:
        sentence: Review text to analyse.
        sentiment: Label copied unchanged into every triplet.

    Returns:
        A list of ``[subject, opinion, sentiment]`` lists, in parse order;
        empty when the parser reports no sentences or nothing matches.

    Raises:
        RuntimeError: If the server reply is not a parsed JSON object.
    """
    # Commas and periods are blanked out before the text is sent to the parser.
    cleaned = sentence.translate(str.maketrans(',.', '  ')).lower()

    # Stanford Sentence Parser.
    res = nlp.annotate(cleaned,
                       properties={
            'annotators': 'depparse',
            'outputFormat': 'json',
            'timeout': 100000,
        })

    # pycorenlp returns the raw response text when the reply cannot be decoded
    # as JSON (e.g. a server-side timeout message); indexing that string would
    # fail with an unhelpful TypeError, so report the actual reply instead.
    if not isinstance(res, dict):
        raise RuntimeError("CoreNLP did not return parsed JSON: %r" % (res,))

    # Words are tagged one at a time, out of context; remember each tag so a
    # word that appears in several dependencies is only tagged once.
    tag_cache = {}

    def tag_of(word):
        if word not in tag_cache:
            tag_cache[word] = nltk.pos_tag([word])[0][1]
        return tag_cache[word]

    # [[Subject, Opinion, Sentiment], ...]
    triplet = []

    # Walk every parsed sentence, not just the first one.
    for parsed in res["sentences"]:
        current = []
        negation = []

        for x in parsed["basicDependencies"]:
            relation = x["dep"]
            governor = x["governorGloss"]
            dependent = x["dependentGloss"]

            if 'neg' in relation:
                # Negation found: remember (negated word, negation word).
                negation.append((governor, dependent))
            elif 'mod' in relation or 'dep' in relation:
                # Modifier found: governor is a noun and dependent is an adjective.
                if 'NN' in tag_of(governor) and 'JJ' in tag_of(dependent):
                    current.append([governor, dependent, sentiment])
            elif 'nsubj' in relation:
                # Subject relation found: governor is an adjective and dependent is a noun.
                if 'JJ' in tag_of(governor) and 'NN' in tag_of(dependent):
                    current.append([dependent, governor, sentiment])

        # Add negations to the opinions of this sentence only, so a negation
        # cannot leak onto the same word in a different sentence.
        for negated, negator in negation:
            for entry in current:
                if negated == entry[1]:
                    entry[1] = negator + " " + entry[1]

        triplet.extend(current)

    return triplet

## Tree Chart Data

In [4]:
def write(data, json_file, N):
    """Aggregate opinion counts per subject and dump the top subjects as JSON.

    Every row of ``data`` is passed through ``get_subject_opinion_sentiment``;
    the resulting triplets are tallied as subject -> opinion ->
    ``[negative, positive]`` counts, with the sentiment label used as the
    index into that pair. Subjects are ranked by their total count
    (negative + positive, highest first), the first ``N`` are kept, and they
    are written under a ``"root"`` node whose counts are the sums over the
    kept subjects.

    Args:
        data: DataFrame with a ``'review'`` column (converted with ``str``)
            and a ``'sentiment'`` column (converted with ``int``).
        json_file: Path of the JSON file to write.
        N: Maximum number of subject nodes kept below the root.
    """
    reviews = [str(text) for text in data['review'].values.tolist()]
    labels = [int(label) for label in data['sentiment'].values.tolist()]

    # subject -> opinion -> [Negative, Positive]
    tallies = {}
    for review, label in zip(reviews, labels):
        for subject, opinion, polarity in get_subject_opinion_sentiment(review, label):
            opinion_counts = tallies.setdefault(subject, {})
            pair = opinion_counts.setdefault(opinion, [0, 0])
            pair[polarity] += 1

    # One node per subject; its counts are the sums over its opinion children.
    nodes = []
    for subject, opinion_counts in tallies.items():
        children = [
            {"name": opinion, "sentiment": pair}
            for opinion, pair in opinion_counts.items()
        ]
        subject_total = [
            sum(pair[0] for pair in opinion_counts.values()),
            sum(pair[1] for pair in opinion_counts.values()),
        ]
        nodes.append({"name": subject, "sentiment": subject_total, "children": children})

    # Most-mentioned subjects first (stable for ties), then keep at most N.
    nodes.sort(key=lambda node: node["sentiment"][0] + node["sentiment"][1], reverse=True)
    kept = nodes[0:min(len(nodes), N)]

    tree = {
        "name": "root",
        "sentiment": [
            sum(node["sentiment"][0] for node in kept),
            sum(node["sentiment"][1] for node in kept),
        ],
        "children": kept
    }

    with open(json_file, 'w') as handle:
        json.dump(tree, handle)

# Export the five most-mentioned subjects of each dataset as a tree JSON file.
write(data=amazon, json_file="amazon_tree.json", N=5)
write(data=imdb, json_file="imdb_tree.json", N=5)
write(data=yelp, json_file="yelp_tree.json", N=5)