# IMPORTS

In [1]:
import json
import pandas as pd
import subprocess as SB
import xml.etree.ElementTree as ET

In [2]:
# Load the review table (tab-separated, UTF-8) and preview its first rows.
# The Review and Shap_json columns are the ones read by the run script below.
data = pd.read_csv('nixon_shap.csv', sep='\t', encoding = 'utf-8')
data.head()

Unnamed: 0.1,Unnamed: 0,Name,Title,Review,Rating,Prediction,Shap_json
0,0,WOLVES,Will not retain proper day of week,I followed the instructions out of box to set ...,1.0,0.60806,"[{""shap"":-0.705659351,""term"":""disappointed"",""o..."
1,1,yvm_22,Not what I expected.,Only 11 and 12 line positions illuminate when ...,2.0,1.897487,"[{""shap"":-0.5358719612,""term"":""missing"",""occur..."
2,2,Amazon Customer,I absolutely love everything about the aesthet...,I absolutely love everything about the aesthet...,3.0,2.859639,"[{""shap"":-0.9677056434,""term"":""broken"",""occurr..."
3,3,Tyler Archibald,Days didn't work at first,When I first opened the box and tried setting ...,4.0,3.842108,"[{""shap"":0.8345123571,""term"":""tue"",""occurrence..."
4,4,A. Velasco,Beautiful watch but glitchy date mechanism,this watch was given to me as a gift and i was...,4.0,4.020531,"[{""shap"":-0.4751183521,""term"":""aligned"",""occur..."


# HELPERS

In [3]:
"""
    Returns the result of the method evaluate(...), which is the middle part of the html
    @text_review: The original text review
    @shap_string The shap (json) information as a string
"""
def getMiddle(text_review, shap_string):
    """Build the middle part of the review html.

    Thin wrapper that forwards both arguments to evaluate(...) and hands
    back its result unchanged.

    @text_review : the original review text
    @shap_string : the shap (json) information as a string
    """
    return evaluate(text_review, shap_string)

In [4]:
"""
    Turns a string into JSON
    @shap_string : The shap (json) information as a string
"""
def parseShap(shap_string):
    """Decode the shap information of one review from its JSON text.

    @shap_string : the shap (json) information as a string
    Returns the Python object json.loads produces for that string.
    """
    decoded = json.loads(shap_string)
    return decoded

In [5]:
"""
    Checks if a word has a shap value
    @list : Shap information
    @term : A term as it appears in the original review text
"""
def find_dictionary(list, term):
    """Look up the shap record that belongs to a token of the review text.

    A record matches the token when its 'term' equals one of:
      * the lower-cased token,
      * the lower-cased token without its last character, if the token ends in "s",
      * the lower-cased token without its last two characters, if the token
        ends in "n't",
      * 'watch', if the lower-cased token is "watches".

    @list : shap information (sequence of records with a 'term' entry)
    @term : a term as it appears in the original review text
    Returns the first matching record, or None when no record matches.
    """
    for entry in list:
        known = entry['term']

        # Exact match, ignoring the casing of the token.
        if known == term.lower():
            return entry
        # Token with a trailing "s" ("Straps" -> "strap").
        if term.endswith('s') and known == term[:-1].lower():
            return entry
        # Negated contraction: only the last two characters are removed.
        if term.endswith("n't") and known == term[:-2].lower():
            return entry
        # Special-cased plural that the trailing-"s" rule does not cover.
        if known == 'watch' and term.lower() == "watches":
            return entry

    return None

In [6]:
"""
    Returns the html code with highlights around 
    @input : the original review text
    @shap : the shap json, containing shap information for the review text
"""
def evaluate(input, shap):
    """Return the review text as html with a highlight around every token
    that has shap information.

    @input : the original review text
    @shap : the shap json (as a string), containing shap information for the review text
    Returns the tokens of the review concatenated in their original order;
    every token that find_dictionary matches to a shap record is replaced by
    the markup invoke_paragraph produces for it, all other tokens are copied
    as they are.
    """
    parts = custom_split(input)

    # The shap json is the same for every token, so it is decoded once here
    # instead of once per token inside the loop.
    records = parseShap(shap)

    pieces = []
    for part in parts:
        record = find_dictionary(records, part)
        if record is not None:
            pieces.append(invoke_paragraph(record, part))
        else:
            pieces.append(part)

    return "".join(pieces)

In [7]:
"""
    Returns the review text split in parts based on special signs
"""
def custom_split(input):
    """Split the review text into word tokens and separator tokens.

    Each of the characters . ? ! , space - " becomes a token of its own; the
    characters between two separators form one word token. Joining the
    returned tokens reproduces the input text exactly.

    @input : the review text
    Returns the list of tokens in their original order (empty for an empty text).
    """
    separators = ('.', '?', '!', ',', ' ', '-', '"')
    result = []
    word = ''

    for char in input:
        if char in separators:
            # A separator closes the word collected so far, if there is one.
            if word != '':
                result.append(word)
                word = ''
            result.append(char)
        else:
            word += char

    # Flush the last word. The check has to be against '' (not ' '): a space is
    # a separator and never ends up in `word`, so comparing with ' ' appended an
    # empty token whenever the text ended with a separator.
    if word != '':
        result.append(word)

    return result

In [8]:
def invoke_paragraph(dictionary, part):
    """Create the highlight markup for one token from its shap record.

    @dictionary : shap record of the token; its 'factor', 'id' and
                  'orientation' entries are read
    @part : the token as it appears in the review text
    Returns the result of highlight_part(...) for these values.
    """
    return highlight_part(
        dictionary['factor'],
        dictionary['id'],
        dictionary['orientation'],
        part,
    )

In [9]:
"""
    Creates a text highlighting and returns the html code for the text highlighting
"""
def highlight_part(shapvalue, identifier, orientation, term):
    """Create a text highlighting and return the html code for it.

    @shapvalue : numeric weight of the term; its absolute value, rounded to
                 two decimals, is written to the data-shap attribute
    @identifier : string added as a css class next to "highlighter"
    @orientation : "positive" or "negative"; any other value makes the
                   function fall back to the sign of shapvalue
    @term : the text placed inside the <span>
    Returns the <span> markup as a string.

    NOTE: identifier and term are inserted into the markup as given, without
    html escaping.
    """
    factor = round(abs(shapvalue), 2)

    # Use the orientation the caller passes in. It used to be overwritten by a
    # sign check on shapvalue, so a "negative" term with a positive weight was
    # labelled "positive". The sign is only the fallback now.
    if orientation not in ("positive", "negative"):
        orientation = "positive" if shapvalue > 0 else "negative"

    return (
        "<span class=\"highlighter " + identifier
        + "\" data-shap=\"" + str(factor)
        + "\" data-orientation=\"" + orientation
        + "\">" + term + "</span>"
    )

In [10]:
def getOrientation(shap_json):
    """Map every term of the shap records to its orientation.

    @shap_json : iterable of shap records with 'term' and 'orientation' entries
    Returns a dict term -> orientation; for a repeated term the last record wins.
    """
    return dict((el['term'], el['orientation']) for el in shap_json)

In [11]:
def getIDs(shap_json):
    """Map every term of the shap records to its id.

    @shap_json : iterable of shap records with 'term' and 'id' entries
    Returns a dict term -> id; for a repeated term the last record wins.
    """
    return dict((el['term'], el['id']) for el in shap_json)

In [12]:
def getFactors(shap_json):
    """Map every term of the shap records to its factor.

    @shap_json : iterable of shap records with "term" and "factor" entries
    Returns a dict term -> factor; for a repeated term the last record wins.
    """
    return dict((el["term"], el["factor"]) for el in shap_json)

In [13]:
def getOccurrence(shap_json):
    """Map every term of the shap records to its occurrence value.

    @shap_json : iterable of shap records with "term" and "occurrence" entries
    Returns a dict term -> occurrence; for a repeated term the last record wins.
    """
    return dict((el["term"], el["occurrence"]) for el in shap_json)

## GENERATE INPUT FILE FOR cloudy.jar

In [14]:
def createInputFile(factors, occurrences, filename):
    """Write the word list used as word-cloud input to <filename>.txt.

    Terms whose occurrence is 0 are skipped. Every other term is written
    int(float(factor) * 100) - 1 times (never fewer than zero times), each
    copy followed by a space, and the group of copies is closed with ". ".

    @factors : dict term -> factor
    @occurrences : dict term -> occurrence; needs an entry for every term in factors
    @filename : path of the output file without the ".txt" extension
    """
    target = filename + ".txt"

    chunks = []
    for term in factors:
        if not (occurrences[term] != 0):
            continue
        scaled = float(factors[term]) * 100
        # range(1, n) yields n - 1 repetitions of the term.
        chunks.append("".join(term + " " for _ in range(1, int(scaled))))
        chunks.append(". ")

    with open(target, "w") as text_file:
        text_file.write("".join(chunks))

## GENERATE SVG

In [15]:
def generateSVG(filename):
    """Run cloudy.jar through java on the word list <filename>.txt.

    @filename : base name of the input file, without the ".txt" extension
    The exit status of the java process is not inspected and nothing is
    returned. Presumably the tool writes <filename>.svg, the file parseSVG
    reads -- TODO confirm against the cloudy.jar options.
    """
    command = ['java', '-jar', 'cloudy.jar', '-w1200', '-h500', '-Lmds', '-ps', '-pg', '-pn', '-O']
    command.append(filename + ".txt")
    SB.call(command)

## PARSE SVG

In [16]:
def parseSVG(filename, ids, orientations):
    """Convert the word cloud in <filename>.svg into clickable svg markup.

    The second child of the svg root is taken as the container of the words.
    Each of its children contributes one <g> element: transform and font-size
    come from the child, x, y and the word from the child's first element.
    The fill colour is green for orientation 'positive', red for 'negative'
    and black otherwise; the onclick handler calls highlight(id, orientation).

    @filename : path of the svg file without the ".svg" extension
    @ids : dict term -> id
    @orientations : dict term -> orientation
    Returns the complete <svg> markup as one string.
    """
    fill_by_orientation = {
        'positive': 'rgb(44, 160, 44)',
        'negative': 'rgb(214, 39, 40)',
    }

    root = ET.parse(filename + ".svg").getroot()
    pieces = ['<svg width="500" height="275" font-family="Arial" fill="#a9a9a9" font-style="normal" font-weight="bold"> <g transform="scale(0.5)">']

    for group in root[1]:
        group_attributes = group.attrib
        label = group[0]

        transform = group_attributes['transform']
        x = label.attrib['x']
        y = label.attrib['y']
        size = group_attributes['font-size']
        word = label.text
        orientation = orientations[word]

        fill = fill_by_orientation.get(orientation, 'black')
        id_tag = ids[word]

        pieces.append("".join([
            '<g transform="', transform,
            '" font-size="', str(size),
            '" fill="', fill,
            '" onclick="highlight(`', id_tag, '`, `', orientation, '`)', '">',
            '<text x="', x, '" y="', y, '">', word, '</text></g>',
        ]))

    pieces.append('</g></svg>')
    return "".join(pieces)

# RUN SCRIPT

In [17]:
# One entry per review, collected in the row order of `data`.
html_col = []             # review text with highlight markup, wrapped in a <p> element
result_list = []          # word-cloud svg markup
filename = "test_cloud"   # base name of the word-cloud files, reused for every review

header = '<p id="review_text" class="review_result">'
footer = '</p>'

for i in data.index:
    row = data.loc[i]

    # SETUP
    shap = row["Shap_json"]
    review_text = row["Review"]

    # HIGHLIGHTS IN REVIEW TEXT
    html_col.append(header + getMiddle(review_text, shap) + footer)

    # CREATE INPUT FILE (word cloud)
    s = json.loads(shap)
    createInputFile(getFactors(s), getOccurrence(s), filename)

    # CREATE WORD CLOUDS
    generateSVG(filename)
    orientations = getOrientation(s)
    ids = getIDs(s)
    result_list.append(parseSVG(filename, ids, orientations))

    # RESET
    shap = ""
    review_text = ""

In [18]:
# Attach the generated markup; html_col and result_list hold one entry per row
# of `data`, in row order.
data["Highlights"] = html_col
data["Cloud"] = result_list

# Remove the columns that are not exported. errors='ignore' keeps this cell
# re-runnable: `del data[...]` raises a KeyError once the columns are gone.
data = data.drop(columns=['Prediction', 'Shap_json'], errors='ignore')

data.to_csv('nixon_html.csv', sep='\t', index=False)
data

Unnamed: 0.1,Unnamed: 0,Name,Title,Review,Rating,Highlights,Cloud
0,0,WOLVES,Will not retain proper day of week,I followed the instructions out of box to set ...,1.0,"<p id=""review_text"" class=""review_result"">I fo...","<svg width=""500"" height=""275"" font-family=""Ari..."
1,1,yvm_22,Not what I expected.,Only 11 and 12 line positions illuminate when ...,2.0,"<p id=""review_text"" class=""review_result"">Only...","<svg width=""500"" height=""275"" font-family=""Ari..."
2,2,Amazon Customer,I absolutely love everything about the aesthet...,I absolutely love everything about the aesthet...,3.0,"<p id=""review_text"" class=""review_result"">I ab...","<svg width=""500"" height=""275"" font-family=""Ari..."
3,3,Tyler Archibald,Days didn't work at first,When I first opened the box and tried setting ...,4.0,"<p id=""review_text"" class=""review_result"">When...","<svg width=""500"" height=""275"" font-family=""Ari..."
4,4,A. Velasco,Beautiful watch but glitchy date mechanism,this watch was given to me as a gift and i was...,4.0,"<p id=""review_text"" class=""review_result"">this...","<svg width=""500"" height=""275"" font-family=""Ari..."
5,5,Thuan Vu,"Sentry leather, my second Nixon.","Similar to my other sentry, this watch keeps p...",4.0,"<p id=""review_text"" class=""review_result""><spa...","<svg width=""500"" height=""275"" font-family=""Ari..."
6,6,Spacecookies4,Generally very happy with this watch,Generally very happy with this watch. Get a lo...,4.0,"<p id=""review_text"" class=""review_result""><spa...","<svg width=""500"" height=""275"" font-family=""Ari..."
7,7,Matt Holloway,Noice!,"I have many watches, and this one is my favori...",5.0,"<p id=""review_text"" class=""review_result"">I ha...","<svg width=""500"" height=""275"" font-family=""Ari..."
8,8,Kevin Simpson,"Solid Watch, Looks Phenomenal",This is the first watch I've bought for myself...,5.0,"<p id=""review_text"" class=""review_result"">This...","<svg width=""500"" height=""275"" font-family=""Ari..."
9,9,Amazon Customer,Good watch,The adjustable wrist strap was top-notch and t...,5.0,"<p id=""review_text"" class=""review_result"">The ...","<svg width=""500"" height=""275"" font-family=""Ari..."


In [19]:
# Export the final table a second time as JSON, written in the 'records' orientation.
data.to_json('nixon.json', orient='records')