# Clarity Scoring Function


In [1]:
import sys
import re
# import os.path
from os import makedirs
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from numpy import log
from scipy.special import rel_entr
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ccche\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Column names of the pipe-separated annotation file, in file order.
header_columns = [
    'Author_ID', 'Author_name', 'Comment_number', 'Sentence_number',
    'Domain_Relevance', 'Sentiment', 'Entity', 'Attribute',
    'Sentence', 'Source_file', 'Annotator', 'Aspect',
]

# Label inventories that are currently not used by this notebook:
# domain_relevance_classes = ['0', '9']
# entity_classes = ['g', 'p', 'f', 'c', 'cg', 'cp', 'cf', 'cc', 'gg']
# sentiment_classes = ['p', 'n', '0']
# print(len(attribute_classes) * len(entity_classes))

# Fine-grained attribute codes.
attribute_classes = [
    'g', 'p', 't', 'q', 's', 'h', 'c',
    'll', 'or', 'l', 'av', 'e', 'a', 'pp',
]

# Coarse attribute codes; entry i groups the fine codes listed in
# fine_coarse_attributes_classes[i].
coarse_attributes_classes = ['g', 'p', 'eq', 'sh', 'ts', 'e']
fine_coarse_attributes_classes = [
    ['g'],
    ['p'],
    ['t', 'q'],
    ['s', 'h', 'c'],
    ['ll', 'or', 'l', 'av'],
    ['e', 'a', 'pp'],
]

In [3]:
%%time
df = pd.read_csv('../dataframe.csv', sep='|', names = header_columns)
print(len(df))
df.sample(5)

8824
Wall time: 89 ms


Unnamed: 0,Author_ID,Author_name,Comment_number,Sentence_number,Domain_Relevance,Sentiment,Entity,Attribute,Sentence,Source_file,Annotator,Aspect
4542,0,Anonymous,964,2,9,0,f,g,The thing with organic farming is that one nee...,quora.json,sebastian,f-g
1483,Iida-Ruishalme,Iida Ruishalme,1699,18,0,,,,Pesticides all have to be some kind of chemica...,quora.json,felix,nan-nan
1070,Justine-Kimball,Justine Kimball,123,4,9,p,c,h,"That said, I love Whole Foods and their effort...",quora.json,hannah,c-h
5679,Shellie-Bowdoin,Shellie Bowdoin,941,5,0,,,,"Unfortunately, there are a lot of hidden eleme...",quora.json,sebastian,nan-nan
7153,Craig-Good,Craig Good,1240,1,9,0,p,g,"Maybe, but not because they’re organic.",quora.json,florian,p-g


In [4]:
# Keep only the sentences whose Domain_Relevance is the string '9'
# (the comparison is against a string, not the integer 9).
is_relevant = df['Domain_Relevance'] == '9'
df_relevant = df.loc[is_relevant]
print(df_relevant.shape[0])
df_relevant.sample(5)

4687


Unnamed: 0,Author_ID,Author_name,Comment_number,Sentence_number,Domain_Relevance,Sentiment,Entity,Attribute,Sentence,Source_file,Annotator,Aspect
1209,Kari-Lloyd,Kari Lloyd,772,16,9,p,p,h,"Organic grains, beans and pastas all make quic...",quora.json,sarthak,p-h
5010,Ryan-Carlyle,Ryan Carlyle,751,2,9,n,p,q,I worked in a grocery store produce department...,quora.json,sarthak,p-q
6361,Seema-Sharma-246,Seema Sharma,1322,6,9,n,g,g,"At this point of time, organic foods were defi...",quora.json,florian,g-g
7886,0,Anonymous,1610,1,9,p,p,h,A well-balanced organic cat food diet ensures ...,quora.json,omar,p-h
4818,Anju-95,Anju,29,2,9,p,p,av,People can buy the varieties of unpolished and...,quora.json,hannah,p-av


In [5]:
# Preprocessing switches: lemmatise tokens and/or drop English stop words.
lemmatize = True
remove_stopwords = True

# `lemmatizer` is None when lemmatisation is switched off.
lemmatizer = WordNetLemmatizer() if lemmatize else None

# `stop_words` is an empty set when stop-word removal is switched off, so the
# membership test in the tokenising loop keeps every token.
stop_words = set(stopwords.words('english')) if remove_stopwords else set()

In [6]:
# Tokeniser: a token is a run of two or more word characters between word
# boundaries, so single-character tokens are dropped. `(?u)` is the inline
# Unicode flag.
token_pattern = re.compile(r'(?u)\b\w\w+\b')

In [7]:
# The fine-grained attribute codes serve as the aspects.
aspects = attribute_classes
print(aspects)

# Maps each aspect code to the (initially empty) list of preprocessed
# segments annotated with that aspect.
aspect_segments = {aspect: [] for aspect in aspects}
print(aspect_segments)

# Every preprocessed segment, regardless of aspect.
all_segs = list()


['g', 'p', 't', 'q', 's', 'h', 'c', 'll', 'or', 'l', 'av', 'e', 'a', 'pp']
{'g': [], 'p': [], 't': [], 'q': [], 's': [], 'h': [], 'c': [], 'll': [], 'or': [], 'l': [], 'av': [], 'e': [], 'a': [], 'pp': []}


In [8]:
# Same containers as for the fine aspects, but keyed by the coarse codes.
coarse_aspects = coarse_attributes_classes
print(coarse_aspects)

# Maps each coarse aspect code to its (initially empty) list of segments.
coarse_aspect_segments = {aspect: [] for aspect in coarse_aspects}
print(coarse_aspect_segments)


['g', 'p', 'eq', 'sh', 'ts', 'e']
{'g': [], 'p': [], 'eq': [], 'sh': [], 'ts': [], 'e': []}


In [9]:
# Labelling scheme of the annotated dataset (round 3).
df_asp = pd.read_excel('../Labeling Workshop_updated_18-10-19.xlsx', sheet_name='Labeling Scheme')

# Build {aspect code -> label} from the rows whose first column is 'attribute'.
# Columns are addressed by position with .iloc: indexing a label-indexed row
# with plain integers (row[0]) relies on the deprecated positional fallback of
# Series.__getitem__.
# NOTE(review): assumes column 1 holds the code and column 2 its label, as the
# original positional access did - confirm against the sheet.
attribute_rows = df_asp[df_asp.iloc[:, 0] == 'attribute']
asp_meaning = dict(zip(attribute_rows.iloc[:, 1], attribute_rows.iloc[:, 2]))
#pprint(asp_meaning, sort_dicts= False)

In [20]:
# Human-readable label for every coarse aspect code.
coarse_asp_meaning = {
    'g': 'general',
    'p': 'price',
    'eq': 'experienced quality',
    'sh': 'safety and healthiness',
    'ts': 'trustworthy sources',
    'e': 'environment',
}


In [28]:
%%time
for i, row in df_relevant.iterrows():  
    if row['Attribute']!='NaN':
        seg_body = row['Sentence']
        seg_aspect = row['Attribute']
        seg_words = [word for word in token_pattern.findall(seg_body.lower())
                              if word not in stop_words]

        if lemmatizer is not None:
            seg_words = [lemmatizer.lemmatize(word) for word in seg_words]
        # prepared segment
        seg_prep = ' '.join(seg_words)
        # add to aspect-segment dictionary
        aspect_segments[seg_aspect].append(seg_prep)
        # add to all segments list
        all_segs.append(seg_prep)
print(aspect_segments['a'])


['innate chicken disease prone raised', 'look healthy factory farmed chicken free range chicken factory farmed bird crowded living others defecation', 'achieve number packed tight space', 'certainly better tight cage disease still huge problem nature many chicken would area', 'think meat chicken used killed co2 gas breaking neck let call recently added maceration form euthanasia newly born chicken', 'think people think organic meat automatically humanely raised meat', 'someone grew farm find people awareness modern farming method abysmal', 'example recently came hen certain type free range enviroments actually worse battery hen', 'example recently came hen certain type free range enviroments actually worse battery hen', 'packed shed within legal minimum called free range enough room roam healthily become stressed attack one another', 'another good reason eating organic food ethical reason', 'lastly ethic concerning treatment factory farmed animal', 'many vegitarians vegan could probabl

In [31]:
# Pool the fine-aspect segments into their coarse aspects.
# The grouping comes from the parallel lists coarse_attributes_classes /
# fine_coarse_attributes_classes rather than being spelled out again by hand,
# so the mapping is defined in one place only.
assert len(coarse_attributes_classes) == len(fine_coarse_attributes_classes), \
    'coarse and fine attribute groupings must be parallel lists'

# Each coarse list is a fresh list (never an alias of a fine list), so later
# changes to aspect_segments cannot silently alter the coarse segments.
coarse_aspect_segments = {
    coarse: [segment for fine in fine_codes for segment in aspect_segments[fine]]
    for coarse, fine_codes in zip(coarse_attributes_classes, fine_coarse_attributes_classes)
}
print(len(coarse_aspect_segments['sh']))


5256


In [32]:
# Fit a TF-IDF vectoriser on the individual segments, then score the whole
# corpus as one concatenated document to obtain the global term weights.
# With norm='l1' the weights of a document are scaled by their absolute sum.
# Note: stop_words='english' selects scikit-learn's built-in list, which is
# applied in addition to the stop-word filtering done during preprocessing.
stop_word_setting = 'english' if remove_stopwords else None
vectorizer = TfidfVectorizer(stop_words=stop_word_setting, norm='l1', use_idf=True)
vectorizer.fit(all_segs)

corpus_as_one_document = ' '.join(all_segs)
gl_freq = vectorizer.transform([corpus_as_one_document]).toarray()[0]
print(len(gl_freq))

5828


In [33]:
# Global weight of every vocabulary term, looked up through the term's
# column index in the vectoriser.
gl_scores = {term: gl_freq[idx] for term, idx in vectorizer.vocabulary_.items()}
print (len(gl_scores))


5828


In [21]:
%%time
asp_scores = dict([(aspect, {}) for aspect in aspect_segments.keys()])
sorted_asp_scores = dict([(aspect, {}) for aspect in aspect_segments.keys()])
score_seed_words = dict([(aspect, []) for aspect in asp_meaning.values()])

for aspect, segments in aspect_segments.items():
    # aspect-specific scores
    asp_freq = vectorizer.transform([' '.join(segments)]).toarray()[0]
    
    # entropies correspond to clarity scores
    entropies = rel_entr(asp_freq, gl_freq) / log(2)
    for term, idx in vectorizer.vocabulary_.items():
        asp_scores[aspect][term] = entropies[idx]
        
    # sort by score and write to sorted_asp_scores if > 0
    scores = sorted(asp_scores[aspect].items(), reverse=True, key=lambda x:x[1])
    for term, cla in scores[0:30]:
        if cla > 0:
            sorted_asp_scores[aspect][term] = cla
            score_seed_words[asp_meaning[aspect]].append(term)

#pprint(sorted_asp_scores, sort_dicts=False)
#pprint(score_seed_words, sort_dicts=False)

df_out = pd.DataFrame.from_dict(data=score_seed_words, orient='index')

if remove_stopwords:
    df_out.to_excel('../processed/score_seed_words_remove_stop.xlsx')
else:
    df_out.to_excel('../processed/score_seed_words.xlsx')

Wall time: 164 ms


In [35]:
# Clarity scores per coarse aspect: for every vocabulary term, the element-wise
# relative entropy between the aspect-specific weights and the global weights.
# The top-scoring terms of each coarse aspect become its seed words.

# Number of top-ranked terms considered as seed words per coarse aspect.
n_coarse_seed_words = 35

asp_scores = {aspect: {} for aspect in coarse_aspect_segments}
sorted_asp_scores = {aspect: {} for aspect in coarse_aspect_segments}
score_seed_words = {meaning: [] for meaning in coarse_asp_meaning.values()}

for aspect, segments in coarse_aspect_segments.items():
    # aspect-specific weights: all segments of the aspect as one document
    asp_freq = vectorizer.transform([' '.join(segments)]).toarray()[0]

    # rel_entr(x, y) is x * log(x / y) element-wise; dividing by log(2)
    # converts the natural-log result to base 2.
    entropies = rel_entr(asp_freq, gl_freq) / log(2)
    asp_scores[aspect] = {term: entropies[idx]
                          for term, idx in vectorizer.vocabulary_.items()}

    # Rank terms by score (descending) and keep the strictly positive ones
    # among the first n_coarse_seed_words.
    ranked = sorted(asp_scores[aspect].items(), key=lambda item: item[1], reverse=True)
    for term, cla in ranked[:n_coarse_seed_words]:
        if cla > 0:
            sorted_asp_scores[aspect][term] = cla
            score_seed_words[coarse_asp_meaning[aspect]].append(term)

print(sorted_asp_scores)
print(score_seed_words)

# One row per coarse aspect label, one column per seed-word rank.
df_out = pd.DataFrame.from_dict(data=score_seed_words, orient='index')

# Make sure the output directory exists; to_excel does not create it.
makedirs('../processed', exist_ok=True)

if remove_stopwords:
    df_out.to_excel('../processed/score_seed_words_remove_stop_coarse.xlsx')
else:
    df_out.to_excel('../processed/score_seed_words_coarse.xlsx')

{'g': {'gmo': 0.0034220729016981124, 'farming': 0.003251384551384033, 'store': 0.0026979023506027502, 'organic': 0.0024181960176268683, 'india': 0.002208064794842297, 'definition': 0.002202229252914021, 'online': 0.0018898598216810533, 'farm': 0.0018530984385520955, 'seed': 0.0018348630535307273, 'start': 0.0016768723632866682, 'best': 0.001656168123002347, 'dairy': 0.0016352527818798814, 'gmos': 0.0015955780189251085, 'cent': 0.0015725143114748688, 'egg': 0.001529844604986125, 'big': 0.0014552920117768364, 'farmer': 0.0013613723033514854, 'thing': 0.0013335629401660326, 'purchase': 0.0013257227395019554, 'market': 0.0013242001489428316, 'soybean': 0.0013223493838421954, 'monsanto': 0.0012629831189050185, 'buy': 0.0012507066326750212, 'dollar': 0.0012476833126939882, 'item': 0.001236022971720407, 'soy': 0.001224503650968605, 'like': 0.0011994535209259602, 'agriculture': 0.0011941462297654613, 'product': 0.001191512495915708, 'genetic': 0.0011886119434208677, 'method': 0.001175152824786