# Clarity Scoring Function


In [1]:
import sys
import re
# import os.path
from os import makedirs
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from numpy import log
from scipy.special import rel_entr
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Column names of the pipe-separated annotation table, in file order.
header_columns = [
    'Author_ID',
    'Author_name',
    'Comment_number',
    'Sentence_number',
    'Domain_Relevance',
    'Sentiment',
    'Entity',
    'Attribute',
    'Sentence',
    'Source_file',
    'Annotator',
    'Aspect',
]

# Label codes of the other annotation dimensions, kept for reference only
# (they are not assigned to any variable here):
#   domain relevance: '0', '9'
#   entity:           'g', 'p', 'f', 'c', 'cg', 'cp', 'cf', 'cc', 'gg'
#   sentiment:        'p', 'n', '0'

# Codes that may appear in the 'Attribute' column.
attribute_classes = [
    'g', 'p', 't', 'q', 's', 'h', 'c',
    'll', 'or', 'l', 'av', 'e', 'a', 'pp',
]

In [4]:
%%time
# Load the pipe-separated annotation table, naming its columns explicitly.
# NOTE(review): `names=` is given without `header=`, so pandas treats the first
# line of the file as data; if the file starts with a header row, that row ends
# up as a regular record -- TODO confirm against ../dataframe.csv.
df = pd.read_csv('../dataframe.csv', sep='|', names = header_columns)
print(len(df))
# Fixed seed so the preview is identical on every Restart & Run All.
df.sample(5, random_state=0)

8824
Wall time: 177 ms


Unnamed: 0,Author_ID,Author_name,Comment_number,Sentence_number,Domain_Relevance,Sentiment,Entity,Attribute,Sentence,Source_file,Annotator,Aspect
6551,John-Burgess-1,John Burgess,619,6,0,,,,Are you going to start with local distribution?,quora.json,sumit,nan-nan
894,Joshua-Engel,Joshua Engel,511,2,0,,,,"that is actually the origin of the name ""organ...",quora.json,fahad,nan-nan
786,Mike-Hatchet,Mike Hatchet,1910,7,0,,,,​Now of course if you have just made pasta sa...,quora.json,abilasha,nan-nan
8347,Craig-Good,Craig Good,57,3,0,,,,It probably means you have a specific deficiency.,quora.json,hannah,nan-nan
4136,Ryan-Carlyle,Ryan Carlyle,758,3,0,,,,I agree that it is worthy of concern.,quora.json,sarthak,nan-nan


In [5]:
# Keep only the rows whose Domain_Relevance code is '9'.
# The column is compared as text so the filter also works when pandas parses
# it as integers instead of strings (a plain `== '9'` would then silently
# select zero rows).
df_relevant = df.loc[df['Domain_Relevance'].astype(str) == '9']
print(len(df_relevant))
# Fixed seed so the preview is identical on every Restart & Run All.
df_relevant.sample(5, random_state=0)

4687


Unnamed: 0,Author_ID,Author_name,Comment_number,Sentence_number,Domain_Relevance,Sentiment,Entity,Attribute,Sentence,Source_file,Annotator,Aspect
4288,Timothy-Sly,Timothy Sly,16,3,9,n,p,g,Numerous studies have attempted to find whethe...,quora.json,hannah,p-g
2112,Helena-Roman,Helena Roman,1044,10,9,n,g,g,Organic is not always better.,quora.json,sebastian,g-g
496,Sutrisno-Sukendy,Sutrisno Sukendy,1557,7,9,p,cf,t,"However if you go to farmers market, you will ...",quora.json,omar,cf-t
7987,R-J-Rangel,R.J. Rangel,799,2,9,0,p,g,next time or two local grocery right to know w...,quora.json,kamal,p-g
8018,Benjamin-Weingarten-2,Benjamin Weingarten,1848,1,9,p,p,g,"According To me, organic food is better then n...",quora.json,abilasha,p-g


In [16]:
# Preprocessing switches: lemmatization and stop-word removal.
lemmatize = True
remove_stopwords = True

# `lemmatizer` is None when lemmatization is switched off.
lemmatizer = WordNetLemmatizer() if lemmatize else None

# An empty set means no token is filtered out as a stop word.
stop_words = set(stopwords.words('english')) if remove_stopwords else set()

In [17]:
# A token is a run of two or more word characters delimited by word
# boundaries; single-character tokens are therefore never matched.
token_pattern = re.compile(
    r'(?u)\b\w\w+\b'
)

In [18]:
# The attribute codes are used as the aspects.
aspects = attribute_classes
print(aspects)

# Maps each aspect code to the list of segments collected for it
# (every aspect starts with its own empty list).
aspect_segments = {aspect: [] for aspect in aspects}
print(aspect_segments)

# Flat list holding every segment, regardless of aspect.
all_segs = list()

['g', 'p', 't', 'q', 's', 'h', 'c', 'll', 'or', 'l', 'av', 'e', 'a', 'pp']
{'g': [], 'p': [], 't': [], 'q': [], 's': [], 'h': [], 'c': [], 'll': [], 'or': [], 'l': [], 'av': [], 'e': [], 'a': [], 'pp': []}


In [19]:
# labelling scheme of annotated dataset round 3
df_asp = pd.read_excel('../Labeling Workshop_updated_18-10-19.xlsx', sheet_name='Labeling Scheme')

# A dictionary mapping each attribute code to its label.
# Columns are addressed by position with `.iloc` (first = kind of label,
# second = code, third = label text): indexing a row with a bare integer
# (`row[0]`) relies on the positional fallback of label-based Series indexing,
# which is deprecated in recent pandas versions.
attribute_rows = df_asp[df_asp.iloc[:, 0] == 'attribute']
asp_meaning = dict(zip(attribute_rows.iloc[:, 1], attribute_rows.iloc[:, 2]))
pprint(asp_meaning, sort_dicts= False)

{'g': 'general',
 'p': 'price',
 't': 'taste',
 'q': 'nutritional quality/freshness/appearance',
 's': 'safety',
 'h': 'healthiness',
 'c': 'chemicals/pesticides',
 'll': 'label',
 'or': 'origin, source',
 'l': 'local',
 'av': 'availability',
 'e': 'environment',
 'a': 'animal welfare',
 'pp': 'productivity'}


In [20]:
%%time
# Start from empty collections so that re-running this cell rebuilds the
# segments instead of appending every sentence a second time.
all_segs.clear()
for collected in aspect_segments.values():
    collected.clear()

for i, row in df_relevant.iterrows():
    seg_body = row['Sentence']
    seg_aspect = row['Attribute']

    # Missing cells are loaded by pandas as float NaN, not as the string
    # 'NaN', so a string comparison can never detect them; use pd.isna.
    # Rows lacking either the attribute or the sentence are skipped.
    if pd.isna(seg_aspect) or pd.isna(seg_body):
        continue

    # lower-case, tokenize and drop stop words
    seg_words = [word for word in token_pattern.findall(seg_body.lower())
                          if word not in stop_words]

    if lemmatizer is not None:
        seg_words = [lemmatizer.lemmatize(word) for word in seg_words]
    # prepared segment
    seg_prep = ' '.join(seg_words)
    # add to aspect-segment dictionary (an attribute code that is not a key of
    # aspect_segments still raises KeyError, as before)
    aspect_segments[seg_aspect].append(seg_prep)
    # add to all segments list
    all_segs.append(seg_prep)
print(len(all_segs))

4687
Wall time: 2.69 s


In [21]:
# Fit a TF-IDF vectorizer (L1-normalised rows) on the individual segments.
tfidf_stop_words = 'english' if remove_stopwords else None
vectorizer = TfidfVectorizer(stop_words=tfidf_stop_words, norm='l1', use_idf=True)
vectorizer.fit(all_segs)

# Global term weights: all segments joined into one document and transformed
# with the fitted vectorizer; the single resulting row is taken as a 1-D array.
corpus_text = ' '.join(all_segs)
gl_freq = vectorizer.transform([corpus_text]).toarray()[0]
print(len(gl_freq))

6115


In [22]:
# Global score of every vocabulary term, looked up through the term's column
# index in the vectorizer vocabulary.
gl_scores = {term: gl_freq[idx] for term, idx in vectorizer.vocabulary_.items()}
print (len(gl_scores))


6115


In [23]:
%%time
# Number of top-scoring terms kept per aspect as seed words.
top_k = 10

# Per-aspect score of every vocabulary term.
asp_scores = {aspect: {} for aspect in aspect_segments}
# Per-aspect top terms with a strictly positive score, best first.
sorted_asp_scores = {aspect: {} for aspect in aspect_segments}
# Same top terms, keyed by the aspect's label instead of its code.
score_seed_words = {label: [] for label in asp_meaning.values()}

for aspect, segments in aspect_segments.items():
    # aspect-specific scores: all segments of the aspect joined into one document
    asp_freq = vectorizer.transform([' '.join(segments)]).toarray()[0]

    # entropies correspond to clarity scores; rel_entr is element-wise
    # x * log(x / y) in nats, dividing by log(2) converts it to bits
    entropies = rel_entr(asp_freq, gl_freq) / log(2)
    for term, idx in vectorizer.vocabulary_.items():
        asp_scores[aspect][term] = entropies[idx]

    # sort by score and write to sorted_asp_scores if > 0
    scores = sorted(asp_scores[aspect].items(), reverse=True, key=lambda x:x[1])
    for term, cla in scores[:top_k]:
        if cla > 0:
            sorted_asp_scores[aspect][term] = cla
            score_seed_words[asp_meaning[aspect]].append(term)

pprint(sorted_asp_scores, sort_dicts=False)
pprint(score_seed_words, sort_dicts=False)

# One row per aspect label, one column per seed-word rank.
df_out = pd.DataFrame.from_dict(data=score_seed_words, orient='index')

# Make sure the output directory exists; to_excel does not create it.
makedirs('../Clarity_Score', exist_ok=True)

if remove_stopwords:
    df_out.to_excel('../Clarity_Score/score_seed_words_remove_stop.xlsx')
else:
    df_out.to_excel('../Clarity_Score/score_seed_words.xlsx')

{'g': {'gmo': 0.0022817229820224186,
       'farming': 0.002175556486316003,
       'store': 0.0017976296114928577,
       'organic': 0.001658118009321377,
       'all': 0.0015374485627084038,
       'india': 0.0014687891871238948,
       'definition': 0.0014616690768175908,
       'online': 0.0012560642721005564,
       'farm': 0.0012399567081179926,
       'seed': 0.0012181513580840782},
 'p': {'price': 0.043783691617119286,
       'expensive': 0.0407781899965861,
       'cost': 0.035727172565851104,
       'more': 0.019353661511197193,
       'pay': 0.014380946836831218,
       'money': 0.01319341640336146,
       'demand': 0.012557539427471027,
       'higher': 0.011392692648171767,
       'extra': 0.01132344073846946,
       'charge': 0.008982554383024737},
 't': {'taste': 0.17097895060671403,
       'better': 0.04489147647094243,
       'flavor': 0.032051047848199524,
       'difference': 0.024307562741049275,
       'tasty': 0.01436941325033943,
       'delicious': 0.01284895112

Wall time: 963 ms
