# NRC Hashtag Emotion Corpus
## Tests for SCP (Cohesion & Accommodation), LSM, LLR, and statistical significance tests

### Generate emotion tags for Tweets

In [5]:
from collections import OrderedDict, defaultdict,Counter
import pandas as pd
import sys
import csv
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import re
import seaborn as sns 
from collections import Counter
from scipy import stats

import random

import SCP_Code as scp

import NRCHash as nrc

In [6]:
# Load the '~'-delimited pairs file; its a_username / b_username columns are
# used further down to build the per-conversation username pairs.
tweetData = pd.read_csv('pairs_edit_utf_noSquig.csv', sep='~',)

In [7]:
# Load the tab-separated tagged-tweet files for the A and the B messages.
# Both files are read without a header row, so the four column names are
# assigned by hand afterwards.
tagdataA = pd.read_csv('A_Tweets_Tagged.txt', sep='\t', header=None)
tagdataA.columns = ['Tokens', 'Tags', 'Conf','Raw Tweet']
tagdataB = pd.read_csv('B_Tweets_Tagged.txt', sep='\t', header=None)
tagdataB.columns = ['Tokens', 'Tags', 'Conf','Raw Tweet']

In [11]:
# Load the NRC Hashtag Emotion Lexicon into two lookup tables:
#   wordList[term]      -> affect categories associated with that term
#   emotionList[affect] -> terms associated with that affect category
wordList = defaultdict(list)
emotionList = defaultdict(list)
#with open ('C:/Users/Harper/Documents/Practice for Dissertation/NRC-Sentiment-Emotion-Lexicons/AutomaticallyGeneratedLexicons/NRC-Hashtag-Emotion-Lexicon-v0.2/NRC-Hashtag-Emotion-Lexicon-v0.2.txt','r') as f:
with open ('/its/home/kh414/Documents/Dissertation/Code_To_Parse/NRC-Sentiment-Emotion-Lexicons/NRC-Sentiment-Emotion-Lexicons/AutomaticallyGeneratedLexicons/NRC-Hashtag-Emotion-Lexicon-v0.2/NRC-Hashtag-Emotion-Lexicon-v0.2.txt','r') as f:
    reader = csv.reader(f, delimiter = '\t')
    # Skip the first 46 rows, which this notebook treats as header material.
    headerRows = list(range(46))
    for _ in headerRows:
        next(reader)
    # Every remaining row is <AffectCategory><tab><term><tab><score>;
    # the higher the score, the stronger the association.
    for Affect, word, score in reader:
        strongEnough = float(score) > 0.1  # only scores above 0.1 are kept
        if strongEnough:
            wordList[word].append(Affect)
            emotionList[Affect].append(word)

In [12]:
# Function to generate emotion count using the word list generated by the above lexicon.
def generateEmotionCount(string, lexicon=None):
    """Count the emotion categories associated with the tokens of a message.

    The message is split on whitespace; each token is lower-cased and looked
    up in the lexicon, and every affect category listed for it is tallied.

    Parameters
    ----------
    string : str
        The message text (whitespace-separated tokens).
    lexicon : mapping of str -> iterable of str, optional
        Maps a term to its affect categories. Defaults to the module-level
        ``wordList`` built from the NRC lexicon.

    Returns
    -------
    collections.Counter
        Affect category -> number of token/category hits in the message.
        Tokens that are not in the lexicon contribute nothing.
    """
    if lexicon is None:
        lexicon = wordList
    emoCount = Counter()
    for token in string.split():
        # Use .get() rather than lexicon[token]: wordList is a defaultdict, so
        # indexing would insert an empty entry for every unknown token and
        # make the lexicon grow with each tweet processed.
        emoCount.update(lexicon.get(token.lower(), ()))
    return emoCount

In [13]:
# A missing 'Tokens' value is replaced by the literal word 'null' so that every
# row can be split into tokens.
for taggedFrame in (tagdataA, tagdataB):
    taggedFrame.fillna({'Tokens':'null'}, inplace=True)

# One emotion counter per tweet, in row order.
emotionCountsA = list(map(generateEmotionCount, tagdataA.Tokens))
emotionCountsB = list(map(generateEmotionCount, tagdataB.Tokens))

In [None]:
# Turn the per-tweet counters into frames (one column per emotion); an emotion
# that is absent from a tweet's counter shows up as NaN.
nanTestA = pd.DataFrame(emotionCountsA)
nanTestB = pd.DataFrame(emotionCountsB)

In [None]:
# Calculate null values in this lexicon:
# per emotion column, the number of tweets with no entry (NaN) for that emotion.
asum = nanTestA.isnull().sum()
bsum = nanTestB.isnull().sum()

In [15]:
# Emotion-count frames that share their row index with the tagged-tweet frames,
# so row i of each frame describes row i of tagdataA / tagdataB.
emotionCountsADF = pd.DataFrame(emotionCountsA, index = tagdataA.index)
emotionCountsBDF = pd.DataFrame(emotionCountsB, index = tagdataB.index)
emotionCountsADF.head()

Unnamed: 0,anger,anticipation,disgust,fear,joy,sadness,surprise,trust
0,5.0,5.0,,5.0,4.0,1.0,7.0,1.0
1,5.0,5.0,,5.0,4.0,1.0,7.0,1.0
2,6.0,1.0,2.0,1.0,,1.0,,2.0
3,,1.0,2.0,1.0,,1.0,,3.0
4,3.0,5.0,2.0,4.0,1.0,2.0,4.0,2.0


In [25]:
# Remove the final row from both A-side frames (tagdataA and its emotion counts),
# then show how many rows of emotion counts remain.
tagdataA = tagdataA.drop(tagdataA.index[-1])
emotionCountsADF = emotionCountsADF.drop(emotionCountsADF.index[-1])
len(emotionCountsADF)

In [17]:
from sklearn.utils import shuffle

# Random baseline: shuffle the rows of the B frame and renumber them from 0 so
# that row i of A is paired with an arbitrary B row.
# NOTE(review): no random_state is passed, so the shuffle differs between runs.
shufBDF = shuffle(emotionCountsBDF).reset_index(drop=True)
shufBDF.head()

Unnamed: 0,anger,anticipation,disgust,fear,joy,sadness,surprise,trust
0,3.0,5.0,4.0,4.0,2.0,,1.0,2.0
1,2.0,1.0,3.0,1.0,,,3.0,1.0
2,1.0,1.0,,2.0,1.0,3.0,,1.0
3,3.0,3.0,3.0,1.0,1.0,2.0,3.0,3.0
4,1.0,,1.0,,3.0,1.0,5.0,


In [18]:
# The eight emotion categories analysed below; they match the column names of
# the emotion-count frames.
markers = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']

In [19]:
# Function to get the shared markers between tweet-reply messages.
def getsharedNRCSentiment(markers,emotionA,emotionB):
    """List, for every message pair, the markers present in both messages.

    Row i of ``emotionA`` is paired with row i of ``emotionB`` by position.
    A marker counts as shared when its count is greater than 1 in both rows
    (low values are treated as noise); a missing count (NaN) never qualifies.

    Parameters
    ----------
    markers : sequence of str
        Column names to test, in the order they should appear in the result.
    emotionA, emotionB : pandas.DataFrame (or mapping of column -> sequence)
        Per-message emotion counts. ``emotionB`` must have at least as many
        rows as ``emotionA``; extra rows of ``emotionB`` are ignored.

    Returns
    -------
    list of list of str
        One list per row of ``emotionA`` holding the shared marker names.

    Raises
    ------
    ValueError
        If ``emotionB`` has fewer rows than ``emotionA``.
    """
    n_rows = len(emotionA)
    if len(emotionB) < n_rows:
        raise ValueError('emotionB has fewer rows than emotionA')

    # One boolean column per marker. Comparing whole columns by position avoids
    # label-based lookups, which fail as soon as the index is not 0..n-1.
    shared_mask = np.zeros((n_rows, len(markers)), dtype=bool)
    for col, mark in enumerate(markers):
        a_counts = np.asarray(emotionA[mark])[:n_rows]
        b_counts = np.asarray(emotionB[mark])[:n_rows]
        shared_mask[:, col] = (a_counts > 1) & (b_counts > 1)  # Filter noisy sentiment (low value)

    return [[mark for mark, hit in zip(markers, row) if hit] for row in shared_mask]

In [20]:
# Shared markers for the natural pairing (A row i with B row i); print the first five.
sharedHash = getsharedNRCSentiment(markers,emotionCountsADF,emotionCountsBDF)
print(sharedHash[0:5])

[['anger', 'anticipation', 'fear', 'joy'], ['anger'], [], [], ['fear', 'surprise']]


In [21]:
# Shared markers for the random baseline (A rows against the shuffled B rows).
sharedHashRand = getsharedNRCSentiment(markers,emotionCountsADF,shufBDF)
print(sharedHashRand[0:5])

[['anger', 'anticipation', 'fear', 'joy'], ['anger', 'surprise'], [], ['disgust', 'trust'], ['surprise']]


### Get natural and random shared counts for LLR analysis

In [None]:
## Flatten shared lists:
# one flat list of marker names per condition, then print both lengths
# (total number of shared-marker occurrences, natural vs. shuffled).
flat_share = [item for sublist in sharedHash for item in sublist]
flat_shuff = [item for sublist in sharedHashRand for item in sublist]
print(len(flat_share),len(flat_shuff))

In [None]:
# Frequency of each shared marker in the natural and in the shuffled pairing.
counterShared = Counter(flat_share)
counterRandShuf = Counter(flat_shuff)

print(counterShared)
print('\n',counterRandShuf)

In [None]:
#Generate frequency list for LLR analysis

# One row per marker: natural count ('Shared'), shuffled-baseline count
# ('Rand Shared', aligned on the marker name) and their difference.
countershareddf = pd.DataFrame.from_dict(counterShared,orient='index')
dtemp = pd.DataFrame.from_dict(counterRandShuf,orient='index')
dtemp.columns = ['Shared Rand']
countershareddf.columns = ['Shared']
countershareddf['Rand Shared'] = dtemp['Shared Rand']

countershareddf['Difference'] = countershareddf['Shared'] - countershareddf['Rand Shared']
countershareddf

In [None]:
# Print the column totals, then export the frequency table as LaTeX.
print(countershareddf.sum())

with open ('SharedFreq.tex','w') as tf:
    tf.write(countershareddf.to_latex())

# SCP

In [22]:
# Compute the SCP cohesion scores per marker from the natural and the shuffled
# shared-marker lists.
# NOTE(review): scp.CalculateAllCohesion is defined outside this file; judging
# by the column names given to its result further down, each value is
# [tweet-reply cohesion, random-reply cohesion, subtractive score] - confirm.
SCP_ScoresNRC = scp.CalculateAllCohesion(markers,sharedHash,sharedHashRand)
SCP_ScoresNRC

{'anger': [0.2270217403967906, 0.19553480272375806, 0.03148693767303254],
 'anticipation': [0.18988125714741222,
  0.16013269276168343,
  0.029748564385728787],
 'disgust': [0.18535903386915292, 0.1513611125589039, 0.033997921310249035],
 'fear': [0.12885423724656386, 0.10179447951509503, 0.027059757731468834],
 'joy': [0.1376411470197782, 0.10891966385317951, 0.028721483166598685],
 'sadness': [0.055302958300502504, 0.04272964462989818, 0.012573313670604325],
 'surprise': [0.2760518384753822, 0.2485168640604353, 0.027534974414946928],
 'trust': [0.0998414922417044, 0.08033001499231988, 0.019511477249384512]}

In [None]:
# One row per marker, one named column per element of its score list.
CohesionDF = pd.DataFrame.from_dict(SCP_ScoresNRC,orient = 'index')
CohesionDF.columns = ['TweetReply Cohesion', 'RandomReply Cohesion','Subtractive Cohesion Score']
CohesionDF.head()

In [None]:
# Write the cohesion table to a LaTeX file.
with open ('CohesionTable_NRCHashSent_SCP.tex','w') as tf:
    tf.write(CohesionDF.to_latex())

In [None]:
# Horizontal bar chart of the subtractive cohesion score, sorted ascending,
# saved next to the notebook.
plot1 = CohesionDF['Subtractive Cohesion Score'].sort_values().plot(kind = 'barh',legend = False, color = 'steelblue')#,figsize = (20,10))
                    
#plot1.tick_params(labelbottom=False, bottom=False,which='both')
plt.xlabel('SCP Score')
plt.ylabel('Sentiment Marker')
plt.title('Subtractive Global Alignment NRC Emotion')
plt.savefig('SubtractiveGlobalAlignment_SentimentNRC')

# LSM

In [None]:
def CalculateLSM(markers,TweetTagsA,TweetTagsB):
    """Compute a Linguistic Style Matching (LSM) score for every marker.

    For each marker, pA and pB are the proportions of messages in A and in B
    that contain it (as counted by ``countMarkers2``), and

        LSM = 1 - |pA - pB| / (pA + pB)

    Parameters
    ----------
    markers : iterable of str
        Marker (column) names to score.
    TweetTagsA, TweetTagsB : pandas.DataFrame
        Per-message marker counts for the two message sets.

    Returns
    -------
    dict
        marker -> [pA, pB, LSM score]. The score is NaN when the marker occurs
        in neither set (the ratio is undefined); a proportion is 0.0 when its
        message set is empty.
    """
    # Each proportion is taken over its own message set, so the result stays
    # correct when A and B do not hold the same number of messages.
    totalA = len(TweetTagsA)
    totalB = len(TweetTagsB)
    allLSM = {}
    for mark in markers:
        pA = countMarkers2(mark,TweetTagsA)/totalA if totalA else 0.0
        pB = countMarkers2(mark,TweetTagsB)/totalB if totalB else 0.0
        denominator = pA + pB
        if denominator == 0:
            # Marker absent from both sets: avoid dividing by zero.
            LSMScore = float('nan')
        else:
            LSMScore = 1 - abs(pA-pB)/denominator
        allLSM[mark] = [pA,pB,LSMScore]
    return allLSM

In [None]:
# LSM per emotion marker for the A / B emotion-count frames.
# Note: CalculateLSM relies on countMarkers2, which is defined in a later cell,
# so that cell has to be executed before this one.
LSM_Sentiment = CalculateLSM(markers, emotionCountsADF,emotionCountsBDF)
LSM_Sentiment

In [None]:
#Function to iterate through a NRC dataframe of emotion counts per message and return a tally of the
# messages whose count for the given marker is greater than the threshold (1 by default).
def countMarkers2(marker, message_set, threshold=1):
    """Count the messages whose ``marker`` value exceeds ``threshold``.

    Parameters
    ----------
    marker : str
        Column (emotion) to inspect.
    message_set : pandas.DataFrame or mapping of str -> iterable of numbers
        Per-message emotion counts; ``message_set[marker]`` is iterated.
    threshold : number, optional
        A message is tallied only when its count is strictly greater than
        this value. The default of 1 matches the noise filter used when
        collecting shared markers.

    Returns
    -------
    int
        Number of qualifying messages. Missing counts (NaN) never qualify.
    """
    return sum(1 for countItem in message_set[marker] if countItem > threshold)
# Quick check of countMarkers2 on the first five A rows.
countMarkers2('anger',emotionCountsADF[0:5])

In [None]:
# One row per marker with its two proportions and the resulting LSM score.
LSMdf = pd.DataFrame.from_dict(LSM_Sentiment,orient = 'index')
LSMdf.columns = ['pA', 'pB','LSM Score']
LSMdf

In [None]:
# Write the LSM table to a LaTeX file.
with open ('LSMTable_SentimentNRCHash.tex','w') as tf:
    tf.write(LSMdf.to_latex())

In [None]:
# Visualize the LSM scores as a horizontal bar chart, sorted ascending.
plot1 = LSMdf['LSM Score'].sort_values().plot(kind = 'barh',legend = False, color = 'steelblue')#,figsize = (20,10))
                    
#plot1.tick_params(labelbottom=False, bottom=False,which='both')
plt.xlabel('LSM Score')
# The x-axis is restricted to 0.95-1.005, so bars below 0.95 are cut off.
plt.xlim(0.95,1.005)
plt.ylabel('Sentiment')
plt.title('Linguistic Style Matching Sentiment - NRC Hashtag')
plt.savefig('LSM_Sentiment_NRCHash')

# LLR

In [None]:
import FunctionsForLLR as llr

In [None]:
# LLR scores per marker, using the shared markers of the natural pairing (sharedHash).
# NOTE(review): the earlier comment here read "Random: sharedRandSentAnB", which
# does not match the argument actually passed - confirm which was intended.
LLR_Sent = llr.getLLR(emotionCountsA,emotionCountsB,sharedHash,markers)
LLR_Sent

In [None]:
# One row per marker holding its LLR score.
LLRSentDF = pd.DataFrame.from_dict(LLR_Sent, orient='index')
LLRSentDF.columns = ['LLR Scores Sentiment']
LLRSentDF

In [None]:
# Visualize the LLR scores as a horizontal bar chart, sorted ascending.
plot1 = LLRSentDF['LLR Scores Sentiment'].sort_values().plot(kind = 'barh', color = 'SteelBlue')#,figsize = (20,10))
                    
#plot1.tick_params(labelbottom=False, bottom=False,which='both')
plt.xlabel('LLR Score')
plt.ylabel('Sentiment Marker')
plt.title('LLR Score for Sentiment Markers')
plt.savefig('LLRScores_Sentiment_NRCHash')

In [None]:
# Scramble the B emotion counters for the random baseline: random.sample with
# the full length returns a shuffled copy and leaves emotionCountsB untouched.
# The first five entries of both lists are printed for comparison.

shuffledCountDictB = random.sample(emotionCountsB,len(emotionCountsB))
print(shuffledCountDictB[0:5],'\n',emotionCountsB[0:5])

In [None]:
# Shared markers for the shuffled baseline.
# NOTE(review): this compares the shuffled B counters with the original B
# counters; a random tweet-reply baseline would presumably pair emotionCountsA
# with shuffledCountDictB instead - confirm before relying on the results.
sharedNew = scp.getSharedMarkers(shuffledCountDictB,emotionCountsB)

In [None]:
# Random baseline: LLR scores computed with the shuffled shared markers (sharedNew).
LLR_Sent_Rand = llr.getLLR(emotionCountsA,emotionCountsB,sharedNew,markers)
LLR_Sent_Rand

In [None]:
# One row per marker holding its baseline LLR score.
LLRSentDFR = pd.DataFrame.from_dict(LLR_Sent_Rand, orient='index')
LLRSentDFR.columns = ['LLR Scores Sentiment Random']
LLRSentDFR

In [None]:
# Visualize the baseline LLR scores as a horizontal bar chart, sorted ascending.
plot1 = LLRSentDFR['LLR Scores Sentiment Random'].sort_values().plot(kind = 'barh', color = 'SteelBlue')#,figsize = (20,10))
                    
#plot1.tick_params(labelbottom=False, bottom=False,which='both')
plt.xlabel('LLR Score')
plt.ylabel('Sentiment Marker')
plt.title('LLR Score for Baseline Sentiment Markers')
plt.savefig('LLRScores_Sentiment_NRCHash_Baseline')

In [None]:
# Write both LLR tables (baseline first, then natural pairing) to LaTeX files.
with open ('LLR_Sentiment_NRC_Hash_Rand.tex','w') as tf:
    tf.write(LLRSentDFR.to_latex())
with open ('LLR_Sentiment_NRC_Hash.tex','w') as tf2:
    tf2.write(LLRSentDF.to_latex())

# PScore for NRC Emotion Markers

In [None]:
# Sanity check: print the lengths of the two counter lists.
# The disabled line below would drop the last A counter, mirroring the row that
# was dropped from the A-side frames earlier.
#del(emotionCountsA[-1])
print(len(emotionCountsA),len(emotionCountsB))

In [None]:
# Use Fisher's test (via scp.getFishersPVal) to assess statistical significance
# of the emotion-marker results.
# NOTE(review): the shared list passed in is sharedNew (the shuffled baseline),
# not sharedHash (the natural pairing) - confirm this is intended.

fishScore = scp.getFishersPVal(emotionCountsA,emotionCountsB,sharedNew,markers)
fishScore

In [None]:
# Convert the per-marker results to a single-column frame (one row per marker).
fishScoreDF = pd.DataFrame.from_dict(fishScore, orient='index')
fishScoreDF.columns = ['P-Score Sent Tags']

In [None]:
# Write the p-value table to a LaTeX file.
with open ('P_Score_NRCHash.tex','w') as tf:
    tf.write(fishScoreDF.to_latex())

In [None]:
# Display the p-value table.
fishScoreDF

# Accommodation for Sentiment Markers

In [27]:
# Collect, for every A message and every B message, the list of emotion names
# present in its counter, and keep both lists in one dictionary.
listEmotA = [list(counter) for counter in emotionCountsA]
listEmotB = [list(counter) for counter in emotionCountsB]
EmotDict = {'ASent': listEmotA, 'BSent': listEmotB}

In [None]:
# Two-column frame ('ASent', 'BSent'): the emotion names of each A message next
# to those of the B message in the same row.
EmotDF = pd.DataFrame.from_dict(EmotDict)
EmotDF.head()

In [None]:
# Build one (a_username, b_username) tuple per row of tweetData and store the
# tuples in a single-column frame, ready to be attached to the sentiment frame.
pairs = list(zip(tweetData.a_username, tweetData.b_username))

UNPairs = pd.DataFrame()
UNPairs['Pairs'] = pairs
UNPairs.head()

In [None]:
# Attach the username pair of every row to the sentiment frame.
EmotDF['Pairs'] = UNPairs['Pairs']

# Number of messages exchanged by each username pair.
count = Counter(EmotDF['Pairs'])
print('UN Count Length:',len(count)) # the original author expected 69148 here

# Discard the pairs that exchanged fewer than 10 messages.
rarePairs = [pair for pair, messages in count.items() if messages < 10]
for pair in rarePairs:
    del count[pair]
print('Dropped Count Length: ',len(count)) # the original author expected 7392 here

# Remaining pairs as a plain list.
countList = list(count)
countList[0:5]

In [None]:
# Keep only the rows whose username pair is in countList, i.e. pairs with a
# sufficient number of messages to analyze. reset_index() renumbers the rows
# from 0 and keeps the previous row labels in an 'index' column.
newSentDF = EmotDF[EmotDF['Pairs'].isin(countList)]
newSentDF = newSentDF.reset_index()
newSentDF[0:20]

In [32]:
# Shared emotion markers for the retained rows (A emotions against B emotions);
# the length of the result is displayed.
#markers = ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']
shared = scp.getSharedMarkers(newSentDF['ASent'],newSentDF['BSent'])
len(shared)

174857

In [None]:
# Accommodation scores for the retained username pairs.
# NOTE(review): nrc.fullForm is defined outside this file; the result is used
# below as a mapping whose entries become the rows of allScoresDF - confirm.
allScores = nrc.fullForm(countList,newSentDF,markers,shared) # input: (UNList,Dataframe,markers,sharedlist)
len(allScores)

In [None]:
# Build a frame with one row per key of allScores and write it to a
# '~'-delimited CSV file.
allScoresDF = pd.DataFrame.from_dict(allScores,orient='index')
allScoresDF.to_csv('SCP_Acc_Scores_SentHash.csv',sep = '~')

In [None]:
# Preview the first rows of the score table.
allScoresDF.head()

In [None]:
# Column-wise mean of every score column. After reset_index() the frame has an
# 'index' column (the former column names of allScoresDF) and a 'Score' column,
# with rows numbered from 0.
meanscol = allScoresDF.mean(axis = 0)
meanscolDF = meanscol.to_frame()#.reset_index()
meanscolDF.columns=['Score']
meanscolDF=meanscolDF.reset_index()

In [None]:
# Preview the table of mean scores.
meanscolDF.head()

In [None]:
# Average accommodation per marker: the rows of meanscolDF come in pairs, and
# for each pair the score of the odd row minus the score of the even row is
# stored under the even row's label.
scoreDict = {}
for oddRow in range(1, len(meanscolDF), 2):
    evenRow = oddRow - 1
    label = meanscolDF['index'][evenRow]
    scoreDict[label] = meanscolDF['Score'][oddRow] - meanscolDF['Score'][evenRow]
scoreDict

In [None]:
# Split the paired rows of meanscolDF into their two components, both keyed by
# the even row's label: even rows go to ScoreListBAvg, odd rows to ScoreListBGivA.
ScoreListBAvg = {}
ScoreListBGivA = {}
for row in range(len(meanscolDF)):
    value = meanscolDF['Score'][row]
    if row % 2:
        ScoreListBGivA[meanscolDF['index'][row - 1]] = value
    else:
        ScoreListBAvg[meanscolDF['index'][row]] = value
len(ScoreListBGivA)

In [None]:
# One row per marker holding its accommodation score.
scoreDictDF = pd.DataFrame.from_dict(scoreDict, orient='index')
scoreDictDF.columns = ['Accommodation']
scoreDictDF.head()

In [None]:
# Convert the two component dictionaries to frames (rebinding the same names)
# and add them to the accommodation table as 'BAvg' and 'BGivenA', aligned on
# the marker label.
ScoreListBGivA = pd.DataFrame.from_dict(ScoreListBGivA,orient='index')
ScoreListBAvg = pd.DataFrame.from_dict(ScoreListBAvg,orient='index')
ScoreListBAvg.columns = ['B']
ScoreListBGivA.columns = ['B|A']
scoreDictDF['BAvg'] = ScoreListBAvg['B']
scoreDictDF['BGivenA'] = ScoreListBGivA['B|A']
scoreDictDF.head()

In [None]:
# Copy of the table without the 'Accommodation' column, leaving only the two
# component columns for the grouped bar chart.
scoreToPlotBar = scoreDictDF.copy()
scoreToPlotBar = scoreToPlotBar.drop(['Accommodation'], axis = 1)

In [None]:
# Save the accommodation table as LaTeX.
with open ('SCP_SentHash_Accomodation.tex','w') as tf:
    tf.write(scoreDictDF.to_latex())

In [None]:
# Grouped vertical bar chart of the two component columns, one group per marker.
plot1 = scoreToPlotBar.plot(kind = 'bar')#,figsize = (15,7))#, stacked = True)#,figsize = (20,10))
                    
#plot1.tick_params(labelbottom=False, bottom=False,which='both')
plt.ylabel('SCP Score')
plt.xlabel('Sentiment Marker')
plt.title('SCP - Accommodation')
plt.savefig('Accommodation_AllVals_NRCHashSent')

In [None]:
# Visualize just the accommodation score as a horizontal bar chart, sorted ascending.
plot1 = scoreDictDF['Accommodation'].sort_values().plot(kind = 'barh',color='steelblue')#,figsize = (15,10))
                    
#plot1.tick_params(labelbottom=False, bottom=False,which='both')
plt.xlabel('SCP Score')
plt.ylabel('Emotion Marker')
plt.title('SCP - Accommodation')
plt.savefig('Accommodation_Diff_Score_NRCHashSent')

## Two-tailed paired t-test for sentiment markers

In [None]:
# Paired t-test per marker between the column named after the marker and the
# column named marker + '2' of allScoresDF.
# NOTE(review): the NaN fill of allScoresDF sits in the next cell; if this cell
# runs first, NaN values reach ttest_rel - confirm the intended execution order.
TTestDict = {}
for marker in markers:
    TTestDict[marker] = (stats.ttest_rel(allScoresDF[marker],allScoresDF[marker+'2']))
TTestDict

In [None]:
# Replace every missing score with a tiny positive constant, in place.
allScoresDF.fillna(0.0000001, inplace=True)

In [None]:
# One row per marker; the 'statistic' column is dropped from the table.
TTestDF = pd.DataFrame.from_dict(TTestDict,orient='index')
TTestDF =TTestDF.drop('statistic',axis=1)
TTestDF.head()

In [None]:
# Write the t-test table to a LaTeX file.
with open ('Accom_TTest_Emot.tex','w') as tf:
    tf.write(TTestDF.to_latex())