In [79]:
import pandas as pd
import plotly.express as px
import os
import math
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentimentr.sentimentr import Sentiment as sentimentr
from datetime import datetime as dt
from nltk import tokenize
from fuzzywuzzy import fuzz
from fuzzywuzzy import process


def leaderFuzzySearch(sentence, name):
    top_set_match = process.extract(sentence, name, scorer=fuzz.token_set_ratio)

    for alias in top_set_match:
        print(alias)
        if alias[1] >= 90:
            return True
        else:
            continue
    return False

leader_of_interest = "gillard"
date_format = '%d-%b-%y'
custom_date_parser = lambda x: dt.strptime(x, date_format)

print(os.getcwd())
resolved_df = pd.read_csv('../02_Coreference_Resolution/leader_resolved/gillard_resolved_ccode-900_leadid-A30-218_temp.csv')


resolved_df['article_id'] = resolved_df.reset_index().index
print(len(resolved_df))



sentences_df  = pd.DataFrame([], columns=['article_id', 'sentence'])
for row in resolved_df.index.tolist():
    id = resolved_df.loc[row, 'article_id']
    resolved_text = resolved_df.loc[row, 'resolved_text']
    sentence_list = tokenize.sent_tokenize(resolved_text)
    sentence_df = pd.DataFrame({"sentence" : sentence_list})
    sentence_df['article_id'] = id

    sentences_df = pd.concat([sentences_df, sentence_df], ignore_index=True)

resolved_df = pd.merge(resolved_df, sentences_df, how='left', on='article_id')
resolved_df['leader_sentence_dummy'] = resolved_df.apply(lambda row: "GILLARD" if leader_of_interest in row.sentence.lower() else "OTHER", axis=1)


analyzer = SentimentIntensityAnalyzer()

print(resolved_df['leader_sentence_dummy'].unique())
# print(resolved_df.tail(5))

resolved_df['vader_comp_score'] = resolved_df['sentence'].apply(lambda row: analyzer.polarity_scores(row)['compound'])
resolved_df['sentimentr_comp_score'] = resolved_df['sentence'].apply(lambda row: sentimentr.get_polarity_score(row))
# DIFFERENCE = VADER - SENTIMENTR 
resolved_df['vader_sentimentr_diff'] = resolved_df.vader_comp_score - resolved_df.sentimentr_comp_score 

print(resolved_df.tail(5))


c:\Users\Joshualevy\Documents\leaderSentimentEarnings\03_Sentiment_Analysis
80
['OTHER' 'GILLARD']
            date                                               link  \
3292  2013-10-19  https://www-proquest-com.proxy.uchicago.edu/ab...   
3293  2013-10-19  https://www-proquest-com.proxy.uchicago.edu/ab...   
3294  2013-10-19  https://www-proquest-com.proxy.uchicago.edu/ab...   
3295  2013-10-19  https://www-proquest-com.proxy.uchicago.edu/ab...   
3296  2013-10-19  https://www-proquest-com.proxy.uchicago.edu/ab...   

                                                   text  \
3292  The Labor Party licks its wounds and chooses a...   
3293  The Labor Party licks its wounds and chooses a...   
3294  The Labor Party licks its wounds and chooses a...   
3295  The Labor Party licks its wounds and chooses a...   
3296  The Labor Party licks its wounds and chooses a...   

                                          resolved_text  ccode    country  \
3292  The Labor Party licks its wounds and

In [6]:
resolved_df['date'] = pd.to_datetime(resolved_df['date'])

In [132]:
def downweight_zero_mean(x):
    print(x)
    dropped = [num for num in x if pd.isnull(num)==False]
    total_sum = sum(dropped)
    zeros = len([num for num in dropped if num==0])
    non_zeros = len([num for num in dropped if num!=0])
    print(zeros)
    if zeros == 0:
        downweighted_mean = total_sum /(non_zeros)
        return downweighted_mean
    else:
        try:
            downweighted_mean = total_sum /(non_zeros + math.sqrt(math.log(zeros))) 
            return downweighted_mean
        except:
            downweighted_mean = total_sum / len(dropped)
            return downweighted_mean

# print(downweight_zero_mean([1,2,0,0,0, pd.NA, 3, 49, 0, 0]))

In [None]:

fig = px.scatter(resolved_df, x="date", y="vader_comp_score", color="leader_sentence_dummy", trendline='lowess', trendline_options=dict(frac=1), hover_data=['sentence'])

fig.show()





In [133]:
entity_level_df = resolved_df.groupby(by=['article_id', 'leader_sentence_dummy', 'date'])['vader_comp_score', 'sentimentr_comp_score'].agg(pd.Series.tolist)
# entity_level_df.reset_index(inplace=True)
entity_level_df.reset_index(inplace=True)
entity_level_df['vader_comp_score'] = entity_level_df['vader_comp_score'].apply(lambda x: downweight_zero_mean(x))
entity_level_df['sentimentr_comp_score'] = entity_level_df['sentimentr_comp_score'].apply(lambda x: downweight_zero_mean(x))
entity_level_df['vader_sentimentr_diff'] = entity_level_df.vader_comp_score - entity_level_df.sentimentr_comp_score

entity_level_df
# print(entity_level_df['vader_comp_score'][0])

[0.0, 0.5267]
1
[0.0516, 0.0, -0.6858, 0.0, 0.2023, -0.0516, 0.5719, 0.0, 0.0, 0.5719, 0.3182, -0.8126, -0.8442, 0.0, -0.2023, 0.0, -0.836, 0.3612, 0.296, -0.296, -0.6705, -0.8074, 0.0, 0.0, -0.9022, 0.8658, 0.4404, 0.0, 0.4939, 0.0772, -0.6486, 0.6808, 0.0, -0.3612, -0.7717, -0.296, -0.3612, 0.5719, 0.128, 0.4576, 0.1531, -0.7717, 0.0, -0.5423, 0.0, -0.4767, 0.4215, -0.6124]
12
[0.0, 0.0, 0.1027, 0.0, -0.8316, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1531, 0.0, 0.0772, -0.4019, 0.0, -0.4939, 0.6597, 0.5106, 0.6705, 0.6486]
10
[0.7778, 0.0, -0.4588, 0.0, -0.6369, 0.128, 0.0, 0.3818, 0.802, 0.0258, 0.5423, 0.3561, -0.4767, 0.4767, -0.3612, 0.6059, -0.296, -0.25, -0.4588, -0.8271, 0.7783, 0.6249, 0.0258, 0.0, -0.7351, 0.0, 0.0, -0.296, -0.4215, 0.0]
7
[0.765, 0.5267, 0.6597, 0.4767, -0.0387, 0.0, 0.4019, 0.25, 0.0, 0.4019, -0.25, 0.0, -0.6216, -0.802, -0.4588]
3
[0.0, 0.6124, -0.296, 0.0, -0.0772, -0.2263, -0.0422, 0.3612, 0.0, 0.296, 0.128, -0.4404, 0.0, -0.296]
4
[0.0, 0.4019]
1
[0.0, 0.2732, 0.0, -


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,article_id,leader_sentence_dummy,date,vader_comp_score,sentimentr_comp_score,vader_sentimentr_diff
0,0,GILLARD,2010-06-19,0.526700,0.064900,0.461800
1,0,OTHER,2010-06-19,-0.114090,-0.035961,-0.078129
2,1,GILLARD,2010-06-19,0.095073,-0.033623,0.128696
3,1,OTHER,2010-06-19,0.012597,-0.029260,0.041857
4,2,GILLARD,2010-07-10,0.100459,-0.046795,0.147254
...,...,...,...,...,...,...
155,77,OTHER,2013-08-31,0.160264,0.018579,0.141685
156,78,GILLARD,2013-08-31,0.067425,-0.003225,0.070650
157,78,OTHER,2013-08-31,0.134596,0.001756,0.132840
158,79,GILLARD,2013-10-19,0.087100,0.065875,0.021225


In [134]:

fig = px.scatter(entity_level_df[entity_level_df.leader_sentence_dummy=='GILLARD'], x="date", y="vader_sentimentr_diff", color="leader_sentence_dummy")

fig.show()





In [50]:
fig = px.scatter(resolved_df, x='vader_comp_score', y='sentimentr_comp_score', range_x=[-1,1], range_y=[-1,1], trendline="ols")
fig.add_shape(type="line", xref="paper", yref="paper",
    x0=0, x1=1,
    y0=0, y1=1,
    line = dict(
        color="DarkOrange",
        width=3))
fig.show()

In [135]:
fig = px.scatter(entity_level_df, x='vader_comp_score', y='sentimentr_comp_score', range_x=[-1,1], range_y=[-1,1], trendline="ols")
fig.add_shape(type="line", xref="paper", yref="paper",
    x0=0, x1=1,
    y0=0, y1=1,
    line = dict(
        color="DarkOrange",
        width=3))
fig.show()