In [None]:
import pandas as pd
import plotly.express as px
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentimentr.sentimentr import Sentiment as sentimentr
from datetime import datetime as dt
from nltk import tokenize
from fuzzywuzzy import fuzz
from fuzzywuzzy import process


def leaderFuzzySearch(sentence, name, threshold=90):
    """Report whether `sentence` fuzzily matches any alias of a leader.

    The sentence is scored against every entry of `name` using fuzzywuzzy's
    ``token_set_ratio`` scorer, and the sentence counts as a match as soon as
    one of the returned candidates scores at least `threshold`.

    Args:
        sentence: Text to search in.
        name: Choices handed to ``process.extract`` -- presumably a list of
            alias strings for the leader; verify against callers.
        threshold: Minimum score that counts as a match. Defaults to 90, the
            cut-off that used to be hard-coded here.

    Returns:
        bool: True if at least one returned candidate reaches `threshold`,
        otherwise False (including when no candidates are returned).
    """
    candidates = process.extract(sentence, name, scorer=fuzz.token_set_ratio)

    # Each candidate carries its score at position 1; indexing (rather than
    # tuple-unpacking) keeps this independent of how many fields a candidate has.
    return any(candidate[1] >= threshold for candidate in candidates)

# --- Configuration -----------------------------------------------------------
leader_of_interest = "gillard"
date_format = '%d-%b-%y'
# Parses one date string written in `date_format`.
# NOTE(review): not referenced by any cell visible here -- confirm it is still needed.
custom_date_parser = lambda x: dt.strptime(x, date_format)

# --- Load the coreference-resolved articles ----------------------------------
print(os.getcwd())
resolved_df = pd.read_csv('../02_Coreference_Resolution/leader_resolved/gillard_resolved_ccode-900_leadid-A30-218_temp.csv')

# Positional id (0..n-1) per article, used to join sentences back to articles.
resolved_df['article_id'] = range(len(resolved_df))
print(len(resolved_df))

# --- Split every article into sentences --------------------------------------
# Collect all (article_id, sentence) pairs first and build the frame once,
# instead of growing it with pd.concat on every article.
# Non-string text (e.g. a missing value read as NaN) contributes no sentences
# rather than crashing the tokenizer.
sentence_records = [
    (article_id, sentence)
    for article_id, article_text in zip(resolved_df['article_id'], resolved_df['resolved_text'])
    if isinstance(article_text, str)
    for sentence in tokenize.sent_tokenize(article_text)
]
sentences_df = pd.DataFrame(sentence_records, columns=['article_id', 'sentence'])
# Keep the join key's dtype aligned with the article frame even when no
# sentences were produced at all.
sentences_df['article_id'] = sentences_df['article_id'].astype(resolved_df['article_id'].dtype)

# One row per sentence; the left join keeps articles that yielded no sentence
# (their `sentence` is NaN).
resolved_df = pd.merge(resolved_df, sentences_df, how='left', on='article_id')

# --- Label sentences that mention the leader ---------------------------------
# Case-insensitive substring test; a missing sentence is labelled "OTHER".
mentions_leader = (
    resolved_df['sentence']
    .str.lower()
    .str.contains(leader_of_interest, regex=False, na=False)
)
resolved_df['leader_sentence_dummy'] = mentions_leader.map(
    {True: leader_of_interest.upper(), False: "OTHER"}
)

# --- Sentiment scores ---------------------------------------------------------
analyzer = SentimentIntensityAnalyzer()

print(resolved_df['leader_sentence_dummy'].unique())

# Only real sentences are scored; rows without a sentence get NaN scores
# through index alignment on assignment.
scorable_sentences = resolved_df['sentence'].dropna()
resolved_df['vader_comp_score'] = scorable_sentences.apply(
    lambda text: analyzer.polarity_scores(text)['compound']
)
resolved_df['sentimentr_comp_score'] = scorable_sentences.apply(sentimentr.get_polarity_score)
# DIFFERENCE = VADER - SENTIMENTR
resolved_df['vader_sentimentr_diff'] = (
    resolved_df['vader_comp_score'] - resolved_df['sentimentr_comp_score']
)


print(resolved_df.tail(5))


In [6]:
# Replace the 'date' column with its datetime-parsed version so the cells below
# work with datetime values.
parsed_dates = pd.to_datetime(resolved_df['date'])
resolved_df['date'] = parsed_dates

In [None]:

# Sentence-level scatter of the VADER-minus-sentimentr difference over time,
# coloured by sentence label, with a LOWESS trendline.
lowess_settings = dict(frac=1)
fig = px.scatter(
    resolved_df,
    x="date",
    y="vader_sentimentr_diff",
    color="leader_sentence_dummy",
    trendline='lowess',
    trendline_options=lowess_settings,
)

fig.show()





In [29]:
# Collapse the sentence-level rows to one row per (article, sentence label, date)
# by averaging the numeric columns.
# numeric_only=True is spelled out because the frame also holds text columns
# (e.g. 'sentence'); recent pandas raises on those instead of silently
# dropping them from the mean.
entity_level_df = (
    resolved_df
    .groupby(by=['article_id', 'leader_sentence_dummy', 'date'])
    .mean(numeric_only=True)
    .reset_index()
)
entity_level_df

Unnamed: 0,article_id,leader_sentence_dummy,date,ccode,coreference_resolved_ind,number_spans_replaced,gillard,vader_comp_score,sentimentr_comp_score,vader_sentimentr_diff
0,0,GILLARD,2010-06-19,900,True,2,True,0.263350,0.032450,0.230900
1,0,OTHER,2010-06-19,900,True,2,True,-0.089315,-0.032469,-0.056846
2,1,GILLARD,2010-06-19,900,True,26,True,0.054750,-0.024200,0.078950
3,1,OTHER,2010-06-19,900,True,26,True,0.010243,-0.023793,0.034037
4,2,GILLARD,2010-07-10,900,True,24,True,0.087387,-0.043153,0.130540
...,...,...,...,...,...,...,...,...,...,...
155,77,OTHER,2013-08-31,900,True,4,True,0.136919,0.014772,0.122147
156,78,GILLARD,2013-08-31,900,True,4,True,0.067425,-0.003225,0.070650
157,78,OTHER,2013-08-31,900,True,4,True,0.088657,0.001198,0.087460
158,79,GILLARD,2013-10-19,900,True,3,True,0.065325,0.065875,-0.000550


In [33]:

# Article-level scatter of the VADER-minus-sentimentr difference over time,
# restricted to the rows labelled 'GILLARD'.
gillard_rows = entity_level_df.loc[entity_level_df['leader_sentence_dummy'] == 'GILLARD']
fig = px.scatter(
    gillard_rows,
    x="date",
    y="vader_sentimentr_diff",
    color="leader_sentence_dummy",
)

fig.show()



