In [272]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import os
import re
import math
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentimentr.sentimentr import Sentiment as sentimentr
from datetime import datetime as dt
from nltk import tokenize
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from os import listdir
from os.path import isfile, join


def leaderFuzzySearch(sentence, name, threshold=90):
    """Report whether ``sentence`` fuzzy-matches any candidate in ``name``.

    The candidates are ranked with ``process.extract`` using
    ``fuzz.token_set_ratio`` as the scorer; the second element of every
    returned match is its score.

    Parameters
    ----------
    sentence : str
        Query text passed to ``process.extract``.
    name : collection of str
        Candidate strings to match against (presumably the leader's aliases
        -- confirm against the caller).
    threshold : int, optional
        Minimum score that counts as a match. Defaults to 90, the cut-off
        this function has always used.

    Returns
    -------
    bool
        True if at least one returned match scores ``threshold`` or higher,
        otherwise False (including when no candidates are returned).
    """
    top_set_match = process.extract(sentence, name, scorer=fuzz.token_set_ratio)
    return any(match[1] >= threshold for match in top_set_match)


# strptime layout: day-of-month, abbreviated month name, two-digit year.
date_format = '%d-%b-%y'


def custom_date_parser(x):
    """Parse a string laid out as ``date_format`` into a ``datetime``."""
    return dt.strptime(x, date_format)

def vaderSentimentRGenerator(file, corel_param):
    """Score one leader's articles with VADER and sentimentr and compare them.

    Reads ``../02_Coreference_Resolution/leader_resolved/<file>``, splits each
    row's ``resolved_text`` into sentences, scores every sentence with both
    analyzers, aggregates the sentence scores per
    (article, leader-vs-OTHER, date, country) with ``downweight_zero_mean``
    and fits an OLS trendline of the sentimentr score on the VADER score.

    ``downweight_zero_mean`` is defined in a separate cell and must exist in
    the kernel before this function is called.

    Parameters
    ----------
    file : str
        File name inside the ``leader_resolved`` folder. The leader name is
        the text before ``_resolved`` and the leader id is the text between
        ``leadid-`` and ``_temp.csv``. The CSV must provide the columns
        ``resolved_text``, ``date`` and ``country``.
    corel_param : Any
        Not used by the function; kept so existing calls keep working.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``coef_df``: the coefficient table of the OLS summary (intercept row
        renamed from ``const`` to ``constant``) with ``leader``, ``country``,
        ``leadid`` and ``rsquared`` columns added.
        ``entity_level_df``: one row per
        (article_id, leader_sentence_dummy, date, country) holding the
        aggregated ``vader_comp_score``, ``sentimentr_comp_score`` and their
        difference ``vader_sentimentr_diff``.

    Raises
    ------
    TypeError
        If ``file`` does not match the naming patterns above (the regex
        search returns ``None``).
    """
    from io import StringIO  # local import: only needed to feed read_html

    # Parse the file name first so a badly named file fails before any scoring.
    leader_of_interest = re.search('(.*)_resolved', file)[1]
    leaderid_of_interest = re.search('(?=leadid-(.*)_temp.csv)', file)[1]
    leader_label = leader_of_interest.upper()
    # Sentences are lower-cased before matching, so the needle must be too.
    leader_key = leader_of_interest.lower()

    resolved_df = pd.read_csv('../02_Coreference_Resolution/leader_resolved/{}'.format(file))
    resolved_df['article_id'] = range(len(resolved_df))

    # Build every sentence row first and create the frame once, instead of
    # concatenating inside the loop.
    sentence_records = []
    for article_id, resolved_text in zip(resolved_df['article_id'], resolved_df['resolved_text']):
        if not isinstance(resolved_text, str):
            # Missing / non-text articles contribute no sentences. Real
            # tokenizer failures (e.g. missing NLTK data) now surface instead
            # of being silently swallowed.
            continue
        sentence_records.extend(
            {'article_id': article_id, 'sentence': sentence}
            for sentence in tokenize.sent_tokenize(resolved_text)
        )
    sentences_df = pd.DataFrame(sentence_records, columns=['article_id', 'sentence'])

    # Left merge keeps articles without sentences; their sentence becomes ''.
    resolved_df = pd.merge(resolved_df, sentences_df, how='left', on='article_id').fillna('')
    resolved_df['leader_sentence_dummy'] = resolved_df['sentence'].apply(
        lambda sentence: leader_label if leader_key in sentence.lower() else "OTHER")

    country_of_interest = resolved_df['country'].unique()[0]

    analyzer = SentimentIntensityAnalyzer()
    resolved_df['vader_comp_score'] = resolved_df['sentence'].apply(
        lambda sentence: analyzer.polarity_scores(sentence)['compound'])
    resolved_df['sentimentr_comp_score'] = resolved_df['sentence'].apply(
        lambda sentence: sentimentr.get_polarity_score(sentence))

    resolved_df['date'] = pd.to_datetime(resolved_df['date'])

    # Column selection must be a list: a bare tuple of names is rejected by
    # pandas >= 2.0.
    score_columns = ['vader_comp_score', 'sentimentr_comp_score']
    entity_level_df = (
        resolved_df
        .groupby(by=['article_id', 'leader_sentence_dummy', 'date', 'country'])[score_columns]
        .agg(list)
        .reset_index()
    )
    for score_column in score_columns:
        entity_level_df[score_column] = entity_level_df[score_column].apply(downweight_zero_mean)
    # DIFFERENCE = VADER - SENTIMENTR
    entity_level_df['vader_sentimentr_diff'] = (
        entity_level_df['vader_comp_score'] - entity_level_df['sentimentr_comp_score'])

    corel_fig = px.scatter(entity_level_df,
        x='vader_comp_score', y='sentimentr_comp_score',
        range_x=[-1, 1], range_y=[-1, 1], trendline="ols")
    # Diagonal reference line across the plotting area.
    corel_fig.add_shape(type="line", xref="paper", yref="paper",
        x0=0, x1=1,
        y0=0, y1=1,
        line=dict(
            color="DarkOrange",
            width=3))

    fit_result = px.get_trendline_results(corel_fig).px_fit_results.iloc[0]
    # The coefficient table is only exposed through the summary, so it is
    # round-tripped through HTML; StringIO avoids read_html's deprecated
    # literal-string input.
    coef_table_html = fit_result.summary().tables[1].as_html()
    coef_df = pd.read_html(StringIO(coef_table_html), header=0, index_col=0)[0]
    coef_df = coef_df.rename(index={'const': "constant"})
    coef_df['leader'] = leader_label
    coef_df['country'] = country_of_interest.upper()
    coef_df['leadid'] = leaderid_of_interest
    coef_df['rsquared'] = fit_result.rsquared
    return coef_df, entity_level_df



In [88]:
def downweight_zero_mean(x):
    """Average ``x`` while letting exact zeros count for less than one value each.

    Null entries are dropped first. With ``k`` zeros and ``m`` non-zero values
    summing to ``s`` the result is ``s / (m + sqrt(ln(k)))``; for ``k == 0``
    (and ``k == 1``, since ``ln(1) == 0``) that is simply ``s / m``.

    Parameters
    ----------
    x : iterable of numbers
        Scores to average; may contain null values.

    Returns
    -------
    float
        The down-weighted mean. ``nan`` when nothing is left after dropping
        nulls, and the plain mean (``0.0``) when the only remaining value is
        a single zero, where the weighted denominator would be zero.
    """
    scores = [num for num in x if not pd.isnull(num)]
    if not scores:
        # Nothing to average; previously this raised ZeroDivisionError.
        return float('nan')

    total_sum = sum(scores)
    zeros = sum(1 for num in scores if num == 0)
    non_zeros = len(scores) - zeros

    zero_weight = math.sqrt(math.log(zeros)) if zeros else 0.0
    denominator = non_zeros + zero_weight
    if denominator == 0:
        # Exactly one zero and no other value: fall back to the plain mean.
        return total_sum / len(scores)
    return total_sum / denominator


In [None]:
folder_path = '../02_Coreference_Resolution/leader_resolved'

# listdir returns entries in arbitrary order; sorting keeps the combined
# frames reproducible from run to run.
files = sorted(f for f in listdir(folder_path) if isfile(join(folder_path, f)))
if not files:
    raise FileNotFoundError("No input files found in {}".format(folder_path))

# Collect the per-leader frames and concatenate once instead of growing the
# combined frames inside the loop.
leader_coef_frames = []
leader_entity_frames = []
for f in files:
    leader_coef_df, entity_level_df = vaderSentimentRGenerator(f, False)
    leader_coef_frames.append(leader_coef_df)
    leader_entity_frames.append(entity_level_df)

coef_together_df = pd.concat(leader_coef_frames)
entity_together_df = pd.concat(leader_entity_frames)

# Drop statistics on the intercept term: keep only the rows labelled 'x1'.
# The list indexer always yields a DataFrame, even when a single file was
# processed (a scalar label would collapse that case to a Series).
coef_together_df = coef_together_df.loc[['x1']].reset_index()
# Within each country, rank leaders by coefficient (1 = largest).
coef_together_df['leader_order'] = coef_together_df.groupby(by=['country'])['coef'].rank(method='dense', ascending=False)

In [None]:
# LEADER LEVEL SCATTER PLOT
# Article-level VADER vs sentimentr scores for one leader label, with an OLS
# trendline per country colour.
leadername = "KOIZUMI"

leader_scatter = px.scatter(
    entity_together_df
        .loc[entity_together_df["leader_sentence_dummy"] == leadername]
        .sort_values("country"),
    x="vader_comp_score",
    y="sentimentr_comp_score",
    color="country",
    range_x=[-1, 1],
    range_y=[-1, 1],
    trendline="ols",
)
leader_scatter.update_layout(title=leadername)
leader_scatter.show()

In [294]:
# HEATMAP OF R^2
# One cell per (country, leader_order) pair, coloured by the fit's R^2.
corel_heatmap = go.Figure(
    data=go.Heatmap(
        x=coef_together_df["country"],
        y=coef_together_df["leader_order"],
        z=coef_together_df["rsquared"],
        text=coef_together_df["leader"],
        # (leader, p-value) pairs consumed by the hover template below.
        customdata=list(
            zip(
                coef_together_df["leader"].tolist(),
                coef_together_df["P>|t|"].tolist(),
            )
        ),
        colorscale='Viridis',
        hoverongaps=False,
    )
)

# Custom hover text; <extra></extra> is part of the template string.
corel_heatmap.update_traces(
    hovertemplate="<br>".join(
        (
            "Leader: %{customdata[0]}",
            "R^2: %{z}",
            "P-value: %{customdata[1]} <extra></extra>",
        )
    )
)

corel_heatmap.update_layout(
    title="R^2 for each leader's <i>sentimentR=a+b*VADER</i>",
    xaxis_title="Country",
    yaxis_title="Leader",
)

corel_heatmap.show()

In [295]:
# SENTIMENT VS VADER COREL FACET PLOT
# One facet per leader label (rows labelled "OTHER" are excluded), coloured by
# country, each with its own OLS trendline.
corel_facet = px.scatter(
    entity_together_df
        .loc[entity_together_df["leader_sentence_dummy"].ne("OTHER")]
        .sort_values("country"),
    x="vader_comp_score",
    y="sentimentr_comp_score",
    color="country",
    facet_col="leader_sentence_dummy",
    facet_col_wrap=4,
    facet_row_spacing=0.015,
    range_x=[-1, 1],
    range_y=[-1, 1],
    trendline="ols",
    width=1400,
    height=3000,
)

# Black y = x reference line drawn in every non-empty facet.
corel_facet.add_trace(
    go.Scatter(
        x=[-1, 1],
        y=[-1, 1],
        mode='lines',
        line={"color": "black"},
        showlegend=False,
    ),
    row="all",
    col="all",
    exclude_empty_subplots=True,
)

corel_facet.update_layout(
    margin={"l": 20, "r": 20, "t": 20, "b": 20},
    xaxis_title="VADER Score",
    yaxis_title="SentimentR Score",
)
# Facet titles arrive as "<column>=<value>"; keep only the part after the last "=".
corel_facet.for_each_annotation(
    lambda annotation: annotation.update(text=annotation.text.split("=")[-1])
)
corel_facet.show()

In [None]:
import chart_studio
import chart_studio.plotly as py

# Credentials are read from the environment so the API key never lives in the
# notebook. NOTE(review): a key was previously hard-coded in this cell and
# should be treated as leaked -- rotate it in Chart Studio.
chart_studio_api_key = os.environ.get("CHART_STUDIO_API_KEY")
if not chart_studio_api_key:
    raise RuntimeError(
        "Set the CHART_STUDIO_API_KEY environment variable before uploading."
    )

chart_studio.tools.set_credentials_file(
    username=os.environ.get("CHART_STUDIO_USERNAME", "jyl63"),
    api_key=chart_studio_api_key,
)

# Upload the faceted correlation figure to Chart Studio.
py.plot(corel_facet, filename="VADER vs SentimentR correlation faceted")