In [134]:
import pandas as pd
import plotly.express as px
import os
import re
import math
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentimentr.sentimentr import Sentiment as sentimentr
from datetime import datetime as dt
from nltk import tokenize
from fuzzywuzzy import fuzz
from fuzzywuzzy import process


def leaderFuzzySearch(sentence, name):
    top_set_match = process.extract(sentence, name, scorer=fuzz.token_set_ratio)

    for alias in top_set_match:
        print(alias)
        if alias[1] >= 90:
            return True
        else:
            continue
    return False


date_format = '%d-%b-%y'
custom_date_parser = lambda x: dt.strptime(x, date_format)

def vaderSentimentRGenerator(file, corel_param):
    file_path = file
    resolved_df = pd.read_csv('../02_Coreference_Resolution/leader_resolved/{}'.format(file_path))


    resolved_df['article_id'] = resolved_df.reset_index().index
    print(len(resolved_df))


    leader_of_interest = re.search('(.*)_resolved', file_path)[1]
    

    sentences_df  = pd.DataFrame([], columns=['article_id', 'sentence'])
    for row in resolved_df.index.tolist():
        id = resolved_df.loc[row, 'article_id']
        resolved_text = resolved_df.loc[row, 'resolved_text']
        try:
            sentence_list = tokenize.sent_tokenize(resolved_text)
        except:
            pass
        sentence_df = pd.DataFrame({"sentence" : sentence_list})
        sentence_df['article_id'] = id

        sentences_df = pd.concat([sentences_df, sentence_df], ignore_index=True)

    resolved_df = pd.merge(resolved_df, sentences_df, how='left', on='article_id')
    resolved_df['leader_sentence_dummy'] = resolved_df.apply(lambda row: leader_of_interest.upper() if leader_of_interest in row.sentence.lower() else "OTHER", axis=1)

    country_of_interest = resolved_df['country'].unique()[0]
    leaderid_of_interest = re.search('(?=leadid-(.*)_temp.csv)', file_path)[1]
    
    analyzer = SentimentIntensityAnalyzer()
    print(resolved_df['leader_sentence_dummy'].unique())

    resolved_df['vader_comp_score'] = resolved_df['sentence'].apply(lambda row: analyzer.polarity_scores(row)['compound'])
    resolved_df['sentimentr_comp_score'] = resolved_df['sentence'].apply(lambda row: sentimentr.get_polarity_score(row))
    # DIFFERENCE = VADER - SENTIMENTR 
    resolved_df['vader_sentimentr_diff'] = resolved_df.vader_comp_score - resolved_df.sentimentr_comp_score 

    resolved_df['date'] = pd.to_datetime(resolved_df['date'])

    entity_level_df = resolved_df.groupby(by=['article_id', 'leader_sentence_dummy', 'date', 'country'])['vader_comp_score', 'sentimentr_comp_score'].agg(pd.Series.tolist)
    entity_level_df.reset_index(inplace=True)
    entity_level_df['vader_comp_score'] = entity_level_df['vader_comp_score'].apply(lambda x: downweight_zero_mean(x))
    entity_level_df['sentimentr_comp_score'] = entity_level_df['sentimentr_comp_score'].apply(lambda x: downweight_zero_mean(x))
    entity_level_df['vader_sentimentr_diff'] = entity_level_df.vader_comp_score - entity_level_df.sentimentr_comp_score
    

    if corel_param:
        corel_fig = px.scatter(entity_level_df,
            x='vader_comp_score', y='sentimentr_comp_score',
            range_x=[-1,1], range_y=[-1,1], trendline="ols")
        fig_corel.add_shape(type="line", xref="paper", yref="paper",
            x0=0, x1=1,
            y0=0, y1=1,
            line = dict(
                color="DarkOrange",
                width=3))
        results = px.get_trendline_results(corel_fig)
        results_summary = results.px_fit_results.iloc[0].summary()
        results_as_html = results_summary.tables[1].as_html()
        coef_df = pd.read_html(results_as_html, header=0, index_col=0)[0]
        coef_df = coef_df.rename(index={'const' : "constant"})
        coef_df['leader'] = leader_of_interest.upper()
        coef_df['country'] = country_of_interest.upper()
        coef_df['leadid'] = leaderid_of_interest
        return coef_df 
    else:
        pass


In [88]:
def downweight_zero_mean(x):
    dropped = [num for num in x if pd.isnull(num)==False]
    total_sum = sum(dropped)
    zeros = len([num for num in dropped if num==0])
    non_zeros = len([num for num in dropped if num!=0])
    if zeros == 0:
        downweighted_mean = total_sum /(non_zeros)
        return downweighted_mean
    else:
        try:
            downweighted_mean = total_sum /(non_zeros + math.sqrt(math.log(zeros))) 
            return downweighted_mean
        except:
            downweighted_mean = total_sum / len(dropped)
            return downweighted_mean


In [131]:
results = px.get_trendline_results(fig)
print(results)
results_summary = results.px_fit_results.iloc[0].summary()
results_as_html = results_summary.tables[1].as_html()
coef_df = pd.read_html(results_as_html, header=0, index_col=0)[0]
coef_df = coef_df.rename(index={'const' : "constant"})
coef_df['leader', 'country'] = leader_of_interest.upper()
coef_df['country'] = 
coef_df

SyntaxError: invalid syntax (<ipython-input-131-aebc493fb831>, line 8)

In [None]:
fig = px.scatter(resolved_df, x="date", y="vader_comp_score", color="leader_sentence_dummy", trendline='lowess', trendline_options=dict(frac=1), hover_data=['sentence'])
fig.show()




In [None]:
fig = px.scatter(entity_level_df[entity_level_df.leader_sentence_dummy=='GILLARD'], x="date", y="vader_sentimentr_diff", color="leader_sentence_dummy")

fig.show()





In [None]:
fig = px.scatter(resolved_df, x='vader_comp_score', y='sentimentr_comp_score', range_x=[-1,1], range_y=[-1,1], trendline="ols")
fig.add_shape(type="line", xref="paper", yref="paper",
    x0=0, x1=1,
    y0=0, y1=1,
    line = dict(
        color="DarkOrange",
        width=3))
fig.show()

In [None]:
fig_corel = px.scatter(entity_level_df, x='vader_comp_score', y='sentimentr_comp_score', range_x=[-1,1], range_y=[-1,1], trendline="ols")
fig_corel.add_shape(type="line", xref="paper", yref="paper",
    x0=0, x1=1,
    y0=0, y1=1,
    line = dict(
        color="DarkOrange",
        width=3))
fig_corel.show()

In [136]:
from os import listdir
from os.path import isfile, join

folder_path = '../02_Coreference_Resolution/leader_resolved'
coef_together_df = pd.DataFrame([])
files = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]
for f in files:
    leader_coef_df = vaderSentimentRGenerator(f, True)
    coef_together_df = pd.concat([coef_together_df, leader_coef_df])


233
['OTHER' 'AZNAR']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



17
['OTHER' 'CAMPBELL']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



1131
['OTHER' 'CHIRAC']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



177
['OTHER' 'CHRETIEN']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



11
['OTHER' 'FAYMANN']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



80
['OTHER' 'GILLARD']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



145
['OTHER' 'GONZALEZ']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



28
['OTHER' 'GOWDA']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



27
['OTHER' 'GUJRAL']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



7
['OTHER' 'GUSENBAUER']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.


kurtosistest only valid for n>=20 ... continuing anyway, n=14



308
['OTHER' 'HARPER']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



456
['HOLLANDE' 'OTHER']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



815
['OTHER' 'HOWARD']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



73
['KEATING' 'OTHER']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



6
['KERN' 'OTHER']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.


kurtosistest only valid for n>=20 ... continuing anyway, n=12



14
['OTHER' 'KLIMA']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



24
['KURZ' 'OTHER']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



562
['OTHER' 'MACRON']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



338
['OTHER' 'MARTIN']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



4
['MITTERAND' 'OTHER']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.


kurtosistest only valid for n>=20 ... continuing anyway, n=8


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



2
['MITTERLEHNER' 'OTHER']



omni_normtest is not valid with less than 8 observations; 4 samples were given.



497
['MODI' 'OTHER']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



26
['OTHER' 'MULRONEY']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



79
['OTHER' 'OBUCHI']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



185
['OTHER' 'RAJOY']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



113
['OTHER' 'RAO']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



98
['OTHER' 'RUDD']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



18
['OTHER' 'RUDD']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



10
['OTHER' 'SANCHEZ']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



868
['OTHER' 'SARKOZY']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



43
['OTHER' 'SCHUSSEL']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



445
['OTHER' 'SINGH']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



151
['OTHER' 'TRUDEAU']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



75
['OTHER' 'TURNBULL']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



5
['OTHER' 'VAJPAYEE']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.


kurtosistest only valid for n>=20 ... continuing anyway, n=10



173
['OTHER' 'VAJPAYEE']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



13
['OTHER' 'VRANITZKY']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



212
['OTHER' 'ZAPATERO']



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



In [165]:
coef_together_df = coef_together_df.loc['x1', :]
coef_together_df['leader_order'] = coef_together_df.groupby(by=['country']).rank(method='dense', )
coef_together_df[coef_together_df.country == "SPAIN"]
print(coef_together_df.head(5)['P>|t|'])

print(list(zip(coef_together_df.head(10).leader.tolist(), coef_together_df.head(10)['P>|t|'].tolist())))


x1    0.000
x1    0.000
x1    0.000
x1    0.000
x1    0.009
Name: P>|t|, dtype: float64
[('AZNAR', 0.0), ('CAMPBELL', 0.0), ('CHIRAC', 0.0), ('CHRETIEN', 0.0), ('FAYMANN', 0.009), ('GILLARD', 0.0), ('GONZALEZ', 0.0), ('GOWDA', 0.0), ('GUJRAL', 0.0), ('GUSENBAUER', 0.004)]


In [169]:
import plotly.graph_objects as go

corel_heatmap = go.Figure(data=go.Heatmap(
    z=coef_together_df.coef,
    x=coef_together_df.country,
    y=coef_together_df.leader_order,
    customdata=list(zip(coef_together_df.leader.tolist(), coef_together_df['P>|t|'].tolist())),
    colorscale='Viridis',
    hoverongaps=False,
))

corel_heatmap.update_traces(
        hovertemplate="<br>".join([
            "Leader: %{customdata[0]}",
            "P-value: %{customdata[1]} <extra></extra>"
        ])
)

corel_heatmap.show()