# Creating and Visualizing Polarity Independence Scores 

In [None]:
# !pip install pysentimiento
# !pip install transformers

from tqdm import tqdm
import pandas as pd
import re
import pysentimiento
import pickle
from transformers import pipeline

In [None]:

# Read the CSV that holds the preprocessed article texts
df = pd.read_csv(filepath_or_buffer="Data/dataset_token_ready.csv")

In [None]:

# Pattern covering the different ways a text can refer to president Ortega
regex = r'([Oo]rtega)|([Nn]uestro [Pp]residente)|[Pp]residente de [Nn]icaragua|([Cc]omandante [Dd]aniel)|[Dd]aniel y [Rr]osario'

# Keep only the articles whose text mentions Ortega (missing texts count as no match)
mentions_in_text = df['text'].str.contains(regex, na=False)
df = df[mentions_in_text].reset_index(drop=True)

# Cut every article into sentences at '.', '!' or '?' ...
df["sentences"] = df["text"].map(lambda article: re.split("[.!?]", article))

# ... and give each sentence its own row
df = df.explode("sentences", ignore_index=True)

# Keep only the sentences that themselves mention Ortega
mentions_in_sentence = df["sentences"].str.contains(regex)
df = df[mentions_in_sentence].reset_index(drop=True)

# if sentence longer than 200 words, keep only 90 word window around first mention of Ortega
def trim_sentence(sentence, regex, max_words=200, context_words=45):
    """Shorten an overlong sentence to a word window around the first regex match.

    Parameters:
        sentence: text to trim.
        regex: pattern locating the mention the window is centred on.
        max_words: sentences with at most this many whitespace-separated
            words are returned unchanged.
        context_words: number of words kept before the matched word; the
            window ends `context_words` words after its start (the matched
            word included), so it holds at most 2 * context_words words.

    Returns:
        The original sentence when it is short enough or contains no match,
        otherwise the words of the window joined by single spaces.
    """
    words = sentence.split()
    if len(words) <= max_words:
        return sentence

    match = re.search(regex, sentence)
    if match is None:
        return sentence

    # Index of the word that contains the start of the match
    prefix = sentence[:match.start()]
    match_word_index = len(prefix.split())
    # When the match starts in the middle of a token (e.g. "(Ortega"), the
    # last token of the prefix is part of the matched word itself and must
    # not be counted as a preceding word.
    if prefix and not prefix[-1].isspace():
        match_word_index -= 1

    window_start = max(0, match_word_index - context_words)
    window_end = min(len(words), match_word_index + context_words)
    return ' '.join(words[window_start:window_end])

# Shorten overlong sentences to a window around the first Ortega mention
df["sentences"] = df["sentences"].map(lambda sentence: trim_sentence(sentence, regex))

# The full article text is no longer needed once the sentences are extracted
df = df.drop(columns="text")

# Plain Python list of sentences to feed to the sentiment model
sentences = df["sentences"].to_list()

In [None]:

# Spanish sentiment analyzer from pysentimiento
analyzer = pysentimiento.create_analyzer(lang="es", task="sentiment")


def analyze_sentences(sentences):
    """Return the analyzer's predicted output label for each sentence, in order.

    A progress bar is shown while iterating.
    NOTE: a later cell defines another `analyze_sentences` backed by a
    different model; whichever cell ran last is the one that gets called.
    """
    return [analyzer.predict(text).output for text in tqdm(sentences)]

In [None]:
# Multilingual DistilBERT sentiment model; only the top label per input is returned
analyser_distil = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    return_all_scores=False,
)


def analyze_sentences(sentences):
    """Return the label of the first pipeline result for each sentence, in order.

    A progress bar is shown while iterating.
    NOTE: this definition replaces the `analyze_sentences` from the earlier
    pysentimiento cell when both cells are run.
    """
    return [analyser_distil(text)[0]["label"] for text in tqdm(sentences)]

In [None]:
# Classify every sentence and store the labels next to the sentences
results = analyze_sentences(sentences)
df = df.assign(sentiment=results)

In [None]:
# Persist the labelled sentences so the sentiment model does not have to be re-run
with open("dataset_sentiment_new_class.pkl", mode="wb") as out_file:
    pickle.dump(df, out_file)

In [None]:
# Parse the date column; values that cannot be parsed become NaT
# NOTE(review): `dfa` is not created in any cell visible here - presumably the
# sentiment dataset with a `position` column, loaded elsewhere; confirm.
dfa['date'] = pd.to_datetime(dfa['date'], errors='coerce', format='mixed')

# Period columns used for aggregation
# Quarter of each row, stored as a 'YYYY-MM' string
dfa['quarter'] = dfa['date'].dt.to_period('Q').dt.strftime('%Y-%m')

# Create semiannual periods
def get_semiannual_period(date):
    """Return the half-year label of a date, or None when the date is missing.

    January-June maps to "<year>Q1" and July-December to "<year>Q3".

    Parameters:
        date: object exposing `.year` and `.month`; may be a missing value
            such as NaT.

    Returns:
        The label string, or None for a missing date.
    """
    # A missing date (NaT) has NaN year/month and would otherwise be
    # labelled "nanQ3"
    if pd.isna(date):
        return None
    half = "Q1" if date.month <= 6 else "Q3"
    return f"{date.year}{half}"

# Label only the rows that have a parsed date; rows whose date is NaT stay missing
has_date = dfa['date'].notna()
dfa['semiannual'] = dfa.loc[has_date, 'date'].apply(get_semiannual_period)
# The labels are plain strings such as "2020Q1", so the .dt accessor cannot be
# used on them; parse them as quarterly periods before formatting as 'YYYY-MM'
dfa['semiannual'] = pd.PeriodIndex(dfa['semiannual'], freq='Q').strftime('%Y-%m')

# Year and year-month labels taken directly from the parsed date
dfa["year"] = dfa.date.dt.to_period('Y')
dfa['year'] = dfa['year'].dt.strftime('%Y')
dfa['year_month'] = dfa['date'].dt.strftime('%Y-%m')

In [None]:
# Aggregation level: set this to the period column to group by
# ("quarter", "semiannual", "year" or "year_month")
agg_level = "year_month"

# Replace `date` with the chosen period label, parsed back into a timestamp
dfa["date"] = pd.to_datetime(dfa[agg_level])

In [None]:
# Polarity and Independence Visualization

# Share of each sentiment label per position and aggregation period
df_agg = (dfa.groupby(["position", "date"])["sentiment"]
          .value_counts(normalize=True)
          .rename("proportion")
          .reset_index())

# Signed contribution of each label: neutral counts as 0, positive adds its
# share and negative subtracts its share
df_agg.loc[df_agg["sentiment"] == "neutral", "polarity"] = 0
df_agg.loc[df_agg["sentiment"] == "positive", "polarity"] = df_agg["proportion"]
df_agg.loc[df_agg["sentiment"] == "negative", "polarity"] = df_agg["proportion"] * -1

# Net polarity per position and period.
# The aggregator is given by name: numpy is not imported as `np` in the
# visible cells, so `np.sum` would raise a NameError here.
df_pol = df_agg.groupby(["position", "date"]).agg({"polarity": "sum"}).reset_index()

In [None]:
# Create independence score visualizations

# Wide table of net polarity: one row per date, one column per position
df_can = df_pol.pivot(index="date", columns="position", values="polarity")

# Independence score per outlet: 0 when the outlet's polarity equals the
# regime's, 1 when it equals the opposition's
outlets = ["Canal10", "Canal14", "Radio Corporacion"]
benchmark_gap = abs(df_can["opposition"] - df_can["regime"])
for outlet in outlets:
    dist_to_regime = abs(df_can[outlet] - df_can["regime"])
    dist_to_opposition = abs(df_can[outlet] - df_can["opposition"])
    df_can[outlet] = (((dist_to_regime - dist_to_opposition) / benchmark_gap) + 1) / 2


# Back to long format: one row per date and outlet
df_ind = df_can.reset_index().melt(id_vars=['date'],
                                   value_vars=outlets,
                                   value_name="independence score")
