In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install spacy
!python -m spacy download es_core_news_sm
! pip install tqdm

Collecting es-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: es-core-news-sm
Successfully installed es-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import pandas as pd
import re

In [13]:
import os
import glob
import random
import xml.etree.ElementTree as ET
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

sentiment_analyzer = pipeline("text-classification", model="tabularisai/multilingual-sentiment-analysis")

route = "/content/drive/MyDrive/Colab.Notebooks/Digital Humanities/Project/Spanish_ELTec_Corpus"
files = glob.glob(os.path.join(route, "*_annotated.xml"))

# Key words
key_words = [
    "Francia", "Inglaterra", "París", "Roma", "Italia", "Flandes", "Londres",
    "América", "Portugal", "Nápoles", "Maya", "Fátima", "Alemania", "Rajatul-laj",
    "Occidente", "Austria", "África", "Túnez", "Birmingham", "Habana", "Lisboa",
    "Cuba", "Bruselas", "Méjico", "Biarritz", "Indias", "Asia", "Egipto",
    "Constantinopla", "Holanda", "Damasco", "Filipinas", "India", "Bretaña",
    "Rusia", "Grecia", "Escocia"
]

def extract_tokens(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
    tokens = []
    for w in root.findall('.//tei:w', ns):
        text = w.text
        if text:
            tokens.append({
                "text": text,
                "pos": w.attrib.get("pos", ""),
                "lemma": w.attrib.get("lemma", ""),
                "ner": w.attrib.get("ner", "")
            })
    return tokens

# Obtain the year of publication in order to compare periods
def get_year(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
    date_node = root.find('.//tei:sourceDesc//tei:bibl[@type="firstEdition"]/tei:date', ns)
    if date_node is not None and date_node.text and date_node.text.strip().isdigit():
        return int(date_node.text.strip())
    return None

# Classify the years in 2 decade periods
def classify_year(year):
    if year is None:
        return "unknown"
    if 1840 <= year <= 1859:
        return "1840-1859"
    elif 1860 <= year <= 1879:
        return "1860-1879"
    elif 1880 <= year <= 1899:
        return "1880-1899"
    elif 1900 <= year <= 1920:
        return "1900-1920"
    return "Out of range"

# Get windows of text
def windows(tokens, key_word, file_, period, window=100):
    windows = []
    for i, token in enumerate(tokens):
        if token["text"] == key_word:
            start = max(i - window, 0)
            end = min(i + window + 1, len(tokens))
            context = " ".join([tok["text"] for tok in tokens[start:end]])
            windows.append({
                "key_word": key_word,
                "text_window": context,
                "file": os.path.basename(file_),
                "period": period
            })
    return windows

# Get all the windows that contain the key_word
all_windows = []

for file_ in tqdm(files, desc="Processing all files"):
    year = get_year(file_)
    period = classify_year(year)
    tokens = extract_tokens(file_)

    for word in key_words:
        new = windows(tokens, word, file_, period)
        all_windows.extend(new)

# Sentiment Analysis in all the windows that have one key_word on the text
for window in tqdm(all_windows, desc="Analyzing sentiments"):
    try:
        result = sentiment_analyzer(window["text_window"][:512])[0]
        window["sentiment"] = result["label"]
        window["confidence_score"] = result["score"]
    except Exception as e:
        window["sentiment"] = "error"
        window["confidence_score"] = 0.0


df = pd.DataFrame(all_windows)

Device set to use cpu
Processing all files: 100%|██████████| 87/87 [03:17<00:00,  2.27s/it]
Analyzing sentiments: 100%|██████████| 4992/4992 [20:29<00:00,  4.06it/s]


In [14]:
df_all = df.sort_values(by=["key_word", "confidence_score"], ascending=[True, False])
df_all.to_csv("/content/drive/MyDrive/Colab.Notebooks/Digital Humanities/Project/data/all_countries.csv", index=False)

## Visualize the number of total files per period

In [8]:
df_all = pd.read_csv("/content/drive/MyDrive/Colab.Notebooks/Digital Humanities/Project/data/all_countries.csv")
df_all['period'].value_counts()

Unnamed: 0_level_0,count
period,Unnamed: 1_level_1
1900-1920,2051
1840-1859,1039
1880-1899,907
1860-1879,677
unknown,318


In [9]:
counting = df_all.groupby('key_word')['sentiment'].value_counts().unstack(fill_value=0)
print(counting)

sentiment       Negative  Neutral  Positive  Very Negative  Very Positive
key_word                                                                 
Alemania               7       13        20             41             25
América                5        9        25             61             64
Asia                   1        4         8             22             13
Austria                6       37       117            158            128
Biarritz               0        5        10             17             22
Birmingham             5        7        15             25             18
Bretaña                2        3         9             17             11
Bruselas               2        2        14             35             16
Constantinopla         2        2        10             23             11
Cuba                   2        4         7             27             20
Damasco                0        0         5              2              1
Egipto                 0        5     

## Visualize the number of selected files per period

In [10]:
import re

df_all['file'] = df_all['file'].astype(str)

# Modifying the incorrect labeled files
periods = {
    'SPA3029_Blanco_Cercado_annotated.xml': '1880-1899',
    'SPA2021_RafaelDelCastillo_LosCaballerosDelAmor_annotated.xml': '1860-1879',
    'SPA3002_LeopoldoAlas_LaRegenta_annotated.xml': '1900-1920',
    'SPA1022_GomezDeAvellaneda_DosMujeres_annotated.xml': '1840-1859'
}


for file, period in periods.items():
    df_all.loc[df['file'] == file, 'period'] = period

# Collapsing the labels into 2 and droping the Neutral instances
reduced_sentiment = {
    'Very Negative': 'Negative',
    'Very Positive': 'Positive',
    'Neutral': None
}

df_all['sentiment'] = df_all['sentiment'].map(reduced_sentiment)
df_all = df_all.dropna(subset=['sentiment'])

# Selecting only some countries to analyse
countries = ['Cuba', 'Asia', 'América', 'Egipto', 'Filipinas']
regex = '|'.join(map(re.escape, countries))
df_selected = df_all[df_all['key_word'].str.contains(regex, case=False, na=False)]


In [11]:
df_selected['period'].value_counts()

Unnamed: 0_level_0,count
period,Unnamed: 1_level_1
1860-1879,83
1880-1899,68
1900-1920,52
unknown,32
1840-1859,29


In [12]:
counting = df_selected.groupby('key_word')['sentiment'].value_counts().unstack(fill_value=0)
print(counting)

sentiment  Negative  Positive
key_word                     
América          61        64
Asia             22        13
Cuba             27        20
Egipto           12        18
Filipinas        15        12
