In [49]:
from transformers import pipeline
from datasets import load_dataset
import nltk
from nltk.tokenize import sent_tokenize
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from rapidfuzz import fuzz
import spacy
from hashlib import sha256

In [2]:
nltk.download('punkt')  # Download the necessary resources
plt.style.use('ggplot')
# Load the SpaCy model (use a language model that covers your corpus language)
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to
[nltk_data]     /home/martins_32048/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"

In [4]:
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)
sentiment_task("Covid cases are increasing fast!")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'label': 'negative', 'score': 0.7235768437385559}]

In [5]:
import pandas as pd

In [62]:
dataset = load_dataset("biglam/hmd_newspapers")["train"]

Resolving data files:   0%|          | 0/29 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/29 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/29 [00:00<?, ?it/s]

In [63]:
dataset[0]

{'source': 'British Library Heritage Made Digital Newspapers',
 'title': 'The Liverpool Standard and General Commercial Advertiser.',
 'location': 'Liverpool, Merseyside, England',
 'date': datetime.datetime(1855, 7, 17, 0, 0),
 'item_type': 'ARTICLE',
 'word_count': 2226,
 'ocr_quality_mean': 0.9616,
 'ocr_quality_sd': 0.0951,
 'text': 'alcs by Ruction.\n\nBy Messrs. THOS. WINSTANLEY and SONS,\nTHtsDAY(Tuesday), the 17th inst.\' at Two o\'clock in the Afternoon,\nat the Clarendon-rooms, South John-street, Liverpool, subject\nto conditions then to be produced.\nAPIECE of LAND, with the WORKSHOPS,\nCOACH-HOUSE, STABLE, COUNTING-HOUSE, and\nother Buildings thereon erected, situate on the west side of Scot-\nland-road, within Liverpool, and now in the occupation of Mr.\nThomas Mackarell, builder.\nThe Land contains in front to Scotland-road 104 feet 3 inches,\nand in breadth at the back to Gore street 100 feet 9 inches, and\nruns in depth on the north side 120 feet 9 inches, and on the so

In [64]:
dataset = dataset.filter(lambda x: x["ocr_quality_mean"] and x["ocr_quality_mean"] > 0.8)

In [69]:
def process(example):
    example["text"] = example["text"].replace("\n", " ")
    example["text"] = example["text"].replace("- ", "")
    return example

In [70]:
dataset = dataset.map(process)

Map:   0%|          | 0/1762137 [00:00<?, ? examples/s]

In [67]:
def get_hash(examples):
    examples["hash"] = sha256(examples["text"].encode('utf-8')).hexdigest()[:16]
    return examples

In [None]:
dataset = dataset.map(get_hash)

Map:   0%|          | 0/1762137 [00:00<?, ? examples/s]

In [71]:
def sentence_split(examples):
    sentences = sum([sent_tokenize(ex) for ex in examples["text"]], [])
    source = [examples["hash"]] * len(sentences)
    return {"sentence": sentences, "source": source}

In [73]:
dataset = dataset.map(sentence_split, batched=True, remove_columns=dataset.column_names, batch_size=100)

Map:   0%|          | 0/1762137 [00:00<?, ? examples/s]

In [74]:
dataset = dataset.flatten()

Dataset({
    features: ['source', 'sentence'],
    num_rows: 86034743
})

In [77]:
def mentions_nation(example):
    doc = nlp(example["sentence"])
    # Extract nationalities (NORP entities)
    example["nationality"] = [ent.text for ent in doc.ents if ent.label_ == "NORP"]
    return example

In [78]:
dataset = dataset.shuffle(seed=42).select(range(100_000)).map(mentions_nation).filter(lambda x: len(x["nationality"]))

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [81]:
def get_score(example):
    pred = sentiment_task(example["sentence"][:1000])[0]
    if pred["label"] == "positive":
        score = pred["score"]
    elif pred["label"] == "negative":
        score = -pred["score"]
    else:
        score = .0
    example["score"] = score
    return example

In [82]:
dataset = dataset.map(get_score).filter(lambda x: x["score"] < 0.)

Map:   0%|          | 0/5945 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5945 [00:00<?, ? examples/s]

In [83]:
keywords = set()
for n in dataset["nationality"]:
    lower = [x.lower() for x in n]
    keywords.update(lower)
for k in keywords:
    print(k)

indian
ennese
paguerre
ultra-tory
asphalte
romish
carlists
severn
catholic
•hese
bagster
texan
south stafford
danish
league
europeans
christians
neapolitan
goods
rembrandt
marylanders
guards
saviour
siamese
tho american
eastern
roman catholic
yarmouth
democrats
irish
canadian
orange
moplabs
cantonese
imperialism
estensian
reddish
thuggism
protectionists
jew
druggists
tragedians
miludi
essex
swiss
trains.—american
federalists
teutonic
romanists
western
draconian
fenian
babglonish
adriatic
americans
calcur
mussulmen
danubian
brazilian
notiese
kentish
sardinian
sabre
unfit—(evans
canadians
conservatives
dodger
norwegian
persian
republican
necauese
parisian
de-44
petitioners
royalist
ira
peruvian
treasury.se
anti-christian
nsife
hairdresser
jews
paridans
fenians
beans
chartists
mexicans
conservative
popish
greek
siberian
italian
pergami
jewish
european
dutch
vatairds-ef
swede
tories
visans
southerners
-4hoseshef.cl'eeuvrestliese
eetg
confederates
the'deek
german
jesuit
pans
ballarat
russia

In [84]:
with open("keywords.txt", "w") as fout:
    for k in keywords:
        fout.write(k+"\n")

In [85]:
for example in dataset.filter(lambda x: x["score"] < -0.9):
    print(example)

Filter:   0%|          | 0/761 [00:00<?, ? examples/s]

{'source': ['42c88694a3153aa7', '41e6a752a8512699', '9e3ea40acb9a7327', '101297bbb7d6874a', '16844b73058885ea', 'fed60f347f903d41', '9b7ceaecd5d78331', 'd59f90b28de32146', 'cbfaca017dfd0ca6', '6514324b156b822d', '507b7d263a6f2080', '76ac609c345605d5', '3bc91ce9ece667df', '93067682e96bbb76', 'e80373543e16c8d2', 'f3716c5d2f5e87ce', '742ef75ed8ee2bd9', '57f71ebf36bc9e0d', 'dfdc5b4833c3b833', 'da8db862876b969c', '0f16753cc09ebb9c', 'b1885518e5821edd', '498233f9564d56b5', 'ea7db90c84fd14f3', 'd2c11a4f887566f3', '42e9f86a9437fe85', 'e570580aeefde443', '0d103953c9fcb467', '759bdd20fcd741cf', 'e746a0f184e4a20d', '6726e8398cd3fc03', '62d4651e1e2152c2', '279dbd770dfe7caa', '677e7c6dc7fbee86', '299a4cbd79362217', 'cca49886a9a96b44', 'bfcedc9cf5fa3acd', '6de12dfdffe02cfd', '6e2b5cea58baa0c0', 'aa7e7a95d17a10c5', 'fe2a3aba2fc34273', 'cb3aa922c76871e6', 'ff7aec207cbbaa61', 'adc4f4ecaf3f1653', '9cc32bc0ab11410f', 'ce345c93a0b9bc6b', '47a4ca3328d0c384', '38fdffdc9b8a0ebc', '27a56425b84122b6', 'f727428

In [86]:
dataset[0]

{'source': ['d6191a4143f97c29',
  '2b027a24397568f0',
  'fb39f19450b47337',
  'fe431db67f7bf7a4',
  '59dbd247f06b69c0',
  '66b91798069e80d5',
  'd83fa7542d36609a',
  'f37e23b684a1eb5d',
  'fa649e968cd7276e',
  '3453b2f24d252125',
  '5f53f991e70cb72e',
  'e565a80bd3743a0b',
  '60925b1ed81c1b80',
  '88968507e86b8d62',
  '4041f7942d5b0468',
  'da88246c4de196dc',
  '70a3b0f4d254f999',
  '2078d1bdeb0071eb',
  '71928e4d7660bee2',
  '7728b0a6983f1273',
  'b3d166e7287a3b4d',
  '6447bfd950f9404d',
  '4ea6c38eef8c72c7',
  'd8515b55fd93b87c',
  '6d6993675bdf8da5',
  'a8eb2fa25cd4bacb',
  '2147ae007a226cd6',
  'e9a05c4bf8c65410',
  '3bd42caf2fd5a4a3',
  'd5fa9f5e3b8030b9',
  '10188102190022dc',
  'edaa1fefe83f24df',
  '68a25b4520231112',
  'd65e7ec367944285',
  '70017a30045ebbad',
  'b1064bbc6b16c8f6',
  'e1b190ec97a82ee9',
  'd78693446589e70c',
  '8c871b639f0d3124',
  'c120d511c209d48c',
  'ca36640ffeddd875',
  'd774c351aeb84994',
  '39791c2372beadee',
  'c57f80ac32062f49',
  '842ca903396d60a5',


In [22]:
column = "text"
threshold = 90
count = 0
to_drop = []

for index1, row1 in df.iterrows():
    for index2, row2 in df.iloc[index1+1:].iterrows():
        if fuzz.ratio(row1[column], row2[column]) > threshold:
            to_drop.append(index2)
            count += 1

NameError: name 'df' is not defined

In [None]:
df = df.drop(to_drop)
print("Number of duplicated rows:", count)

In [None]:
# Aggregate the scores by date and calculate the mean for each day
df.index = pd.to_datetime(df['date'])
df_aggregated = df.resample(rule='YE')['score'].mean().reset_index().fillna(0.)

In [None]:
# Plotting the data
plt.figure(figsize=(20, 4))
plt.plot(df_aggregated['date'], df_aggregated['score'], marker='o')
plt.title('Average Score Over Time')
plt.xlabel('Date')
plt.ylabel('Average Score')
plt.xticks(rotation=45)  # Rotate date labels for better readability
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
df_aggregated['smoothed_score'] = df_aggregated['score'].rolling(window=3, min_periods=1).mean()

In [None]:
# Plot the original and smoothed data
plt.figure(figsize=(15, 4))
plt.plot(df_aggregated['date'], df_aggregated['score'], label='Original', marker='o', alpha=0.6)
plt.plot(df_aggregated['date'], df_aggregated['smoothed_score'], label='Smoothed (Rolling Avg)', linewidth=2)
plt.title('Average Score Over Time with Smoothing')
plt.xlabel('Date')
plt.ylabel('Average Score')
plt.xticks(rotation=45)
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
df[df.score < 0]

In [None]:
df[df.score < -0.9]["text"].values