# Sentiment Analysis of Articles and Posts

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import torch
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from optimum.intel.openvino import OVModelForSequenceClassification

Specify File Paths and Hardware

In [None]:
#specify file location
con = sqlite3.connect('corpus.sqlite3')

#specify the path to the model
model_path = "RobertaGerman"

#choose if you want to use openvino - for intel gpu's
openvino = False 

# Determine device for model inference
if openvino:
    device = 'GPU'   #intel integrated graphics gpu
elif torch.cuda.is_available():
    device = "cuda"  #nvidea gpu
elif torch.backends.mps.is_available():
    device = "mps"   #apple gpu
else:
    device = "cpu"   #use cpu if no gpu is available

print(f"Using device: {device}")

#set bazch size depending on the device
batch_size = 16 if openvino or device == 'cpu' else 32

Using device: GPU


Load Model and set Global Parameters

In [3]:
#maximum sequence length for the model
max_length = 512

#overlap for splitting long sequences
overlap = 64

#links from label ids to emotion names
labels = {
    "LABEL_0": "Anger",
    "LABEL_1": "Fear",
    "LABEL_2": "Disgust",
    "LABEL_3": "Sadness",
    "LABEL_4": "Joy",
    "LABEL_5": "None"
}
emotions = list(labels.values()) #store emotion names in a list

#load tokenizer for the model
tokenizer = AutoTokenizer.from_pretrained(model_path)

#load the model that fits the device
if openvino:
    model = OVModelForSequenceClassification.from_pretrained(model_path, device=device)
else:
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.to(device)

#set the model to evaluation mode
model.eval()

No OpenVINO files were found for RobertaGerman, setting `export=True` to convert the model to the OpenVINO IR. Don't forget to save the resulting model with `.save_pretrained()`
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
  inverted_mask = torch.tensor(1.0, dtype=dtype) - expanded_mask


<optimum.intel.openvino.modeling.OVModelForSequenceClassification at 0x21b8de73770>

Helper Functions for Artilce and Post Procesing

1. Prepare data for Analysis

In [4]:
def chunk_text(df, text_col):

    #remove spaces from start and end of text column
    df[text_col] = df[text_col].str.strip()

    chunked_rows = []
    for _, row in df.iterrows():
        
        #convert text to string and tokenize
        text = str(row[text_col])
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        
        #split tokens into chunks of max_length with overlap
        start = 0
        while start < len(token_ids):
            end = start + max_length
            chunk_ids = token_ids[start:end]
            chunk_text = tokenizer.decode(chunk_ids)

            #create a new row with the chunked text
            chunk_row = row.copy()
            chunk_row[text_col] = chunk_text
            chunked_rows.append(chunk_row)

            #stop if reached the end & restart with overlap
            if end >= len(token_ids):
                break
            start += max_length - overlap
    
    #return a new df with chunked rows
    return pd.DataFrame(chunked_rows).reset_index(drop=True)

2. Perform a Sentiment Analysis

In [5]:
def sentiment_analysis(df, column):
    
    #move model to device if nescessary
    if openvino:
        print(device)
    else:
        model.to(device)
        print(device)

    #batch the dataframe for parallel processing
    all_scores = []
    for b in range(0, len(df), batch_size):
        batch_texts = df[column].iloc[b:b + batch_size].tolist()
        
        #specify model inputs as tokenized chunks
        if openvino:
            inputs = tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
                add_special_tokens=True
            )
        else:   #move inputs to device if not openvino
            inputs = tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
                add_special_tokens=True
            ).to(device)
        
        #get model predictions
        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

        #convert logits to probabilities
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

        #store batch results
        all_scores.append(probs.cpu().detach().numpy())
    
    #combine all batches into a df
    all_scores_np = np.vstack(all_scores)
    df_scores = pd.DataFrame(all_scores_np, columns=emotions)
    return pd.concat([df.reset_index(drop=True), df_scores], axis=1)

3. Recombine Chunks of Texts

In [6]:
def combine_chunks(df, text_col, group_col, metadata_cols):

    #compute number of tokens per chunk
    df['chunk_tokens'] = df[text_col].apply(
        lambda x: len(tokenizer.encode(str(x), add_special_tokens=False))
    )

    #compute weighted emotion scores by number of tokens
    weighted_cols = []
    for emo in emotions:
        df[f'{emo}_weighted'] = df[emo] * df['chunk_tokens']
        weighted_cols.append(f'{emo}_weighted')
    
    #group by the group_col and sum weighted scores and tokens
    grouped = df.groupby(group_col)
    sums = grouped[weighted_cols + ['chunk_tokens']].sum()
    
    #compute weighted average for each emotion
    result = pd.DataFrame(index=sums.index)
    for emo in emotions:
        result[emo] = sums[f'{emo}_weighted'] / sums['chunk_tokens']

    #identify the dominant emotion per group
    result['dominant_emotion'] = result[emotions].idxmax(axis=1)   

    #merge data back with metadata
    cols_to_select = metadata_cols.copy()
    if group_col not in cols_to_select:
        cols_to_select.append(group_col)
    meta_df = df[cols_to_select].drop_duplicates(subset=group_col).set_index(group_col)

    #return a joined df with original text lenght
    return result.join(meta_df).reset_index()

4. Create a Table with Descriptive Statistics over Topics

In [7]:
def descriptives_topic(df):

    #compute mean and standard deviation of emotions per NewsroomTopic
    topic_dict = {}
    for topic, group in df.groupby('NewsroomTopic'):
        descriptives = {}
        for e in emotions:
            descriptives[f"{e}_mean"] = group[e].mean()
            descriptives[f"{e}_std"] = group[e].std()
        topic_dict[topic] = descriptives
    
    #convert the dictionary to a df
    table = pd.DataFrame.from_dict(topic_dict, orient='index')
    return table.reset_index().rename(columns={'index': 'NewsroomTopic'})

# Articles preprocessing

In [8]:
#read articles from the database
articles = pd.read_sql_query("SELECT * FROM Articles", con)

#save metadata columns
articles_meta = ['ID_Article', 'publishingDate', 'NewsroomTopic']

#combine title and body converted to text
articles['body_text'] = articles.apply(
    lambda row: str(row['Title']) + " " +
                BeautifulSoup(str(row['Body']), "html.parser").get_text(),
    axis=1
)
articles

Unnamed: 0,ID_Article,Path,publishingDate,Title,Body,body_text
0,1,Newsroom/User/Community,2012-05-26 03:00:19.23,Die Newsletter von derStandard.at,"<div class=""section"" id=""content-main"" itempro...",Die Newsletter von derStandard.at Abonnieren S...
1,2,Newsroom/User/Community/Regeln,2012-05-26 12:12:19.46,Werden Sie Teil von derStandard.at!,"<div class=""diashow"" id=""objectContent""><meta ...",Werden Sie Teil von derStandard.at! Werden Sie...
2,3,Diverses/mobil,2013-11-22 12:15:00.00,Die Android App von derStandard.at,"<div class=""section"" id=""content-main"" itempro...",Die Android App von derStandard.at Die Smartph...
3,4,Newsroom/User/mitmachen/Mitreden,2014-08-13 05:30:00.00,Welche Erfahrungen haben Sie als Linkshänder g...,"<div class=""section"" id=""content-main"" itempro...",Welche Erfahrungen haben Sie als Linkshänder g...
4,5,Newsroom/User/mitmachen/Mitreden,2014-08-27 12:27:01.09,Wie haben Sie das Jahr 1989 erlebt?,"<div class=""section"" id=""content-main"" itempro...",Wie haben Sie das Jahr 1989 erlebt? Erzählen S...
...,...,...,...,...,...,...
12082,12083,Newsroom/Kultur/Musikkultur,2016-05-31 16:14:13.00,Max Prosa: Junger Troubadour alter Schule,"<div class=""section"" id=""content-main"" itempro...",Max Prosa: Junger Troubadour alter Schule Von ...
12083,12084,Newsroom/Etat/PRINT/Springer,2016-05-31 17:39:29.00,"""Können Adblocker nicht einfach hinnehmen""","<div class=""section"" id=""content-main"" itempro...","""Können Adblocker nicht einfach hinnehmen"" Med..."
12084,12085,Meinung/Kolumnen/rau,2016-05-31 17:34:54.00,Die Rechten machen Facebook zum Hatebook,"<div class=""section"" id=""content-main"" itempro...",Die Rechten machen Facebook zum Hatebook Die F...
12085,12086,Newsroom/Kultur/Buehne,2016-05-31 18:08:20.00,"""Wrestling Rita"": Feministischer Punktsieg im ...","<div class=""section"" id=""content-main"" itempro...","""Wrestling Rita"": Feministischer Punktsieg im ..."


Apply Filters

In [9]:
#keep only articles from the 'Newsroom' section
articles['MainPath'] = articles['Path'].apply(lambda x: str(x).split('/')[0])
articles = articles[articles['MainPath'] == 'Newsroom'].copy()

#extract topic from the path
articles['NewsroomTopic'] = articles['Path'].apply(lambda x: str(x).split('/')[1])

#exclude user-generated and internal articles
articles = articles[articles['NewsroomTopic'] != "User"]
articles = articles.loc[~articles['Title'].str.contains('derStandard.at', case=False, na=False)]

#filter out very short articles (less than the overlap threshold)
articles['num_tokens'] = articles['body_text'].apply(lambda x: len(tokenizer.encode(str(x), add_special_tokens=False)))
print('token sequence warning can be ignored')
print('-> chunk_text will be applied to cut it down before feeding it to roberta')
articles = articles[articles['num_tokens'] >= overlap]

#keep only relevant columns
articles = articles[['ID_Article', 'publishingDate', 'body_text', 'NewsroomTopic']]
len(articles)

Token indices sequence length is longer than the specified maximum sequence length for this model (1285 > 512). Running this sequence through the model will result in indexing errors


-> chunk_text will be applied to cut it down before feeding it to roberta


10135

Prepare and Analyze Articles

In [33]:
articles = chunk_text(articles, 'body_text')
len(articles)

18747

In [34]:
articles = sentiment_analysis(articles, 'body_text')
articles

GPU


Unnamed: 0,ID_Article,publishingDate,body_text,NewsroomTopic,Anger,Fear,Disgust,Sadness,Joy,None
0,10,2015-02-07 17:00:00.00,Community-Quiz #2: Können Sie die Postings in ...,Wissenschaft,0.026392,0.019651,0.000412,0.001573,0.016196,0.935777
1,12,2015-03-20 15:11:50.00,Android Auto geht in den USA an den Start Zuge...,Web,0.011063,0.000795,0.000113,0.000589,0.009917,0.977524
2,15,2015-05-31 12:17:05.01,US-Außenminister Kerry nach Fahrradunfall im K...,International,0.112966,0.004229,0.001921,0.058150,0.003372,0.819363
3,16,2015-05-31 12:09:51.00,Hundefleisch wird in Südkorea immer unbeliebte...,Panorama,0.000043,0.999875,0.000018,0.000027,0.000012,0.000025
4,16,2015-05-31 12:09:51.00,nen. Für Kang Dae-in ist das koreanische Leibg...,Panorama,0.000063,0.999839,0.000033,0.000020,0.000014,0.000030
...,...,...,...,...,...,...,...,...,...,...
18742,12084,2016-05-31 17:39:29.00,Hier ist ein signifikanter sechsstelliger Betr...,Etat,0.000189,0.999512,0.000014,0.000031,0.000024,0.000231
18743,12084,2016-05-31 17:39:29.00,"auf rund 3,3 Milliarden Euro. Das digitale Ges...",Etat,0.047127,0.017818,0.000134,0.005807,0.006039,0.923074
18744,12086,2016-05-31 18:08:20.00,"""Wrestling Rita"": Feministischer Punktsieg im ...",Kultur,0.025573,0.969986,0.001066,0.001175,0.000113,0.002087
18745,12086,2016-05-31 18:08:20.00,"""Humungus"" Hradil, Mitbegründer der Wrestling ...",Kultur,0.691675,0.029701,0.002929,0.012237,0.001057,0.262402


In [35]:
articles = combine_chunks(articles, 'body_text', 'ID_Article', articles_meta)
len(articles)

10135

Descriptive Statistics for Articles

In [36]:
#compute statistics for each emotion across all articles
articles[list(labels.values())].agg(['mean', 'std', 'min', 'max'])

Unnamed: 0,Anger,Fear,Disgust,Sadness,Joy,None
mean,0.112976,0.554969,0.006683,0.026338,0.017474,0.28156
std,0.179039,0.423591,0.042758,0.098661,0.087289,0.333567
min,2.4e-05,3.4e-05,8e-06,1.3e-05,1e-05,1.8e-05
max,0.972735,0.999905,0.970016,0.999526,0.999089,0.996737


In [37]:
#count articles by dominant emotion
article_counts = pd.crosstab(
    articles['NewsroomTopic'], 
    articles['dominant_emotion'], 
    margins=True,
    margins_name='Total'
)
article_counts

dominant_emotion,Anger,Disgust,Fear,Joy,None,Sadness,Total
NewsroomTopic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Etat,88,3,272,9,278,11,661
Inland,102,0,646,6,254,2,1010
International,69,5,1036,1,353,14,1478
Kultur,47,1,339,8,100,43,538
Panorama,97,28,1191,11,302,43,1672
Sport,48,1,345,70,689,47,1200
Web,152,0,988,9,498,4,1651
Wirtschaft,168,0,797,1,383,3,1352
Wissenschaft,10,0,427,5,128,3,573
Total,781,38,6041,120,2985,170,10135


In [None]:
descriptives_topic(articles)

Unnamed: 0,NewsroomTopic,Anger_mean,Anger_std,Fear_mean,Fear_std,Disgust_mean,Disgust_std,Sadness_mean,Sadness_std,Joy_mean,Joy_std,None_mean,None_std
0,Etat,0.155633,0.227016,0.387837,0.41985,0.006689,0.050097,0.026612,0.097371,0.020547,0.092876,0.402682,0.378613
1,Inland,0.14824,0.205893,0.580706,0.402648,0.001151,0.007553,0.006997,0.046386,0.007373,0.050454,0.255532,0.317273
2,International,0.096798,0.147578,0.650944,0.39481,0.007987,0.039563,0.018397,0.073133,0.003301,0.021266,0.222573,0.292456
3,Kultur,0.097522,0.187946,0.589572,0.424986,0.006078,0.035144,0.086001,0.2187,0.018852,0.089844,0.201975,0.315486
4,Panorama,0.093235,0.146245,0.657814,0.387449,0.023869,0.085879,0.034309,0.111994,0.009928,0.07513,0.180845,0.250037
5,Sport,0.099112,0.136686,0.262836,0.360788,0.001502,0.010438,0.070651,0.138244,0.07662,0.17825,0.489279,0.33639
6,Web,0.112281,0.195665,0.565782,0.431382,0.002176,0.019737,0.006037,0.041528,0.012611,0.066225,0.301112,0.363152
7,Wirtschaft,0.154308,0.210655,0.550291,0.421463,0.000433,0.000948,0.006897,0.038763,0.003418,0.02418,0.284653,0.325801
8,Wissenschaft,0.048959,0.111776,0.713931,0.394595,0.002073,0.011966,0.012887,0.055129,0.012324,0.076805,0.209827,0.32175


# Comments

In [17]:
#read user posts from the database
posts = pd.read_sql_query("SELECT * FROM Posts", con)

#save metadata columns
posts_meta = ['ID_Post', 'ID_Article', 'CreatedAt', 'NewsroomTopic']

#add article topics to posts
posts['NewsroomTopic'] = posts['ID_Article'].map(articles.set_index('ID_Article')['NewsroomTopic'])
posts

Unnamed: 0,ID_Post,ID_Parent_Post,ID_Article,ID_User,CreatedAt,Status,Headline,Body,PositiveVotes,NegativeVotes,NewsroomTopic
0,1,,1,9089,2003-04-23 14:52:41.870,deleted,,,0,0,
1,2,,1,29367,2003-11-04 16:21:57.850,online,"Newsletter ""DER STANDARD""",Ich bin begeistert von den STANDARD - Newslett...,0,0,
2,3,2.0,1,5095,2004-01-28 12:57:28.240,deleted,Auch begeistert...,... Aber momentan funktioniert das Abmelden od...,0,0,
3,4,3.0,1,1682,2004-02-03 20:32:39.123,deleted,Abmeldeprobleme,Es ist ganz einfach nervend!\r\nVor kurzem hab...,0,0,
4,5,,1,3343,2004-03-02 11:37:44.100,online,,und sie als mitarbeiter sind natuerlich objektiv,0,0,
...,...,...,...,...,...,...,...,...,...,...,...
1011768,1011769,1011764.0,12087,6355,2016-06-01 23:10:29.003,online,,zwischen der beendigung eines arbeitsverhältni...,0,0,Kultur
1011769,1011770,1011765.0,12087,6355,2016-06-01 23:11:14.790,online,,du sagst es ja im namen selbst: wegen eindicku...,1,0,Kultur
1011770,1011771,1011770.0,12087,27023,2016-06-02 08:16:56.690,online,,Was genau haben Sie denn nicht verstanden? Ich...,0,1,Kultur
1011771,1011772,1011769.0,12087,19159,2016-06-02 09:12:09.993,online,,irgendwie widersprechen Sie sich in Ihrem Post...,1,0,Kultur


Apply Filters

In [18]:
#keep only posts to existing articles
posts = posts[posts['ID_Article'].isin(articles['ID_Article'])]

#drop posts with missing text or topic
posts = posts.dropna(subset=['Body','NewsroomTopic'])

#remove posts that are empty strings
posts = posts[posts['Body'].astype(str).str.strip() != '']

#keep only top-level posts - no replies
posts = posts[posts['ID_Parent_Post'].isna()]

#select relevant columns
posts = posts[['ID_Post','ID_Article','CreatedAt','Body', 'NewsroomTopic']]
len(posts)

249993

Prepare and Analyze Comments

In [None]:
posts = chunk_text(posts, 'Body')
len(posts)

249994

In [20]:
posts = sentiment_analysis(posts, 'Body')
posts

GPU


Unnamed: 0,ID_Post,ID_Article,CreatedAt,Body,NewsroomTopic,Anger,Fear,Disgust,Sadness,Joy,None
0,2084,10,2015-02-07 17:11:43.933,ich soll also mehr mitposten.,Wissenschaft,0.043411,0.000065,0.000451,0.001876,0.002159,0.952038
1,2086,10,2015-02-07 17:29:14.757,Höhö 7 von 10,Wissenschaft,0.397729,0.000053,0.004970,0.044212,0.052708,0.500328
2,2088,10,2015-02-07 17:30:52.763,"Buh, 1 Prozent unter dem Durchschnitt. Ein Glü...",Wissenschaft,0.007702,0.000112,0.000257,0.001685,0.945750,0.044493
3,2091,10,2015-02-07 17:44:51.023,8 von 10. Alles geraten.,Wissenschaft,0.786219,0.000142,0.047586,0.049289,0.003276,0.113487
4,2092,10,2015-02-07 17:55:30.990,die fpö hat mich rausgerissen,Wissenschaft,0.988217,0.000050,0.003037,0.001703,0.001363,0.005629
...,...,...,...,...,...,...,...,...,...,...,...
249989,1010994,12083,2016-05-31 17:36:21.730,"""Singer-Songwriter"" ist keine Neuerfindung des...",Kultur,0.435494,0.000810,0.000325,0.006520,0.002052,0.554800
249990,1010995,12083,2016-06-01 05:24:53.540,Bei ordentlichen medien gibts dann immer einen...,Kultur,0.989500,0.000192,0.000359,0.001076,0.000396,0.008478
249991,1010997,12084,2016-05-31 17:54:24.000,"Die User haben hier, haben es doch oft formuli...",Etat,0.933450,0.000109,0.000475,0.003944,0.001764,0.060259
249992,1011760,12086,2016-05-31 21:10:42.487,Na ja. Und wozu?,Kultur,0.618691,0.000254,0.000950,0.003175,0.003138,0.373792


In [21]:
posts = combine_chunks(posts, 'Body', 'ID_Post', posts_meta)
len(posts)

249993

Descriptive Statistics for Comments

In [22]:
#compute statistics for each emotion across all posts
posts[list(labels.values())].agg(['mean', 'std', 'min', 'max'])

Unnamed: 0,Anger,Fear,Disgust,Sadness,Joy,None
mean,0.513091,0.177979,0.016756,0.029754,0.057666,0.204753
std,0.428868,0.363919,0.109419,0.130694,0.211276,0.327005
min,2.2e-05,5e-06,7e-06,1.1e-05,8e-06,1.5e-05
max,0.997343,0.99991,0.999891,0.999756,0.999759,0.997946


In [23]:
#count posts by dominant emotion
post_counts = pd.crosstab(
    articles['NewsroomTopic'], 
    articles['dominant_emotion'], 
    margins=True,
    margins_name='Total'
)
post_counts

dominant_emotion,Anger,Disgust,Fear,Joy,None,Sadness,Total
NewsroomTopic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Etat,88,3,272,9,278,11,661
Inland,102,0,646,6,254,2,1010
International,69,5,1036,1,353,14,1478
Kultur,47,1,339,8,100,43,538
Panorama,97,28,1191,11,302,43,1672
Sport,48,1,345,70,689,47,1200
Web,152,0,988,9,498,4,1651
Wirtschaft,168,0,797,1,383,3,1352
Wissenschaft,10,0,427,5,128,3,573
Total,781,38,6041,120,2985,170,10135


In [None]:
descriptives_topic(posts)

Unnamed: 0,NewsroomTopic,Anger_mean,Anger_std,Fear_mean,Fear_std,Disgust_mean,Disgust_std,Sadness_mean,Sadness_std,Joy_mean,Joy_std,None_mean,None_std
0,Etat,0.541267,0.430733,0.153285,0.342304,0.027006,0.141527,0.029132,0.130534,0.063464,0.22432,0.185846,0.316335
1,Inland,0.557449,0.428024,0.179459,0.364614,0.015568,0.107051,0.023272,0.114466,0.040683,0.176471,0.183569,0.314074
2,International,0.52995,0.426726,0.20078,0.381546,0.016632,0.106424,0.025732,0.118137,0.032241,0.156901,0.194665,0.319305
3,Kultur,0.353034,0.411289,0.12525,0.315564,0.023955,0.127023,0.095912,0.250321,0.156809,0.338314,0.24504,0.359848
4,Panorama,0.514373,0.429906,0.210497,0.389063,0.021448,0.123401,0.029118,0.129538,0.040048,0.176455,0.184517,0.312885
5,Sport,0.434888,0.420335,0.097942,0.28105,0.012029,0.089873,0.051223,0.170103,0.138488,0.315409,0.26543,0.356673
6,Web,0.49391,0.42408,0.166329,0.352997,0.017461,0.113525,0.022846,0.110463,0.06448,0.221435,0.234974,0.342883
7,Wirtschaft,0.553817,0.427346,0.205796,0.383895,0.008793,0.078621,0.018196,0.099226,0.032627,0.157097,0.180771,0.309504
8,Wissenschaft,0.430837,0.420606,0.169238,0.35903,0.027245,0.14222,0.038727,0.149611,0.074918,0.240915,0.259034,0.35948


Prepare File for further Analysis

In [None]:
#rename emotion columns to distinguish between article and post emotions
articles.rename(columns={
    "Anger": "Anger_A",
    "Fear": "Fear_A",
    "Disgust": "Disgust_A",
    "Sadness": "Sadness_A",
    "Joy": "Joy_A",
    "None": "None_A",
    "dominant_emotion": "Dominant_Emotion_A"
}, inplace=True)

posts.rename(columns={
    "Anger": "Anger_P",
    "Fear": "Fear_P",
    "Disgust": "Disgust_P",
    "Sadness": "Sadness_P",
    "Joy": "Joy_P",
    "None": "None_P",
    "dominant_emotion": "Dominant_Emotion_P"
}, inplace=True)

In [None]:
#merge articles and posts on ID_article and NewsroomTopic
data = pd.merge(articles, posts, on=["ID_Article", "NewsroomTopic"], how="right")

#keep only columns with relevant information
data = data[[
    "ID_Post", "CreatedAt", "Anger_P", "Fear_P", "Disgust_P", "Sadness_P", "Joy_P", "None_P", "Dominant_Emotion_P", "NewsroomTopic",
    "ID_Article", "publishingDate", "Anger_A", "Fear_A", "Disgust_A", "Sadness_A", "Joy_A", "None_A", "Dominant_Emotion_A"
]]
data

Unnamed: 0,ID_Post,CreatedAt,Anger_P,Fear_P,Disgust_P,Sadness_P,Joy_P,None_P,Dominant_Emotion_P,NewsroomTopic,ID_Article,publishingDate,Anger_A,Fear_A,Disgust_A,Sadness_A,Joy_A,None_A,Dominant_Emotion_A
0,2084,2015-02-07 17:11:43.933,0.043411,0.000065,0.000451,0.001876,0.002159,0.952038,,Wissenschaft,10,2015-02-07 17:00:00.00,0.026392,0.019651,0.000412,0.001573,0.016196,0.935777,
1,2086,2015-02-07 17:29:14.757,0.397729,0.000053,0.004970,0.044212,0.052708,0.500328,,Wissenschaft,10,2015-02-07 17:00:00.00,0.026392,0.019651,0.000412,0.001573,0.016196,0.935777,
2,2088,2015-02-07 17:30:52.763,0.007702,0.000112,0.000257,0.001685,0.945750,0.044493,Joy,Wissenschaft,10,2015-02-07 17:00:00.00,0.026392,0.019651,0.000412,0.001573,0.016196,0.935777,
3,2091,2015-02-07 17:44:51.023,0.786219,0.000142,0.047586,0.049289,0.003276,0.113487,Anger,Wissenschaft,10,2015-02-07 17:00:00.00,0.026392,0.019651,0.000412,0.001573,0.016196,0.935777,
4,2092,2015-02-07 17:55:30.990,0.988217,0.000050,0.003037,0.001703,0.001363,0.005629,Anger,Wissenschaft,10,2015-02-07 17:00:00.00,0.026392,0.019651,0.000412,0.001573,0.016196,0.935777,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249988,1010994,2016-05-31 17:36:21.730,0.435494,0.000810,0.000325,0.006520,0.002052,0.554800,,Kultur,12083,2016-05-31 16:14:13.00,0.070813,0.541406,0.000827,0.082062,0.002738,0.302154,Fear
249989,1010995,2016-06-01 05:24:53.540,0.989500,0.000192,0.000359,0.001076,0.000396,0.008478,Anger,Kultur,12083,2016-05-31 16:14:13.00,0.070813,0.541406,0.000827,0.082062,0.002738,0.302154,Fear
249990,1010997,2016-05-31 17:54:24.000,0.933450,0.000109,0.000475,0.003944,0.001764,0.060259,Anger,Etat,12084,2016-05-31 17:39:29.00,0.002561,0.948136,0.000018,0.000343,0.000334,0.048607,Fear
249991,1011760,2016-05-31 21:10:42.487,0.618691,0.000254,0.000950,0.003175,0.003138,0.373792,Anger,Kultur,12086,2016-05-31 18:08:20.00,0.215356,0.702083,0.001597,0.004327,0.000382,0.076255,Fear


Save Data as a CSV

In [None]:
#save the combined articles and posts dataset with sentiment scores to CSV
#data.to_csv("sentiment_results.csv", index=False)