In [23]:
import sqlite3
import pandas as pd
import numpy as np
import torch
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [26]:
con = sqlite3.connect('corpus.sqlite3')
model_path = "RobertaGerman"
device = "cpu"
batch_size = 16
overlap = 64

In [None]:
max_length = 512
labels = {
    "LABEL_0": "Anger",
    "LABEL_1": "Fear",
    "LABEL_2": "Disgust",
    "LABEL_3": "Sadness",
    "LABEL_4": "Joy",
    "LABEL_5": "None"
}
emotions = list(labels.values())
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.to(device)
model.eval()

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [3]:
def chunk_text(df, text_col):
    df[text_col] = df[text_col].str.strip()
    chunked_rows = []
    for _, row in df.iterrows():
        text = str(row[text_col])
        token_ids = tokenizer.encode(text, add_special_tokens=False)

        start = 0
        while start < len(token_ids):
            end = start + max_length
            chunk_ids = token_ids[start:end]
            chunk_text = tokenizer.decode(chunk_ids)

            chunk_row = row.copy()
            chunk_row[text_col] = chunk_text
            chunked_rows.append(chunk_row)

            if end >= len(token_ids):
                break
            start += max_length - overlap
    return pd.DataFrame(chunked_rows).reset_index(drop=True)

In [4]:
def sentiment_analysis(df, column):
    all_scores = []
    for b in range(0, len(df), batch_size):
        batch_texts = df[column].iloc[b:b + batch_size].tolist()

        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
            add_special_tokens=True
        )

        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

        for p in probs:
            score_dict = {}
            for i, label_key in enumerate(labels.keys()):
                emotion_name = labels[label_key]
                score_dict[emotion_name] = float(p[i])
            all_scores.append(score_dict)

    df_scores = pd.DataFrame(all_scores)
    return pd.concat([df.reset_index(drop=True), df_scores], axis=1)

In [5]:
def combine_chunks(df, text, group, metadata):
    df['chunk_tokens'] = df[text].apply(
        lambda x: len(tokenizer.encode(str(x), add_special_tokens=False))
    )

    weighted_rows = []
    for group_id, group_df in df.groupby(group):
        total_tokens = group_df['chunk_tokens'].sum()
        row = {group: group_id}

        for emo in emotions:
            row[emo] = (
                (group_df[emo] * group_df['chunk_tokens']).sum() / total_tokens
                if total_tokens > 0 else np.nan
            )

        weighted_rows.append(row)

    weighed_df = pd.DataFrame(weighted_rows)
    weighed_df['dominant_emotion'] = weighed_df[emotions].idxmax(axis=1)
    meta_df = df[metadata].drop_duplicates(subset=group)
    return weighed_df.merge(meta_df, on=group, how='left')

In [6]:
def descriptive_statistics(df):
    topic_dict = {}
    for topic, group in df.groupby('NewsroomTopic'):
        descriptives = {}
        for e in labels.values():
            descriptives[f"{e}_mean"] = group[e].mean()
            descriptives[f"{e}_std"] = group[e].std()
        topic_dict[topic] = descriptives
        
    table = pd.DataFrame.from_dict(topic_dict, orient='index')
    return table.reset_index().rename(columns={'index': 'NewsroomTopic'})

Articles

In [7]:
articles = pd.read_sql_query("SELECT * FROM Articles", con)
articles_meta = ['ID_Article', 'publishingDate', 'NewsroomTopic']
articles['body_text'] = articles.apply(
    lambda row: str(row['Title']) + " " +
    BeautifulSoup(str(row['Body']), "html.parser").get_text(),
    axis=1)
articles

Unnamed: 0,ID_Article,Path,publishingDate,Title,Body,body_text
0,1,Newsroom/User/Community,2012-05-26 03:00:19.23,Die Newsletter von derStandard.at,"<div class=""section"" id=""content-main"" itempro...",Die Newsletter von derStandard.at Abonnieren S...
1,2,Newsroom/User/Community/Regeln,2012-05-26 12:12:19.46,Werden Sie Teil von derStandard.at!,"<div class=""diashow"" id=""objectContent""><meta ...",Werden Sie Teil von derStandard.at! Werden Sie...
2,3,Diverses/mobil,2013-11-22 12:15:00.00,Die Android App von derStandard.at,"<div class=""section"" id=""content-main"" itempro...",Die Android App von derStandard.at Die Smartph...
3,4,Newsroom/User/mitmachen/Mitreden,2014-08-13 05:30:00.00,Welche Erfahrungen haben Sie als Linkshänder g...,"<div class=""section"" id=""content-main"" itempro...",Welche Erfahrungen haben Sie als Linkshänder g...
4,5,Newsroom/User/mitmachen/Mitreden,2014-08-27 12:27:01.09,Wie haben Sie das Jahr 1989 erlebt?,"<div class=""section"" id=""content-main"" itempro...",Wie haben Sie das Jahr 1989 erlebt? Erzählen S...
...,...,...,...,...,...,...
12082,12083,Newsroom/Kultur/Musikkultur,2016-05-31 16:14:13.00,Max Prosa: Junger Troubadour alter Schule,"<div class=""section"" id=""content-main"" itempro...",Max Prosa: Junger Troubadour alter Schule Von ...
12083,12084,Newsroom/Etat/PRINT/Springer,2016-05-31 17:39:29.00,"""Können Adblocker nicht einfach hinnehmen""","<div class=""section"" id=""content-main"" itempro...","""Können Adblocker nicht einfach hinnehmen"" Med..."
12084,12085,Meinung/Kolumnen/rau,2016-05-31 17:34:54.00,Die Rechten machen Facebook zum Hatebook,"<div class=""section"" id=""content-main"" itempro...",Die Rechten machen Facebook zum Hatebook Die F...
12085,12086,Newsroom/Kultur/Buehne,2016-05-31 18:08:20.00,"""Wrestling Rita"": Feministischer Punktsieg im ...","<div class=""section"" id=""content-main"" itempro...","""Wrestling Rita"": Feministischer Punktsieg im ..."


In [8]:
articles['MainPath'] = articles['Path'].apply(lambda x: str(x).split('/')[0])
articles = articles[articles['MainPath'] == 'Newsroom']
articles['NewsroomTopic'] = articles['Path'].apply(lambda x: str(x).split('/')[1])

articles = articles.loc[~articles['Title'].str.contains('derStandard.at', case=False, na=False)]

articles['num_tokens'] = articles['body_text'].apply(lambda x: len(tokenizer.encode(str(x), add_special_tokens=False)))
articles = articles[articles['num_tokens'] >= overlap]

articles = articles[['ID_Article', 'publishingDate', 'body_text', 'NewsroomTopic']]
articles

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articles['NewsroomTopic'] = articles['Path'].apply(lambda x: str(x).split('/')[1])
Token indices sequence length is longer than the specified maximum sequence length for this model (1550 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,ID_Article,publishingDate,body_text,NewsroomTopic
3,4,2014-08-13 05:30:00.00,Welche Erfahrungen haben Sie als Linkshänder g...,User
4,5,2014-08-27 12:27:01.09,Wie haben Sie das Jahr 1989 erlebt? Erzählen S...,User
5,6,2014-09-30 09:56:00.00,Ihre schlimmsten Hotelerlebnisse Der Sommer is...,User
6,7,2014-09-26 14:00:11.00,"Keine Scheu vor der Community! Diskutieren, wo...",User
7,8,2014-11-13 10:43:36.00,"Was wollten Sie als Kind werden? ""Ich will Feu...",User
...,...,...,...,...
12080,12081,2016-05-31 17:07:43.00,Niederösterreich: Zollfahnder beschlagnahmten ...,Panorama
12082,12083,2016-05-31 16:14:13.00,Max Prosa: Junger Troubadour alter Schule Von ...,Kultur
12083,12084,2016-05-31 17:39:29.00,"""Können Adblocker nicht einfach hinnehmen"" Med...",Etat
12085,12086,2016-05-31 18:08:20.00,"""Wrestling Rita"": Feministischer Punktsieg im ...",Kultur


In [9]:
articles = chunk_text(articles, 'body_text')
articles

Unnamed: 0,ID_Article,publishingDate,body_text,NewsroomTopic
0,4,2014-08-13 05:30:00.00,Welche Erfahrungen haben Sie als Linkshänder g...,User
1,5,2014-08-27 12:27:01.09,Wie haben Sie das Jahr 1989 erlebt? Erzählen S...,User
2,6,2014-09-30 09:56:00.00,Ihre schlimmsten Hotelerlebnisse Der Sommer is...,User
3,7,2014-09-26 14:00:11.00,"Keine Scheu vor der Community! Diskutieren, wo...",User
4,7,2014-09-26 14:00:11.00,3) – übrigens schaffte es das Ursprungsposting...,User
...,...,...,...,...
19055,12084,2016-05-31 17:39:29.00,Hier ist ein signifikanter sechsstelliger Betr...,Etat
19056,12084,2016-05-31 17:39:29.00,"auf rund 3,3 Milliarden Euro. Das digitale Ges...",Etat
19057,12086,2016-05-31 18:08:20.00,"""Wrestling Rita"": Feministischer Punktsieg im ...",Kultur
19058,12086,2016-05-31 18:08:20.00,"""Humungus"" Hradil, Mitbegründer der Wrestling ...",Kultur


In [10]:
articles = sentiment_analysis(articles, 'body_text')
articles

Consider using tensor.detach() first. (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\pytorch\torch\csrc\autograd\generated\python_variable_methods.cpp:837.)
  score_dict[emotion_name] = float(p[i])


Unnamed: 0,ID_Article,publishingDate,body_text,NewsroomTopic,Anger,Fear,Disgust,Sadness,Joy,None
0,4,2014-08-13 05:30:00.00,Welche Erfahrungen haben Sie als Linkshänder g...,User,0.000047,0.999862,0.000017,0.000024,0.000017,0.000033
1,5,2014-08-27 12:27:01.09,Wie haben Sie das Jahr 1989 erlebt? Erzählen S...,User,0.000417,0.993431,0.000040,0.005090,0.000076,0.000946
2,6,2014-09-30 09:56:00.00,Ihre schlimmsten Hotelerlebnisse Der Sommer is...,User,0.000465,0.999214,0.000106,0.000086,0.000019,0.000109
3,7,2014-09-26 14:00:11.00,"Keine Scheu vor der Community! Diskutieren, wo...",User,0.000162,0.999710,0.000046,0.000017,0.000016,0.000048
4,7,2014-09-26 14:00:11.00,3) – übrigens schaffte es das Ursprungsposting...,User,0.000044,0.999866,0.000015,0.000021,0.000017,0.000037
...,...,...,...,...,...,...,...,...,...,...
19055,12084,2016-05-31 17:39:29.00,Hier ist ein signifikanter sechsstelliger Betr...,Etat,0.000188,0.999511,0.000014,0.000031,0.000024,0.000232
19056,12084,2016-05-31 17:39:29.00,"auf rund 3,3 Milliarden Euro. Das digitale Ges...",Etat,0.046454,0.016545,0.000133,0.005761,0.005960,0.925146
19057,12086,2016-05-31 18:08:20.00,"""Wrestling Rita"": Feministischer Punktsieg im ...",Kultur,0.026282,0.969193,0.001090,0.001196,0.000115,0.002123
19058,12086,2016-05-31 18:08:20.00,"""Humungus"" Hradil, Mitbegründer der Wrestling ...",Kultur,0.695502,0.030448,0.002946,0.012074,0.001049,0.257981


In [11]:
articles = combine_chunks(articles, 'body_text', 'ID_Article', articles_meta)
articles

Unnamed: 0,ID_Article,Anger,Fear,Disgust,Sadness,Joy,None,dominant_emotion,publishingDate,NewsroomTopic
0,4,0.000047,0.999862,0.000017,0.000024,0.000017,0.000033,Fear,2014-08-13 05:30:00.00,User
1,5,0.000417,0.993431,0.000040,0.005090,0.000076,0.000946,Fear,2014-08-27 12:27:01.09,User
2,6,0.000465,0.999214,0.000106,0.000086,0.000019,0.000109,Fear,2014-09-30 09:56:00.00,User
3,7,0.000120,0.999755,0.000033,0.000022,0.000017,0.000053,Fear,2014-09-26 14:00:11.00,User
4,8,0.000518,0.998829,0.000021,0.000210,0.000037,0.000385,Fear,2014-11-13 10:43:36.00,User
...,...,...,...,...,...,...,...,...,...,...
10280,12081,0.679481,0.104901,0.029647,0.004698,0.002477,0.178796,Anger,2016-05-31 17:07:43.00,Panorama
10281,12083,0.071449,0.541087,0.000829,0.081418,0.002716,0.302501,Fear,2016-05-31 16:14:13.00,Kultur
10282,12084,0.002525,0.948069,0.000018,0.000341,0.000330,0.048716,Fear,2016-05-31 17:39:29.00,Etat
10283,12086,0.216954,0.701729,0.001619,0.004295,0.000381,0.075021,Fear,2016-05-31 18:08:20.00,Kultur


In [12]:
articles[list(labels.values())].agg(['mean', 'std', 'min', 'max'])

Unnamed: 0,Anger,Fear,Disgust,Sadness,Joy,None
mean,0.114215,0.55648,0.006595,0.026078,0.017681,0.278951
std,0.182432,0.423747,0.042423,0.098105,0.08856,0.333005
min,2.4e-05,3.5e-05,8e-06,1.3e-05,1e-05,1.8e-05
max,0.972973,0.999905,0.969769,0.999524,0.999088,0.996742


In [13]:
articles['dominant_emotion'].value_counts()

dominant_emotion
Fear       6143
None       2996
Anger       811
Sadness     171
Joy         126
Disgust      38
Name: count, dtype: int64

In [14]:
descriptive_statistics(articles)

Unnamed: 0,NewsroomTopic,Anger_mean,Anger_std,Fear_mean,Fear_std,Disgust_mean,Disgust_std,Sadness_mean,Sadness_std,Joy_mean,Joy_std,None_mean,None_std
0,Etat,0.155756,0.227251,0.387891,0.4199,0.006659,0.049958,0.02659,0.097378,0.020485,0.09272,0.402619,0.378652
1,Inland,0.148276,0.205923,0.580744,0.402604,0.001146,0.007498,0.006993,0.04638,0.007355,0.050355,0.255486,0.317245
2,International,0.096814,0.147663,0.650862,0.394857,0.007978,0.039559,0.018404,0.073167,0.003293,0.021249,0.222649,0.292508
3,Kultur,0.097575,0.188093,0.589782,0.425044,0.006063,0.035158,0.085896,0.218698,0.01876,0.089542,0.201923,0.315508
4,Panorama,0.093236,0.146296,0.657816,0.387466,0.023849,0.085822,0.034329,0.112026,0.009913,0.075088,0.180857,0.250023
5,Sport,0.099133,0.136719,0.263098,0.360893,0.001497,0.010355,0.070587,0.138173,0.076447,0.177981,0.489238,0.3364
6,User,0.196836,0.329236,0.654835,0.424453,0.001394,0.00997,0.00907,0.044216,0.034862,0.156459,0.103003,0.231855
7,Web,0.112246,0.195664,0.565874,0.431371,0.002162,0.019602,0.006046,0.041603,0.01255,0.066053,0.301121,0.363224
8,Wirtschaft,0.154346,0.210707,0.550272,0.421438,0.000432,0.000945,0.006895,0.038794,0.003407,0.024152,0.284648,0.325794
9,Wissenschaft,0.048916,0.111778,0.71402,0.394596,0.002066,0.01193,0.012909,0.055221,0.012303,0.076752,0.209786,0.321799


Posts

In [15]:
posts = pd.read_sql_query("SELECT * FROM Posts", con)
posts_meta = ['ID_Post', 'ID_Article', 'CreatedAt', 'NewsroomTopic']
posts['NewsroomTopic'] = posts['ID_Article'].map(articles.set_index('ID_Article')['NewsroomTopic'])
posts

Unnamed: 0,ID_Post,ID_Parent_Post,ID_Article,ID_User,CreatedAt,Status,Headline,Body,PositiveVotes,NegativeVotes,NewsroomTopic
0,1,,1,9089,2003-04-23 14:52:41.870,deleted,,,0,0,
1,2,,1,29367,2003-11-04 16:21:57.850,online,"Newsletter ""DER STANDARD""",Ich bin begeistert von den STANDARD - Newslett...,0,0,
2,3,2.0,1,5095,2004-01-28 12:57:28.240,deleted,Auch begeistert...,... Aber momentan funktioniert das Abmelden od...,0,0,
3,4,3.0,1,1682,2004-02-03 20:32:39.123,deleted,Abmeldeprobleme,Es ist ganz einfach nervend!\r\nVor kurzem hab...,0,0,
4,5,,1,3343,2004-03-02 11:37:44.100,online,,und sie als mitarbeiter sind natuerlich objektiv,0,0,
...,...,...,...,...,...,...,...,...,...,...,...
1011768,1011769,1011764.0,12087,6355,2016-06-01 23:10:29.003,online,,zwischen der beendigung eines arbeitsverhältni...,0,0,Kultur
1011769,1011770,1011765.0,12087,6355,2016-06-01 23:11:14.790,online,,du sagst es ja im namen selbst: wegen eindicku...,1,0,Kultur
1011770,1011771,1011770.0,12087,27023,2016-06-02 08:16:56.690,online,,Was genau haben Sie denn nicht verstanden? Ich...,0,1,Kultur
1011771,1011772,1011769.0,12087,19159,2016-06-02 09:12:09.993,online,,irgendwie widersprechen Sie sich in Ihrem Post...,1,0,Kultur


In [16]:
posts = posts[posts['ID_Article'].isin(articles['ID_Article'])]
posts = posts.dropna()
posts = posts[posts['Body'].astype(str).str.strip() != '']
posts = posts[['ID_Post','ID_Article','CreatedAt','Body', 'NewsroomTopic']]
posts

Unnamed: 0,ID_Post,ID_Article,CreatedAt,Body,NewsroomTopic
190,191,4,2014-08-13 06:29:59.987,Eben diese Erfahrung wollte ich auch teilen. D...,User
196,197,4,2014-08-13 07:23:13.860,...mit der Linkshändigkeit zu tun?,User
197,198,4,2014-08-13 07:26:24.743,...der Klingen? Und präzises Schneiden mit ein...,User
200,201,4,2014-08-13 07:33:33.407,"Ich kenn deutlich mehr Linkshänder, die aufm S...",User
201,202,4,2014-08-13 07:34:11.377,"# Besteckhalten mach ich genauso, also ""einhän...",User
...,...,...,...,...,...
1010855,1010856,12078,2016-06-02 08:20:08.213,Ein Wirtschaftsstudium hätte Ihnen auch nicht ...,Inland
1010858,1010859,12078,2016-06-02 13:55:13.073,Ich hasse mein dummes 16-jähriges Ich dafür. H...,Inland
1010870,1010871,12079,2016-05-31 22:56:09.220,Was genau ist dein Problem?,Web
1011762,1011763,12087,2016-05-31 23:47:32.010,"und dafür immer Argumente sucht, es nicht zu m...",Kultur


In [17]:
posts = sentiment_analysis(posts,'Body')
posts['dominant_emotion'] = posts[emotions].idxmax(axis=1)
posts

Unnamed: 0,ID_Post,ID_Article,CreatedAt,Body,NewsroomTopic,Anger,Fear,Disgust,Sadness,Joy,None,dominant_emotion
0,191,4,2014-08-13 06:29:59.987,Eben diese Erfahrung wollte ich auch teilen. D...,User,0.000120,0.999689,0.000045,0.000047,0.000020,0.000079,Fear
1,197,4,2014-08-13 07:23:13.860,...mit der Linkshändigkeit zu tun?,User,0.989551,0.000133,0.000688,0.000847,0.000460,0.008320,Anger
2,198,4,2014-08-13 07:26:24.743,...der Klingen? Und präzises Schneiden mit ein...,User,0.108701,0.002560,0.018449,0.005487,0.001450,0.863353,
3,201,4,2014-08-13 07:33:33.407,"Ich kenn deutlich mehr Linkshänder, die aufm S...",User,0.430212,0.008964,0.002008,0.366200,0.002447,0.190168,Anger
4,202,4,2014-08-13 07:34:11.377,"# Besteckhalten mach ich genauso, also ""einhän...",User,0.981684,0.000288,0.000668,0.001571,0.000484,0.015305,Anger
...,...,...,...,...,...,...,...,...,...,...,...,...
132116,1010856,12078,2016-06-02 08:20:08.213,Ein Wirtschaftsstudium hätte Ihnen auch nicht ...,Inland,0.325956,0.000242,0.000464,0.005642,0.053380,0.614316,
132117,1010859,12078,2016-06-02 13:55:13.073,Ich hasse mein dummes 16-jähriges Ich dafür. H...,Inland,0.072507,0.066935,0.003864,0.849980,0.000343,0.006371,Sadness
132118,1010871,12079,2016-05-31 22:56:09.220,Was genau ist dein Problem?,Web,0.386056,0.000832,0.000550,0.003874,0.001406,0.607282,
132119,1011763,12087,2016-05-31 23:47:32.010,"und dafür immer Argumente sucht, es nicht zu m...",Kultur,0.995209,0.000364,0.000361,0.000621,0.000286,0.003159,Anger


In [18]:
posts[list(labels.values())].agg(['mean', 'std', 'min', 'max'])

Unnamed: 0,Anger,Fear,Disgust,Sadness,Joy,None
mean,0.491233,0.213216,0.013411,0.023097,0.030817,0.228227
std,0.42612,0.390424,0.097055,0.109475,0.150872,0.343662
min,2.2e-05,6e-06,7e-06,1.1e-05,8e-06,1.4e-05
max,0.997274,0.99991,0.99989,0.999747,0.999763,0.998005


In [19]:
posts['dominant_emotion'].value_counts()

dominant_emotion
Anger      67760
None       28543
Fear       28233
Joy         3767
Sadness     2251
Disgust     1567
Name: count, dtype: int64

In [20]:
descriptive_statistics(posts)

Unnamed: 0,NewsroomTopic,Anger_mean,Anger_std,Fear_mean,Fear_std,Disgust_mean,Disgust_std,Sadness_mean,Sadness_std,Joy_mean,Joy_std,None_mean,None_std
0,Etat,0.504402,0.426727,0.186374,0.370279,0.025247,0.139414,0.023276,0.11513,0.032983,0.16098,0.227719,0.346766
1,Inland,0.516361,0.428303,0.206627,0.38621,0.01143,0.089975,0.021041,0.103512,0.02575,0.136413,0.21879,0.339531
2,International,0.49749,0.426647,0.229763,0.400815,0.012268,0.089718,0.021421,0.104662,0.02132,0.124565,0.217739,0.338059
3,Kultur,0.42592,0.424721,0.178754,0.366872,0.017802,0.108524,0.051453,0.177733,0.079928,0.244519,0.246143,0.35744
4,Panorama,0.473676,0.426083,0.240642,0.408875,0.017516,0.11109,0.025044,0.115429,0.0255,0.137973,0.217623,0.337459
5,Sport,0.485871,0.418484,0.130685,0.318983,0.009695,0.077482,0.034133,0.12804,0.059614,0.205682,0.280003,0.363363
6,User,0.438504,0.424003,0.235944,0.40649,0.018902,0.120365,0.039579,0.15524,0.051317,0.199677,0.215754,0.335665
7,Web,0.459208,0.422017,0.203926,0.382629,0.015293,0.106449,0.018432,0.094796,0.042992,0.178551,0.260149,0.360851
8,Wirtschaft,0.52412,0.424994,0.204208,0.38221,0.007894,0.073192,0.01755,0.09349,0.02622,0.13809,0.220008,0.337619
9,Wissenschaft,0.40658,0.415734,0.256275,0.420847,0.018971,0.118306,0.026145,0.114907,0.036376,0.163246,0.255653,0.357227


Prepare File

In [21]:
articles.rename(columns={
    "Anger": "anger_a",
    "Fear": "fear_a",
    "Disgust": "disgust_a",
    "Sadness": "sadness_a",
    "Joy": "joy_a",
    "None": "none_a",
    "dominant_emotion": "dominant_emotion_a"
}, inplace=True)

posts.rename(columns={
    "Anger": "anger_p",
    "Fear": "fear_p",
    "Disgust": "disgust_p",
    "Sadness": "sadness_p",
    "Joy": "joy_p",
    "None": "none_p",
    "dominant_emotion": "dominant_emotion_p"
}, inplace=True)

data = pd.merge(articles, posts, on=["ID_Article", "NewsroomTopic"], how="right")
data["publishingDate"] = pd.to_datetime(data['publishingDate'])
data["CreatedAt"] = pd.to_datetime(data['CreatedAt'])

data = data[["ID_Post" ,"CreatedAt", "anger_p", "fear_p", "disgust_p", "sadness_p", "joy_p", "none_p", "dominant_emotion_p", "NewsroomTopic",
             "ID_Article", "publishingDate", "anger_a", "fear_a", "disgust_a", "sadness_a", "joy_a", "none_a", "dominant_emotion_a"]]
data

Unnamed: 0,ID_Post,CreatedAt,anger_p,fear_p,disgust_p,sadness_p,joy_p,none_p,dominant_emotion_p,NewsroomTopic,ID_Article,publishingDate,anger_a,fear_a,disgust_a,sadness_a,joy_a,none_a,dominant_emotion_a
0,191,2014-08-13 06:29:59.987,0.000120,0.999689,0.000045,0.000047,0.000020,0.000079,Fear,User,4,2014-08-13 05:30:00,0.000047,0.999862,0.000017,0.000024,0.000017,0.000033,Fear
1,197,2014-08-13 07:23:13.860,0.989551,0.000133,0.000688,0.000847,0.000460,0.008320,Anger,User,4,2014-08-13 05:30:00,0.000047,0.999862,0.000017,0.000024,0.000017,0.000033,Fear
2,198,2014-08-13 07:26:24.743,0.108701,0.002560,0.018449,0.005487,0.001450,0.863353,,User,4,2014-08-13 05:30:00,0.000047,0.999862,0.000017,0.000024,0.000017,0.000033,Fear
3,201,2014-08-13 07:33:33.407,0.430212,0.008964,0.002008,0.366200,0.002447,0.190168,Anger,User,4,2014-08-13 05:30:00,0.000047,0.999862,0.000017,0.000024,0.000017,0.000033,Fear
4,202,2014-08-13 07:34:11.377,0.981684,0.000288,0.000668,0.001571,0.000484,0.015305,Anger,User,4,2014-08-13 05:30:00,0.000047,0.999862,0.000017,0.000024,0.000017,0.000033,Fear
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132116,1010856,2016-06-02 08:20:08.213,0.325956,0.000242,0.000464,0.005642,0.053380,0.614316,,Inland,12078,2016-05-31 14:54:45,0.000040,0.999832,0.000011,0.000066,0.000021,0.000029,Fear
132117,1010859,2016-06-02 13:55:13.073,0.072507,0.066935,0.003864,0.849980,0.000343,0.006371,Sadness,Inland,12078,2016-05-31 14:54:45,0.000040,0.999832,0.000011,0.000066,0.000021,0.000029,Fear
132118,1010871,2016-05-31 22:56:09.220,0.386056,0.000832,0.000550,0.003874,0.001406,0.607282,,Web,12079,2016-05-31 09:43:18,0.000158,0.999671,0.000029,0.000017,0.000025,0.000100,Fear
132119,1011763,2016-05-31 23:47:32.010,0.995209,0.000364,0.000361,0.000621,0.000286,0.003159,Anger,Kultur,12087,2016-05-31 15:46:08,0.499322,0.110606,0.000751,0.007877,0.001192,0.380252,Anger


In [None]:
#data.to_csv("combined_sentiments.csv", index=False)