In [1]:
import sqlite3
import pandas as pd
import torch
from bs4 import BeautifulSoup
from transformers import AutoTokenizer
from optimum.intel.openvino import OVModelForSequenceClassification

In [2]:
con = sqlite3.connect('corpus.sqlite3')
model_path = "RobertaGerman"
device = "GPU"
batch_size = 32

In [3]:
max_length = 512
overlap = 128
labels = {
    "LABEL_0": "Anger",
    "LABEL_1": "Fear",
    "LABEL_2": "Disgust",
    "LABEL_3": "Sadness",
    "LABEL_4": "Joy",
    "LABEL_5": "None"
}
emotions = list(labels.values())
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = OVModelForSequenceClassification.from_pretrained(model_path, device=device)


No OpenVINO files were found for RobertaGerman, setting `export=True` to convert the model to the OpenVINO IR. Don't forget to save the resulting model with `.save_pretrained()`
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
  inverted_mask = torch.tensor(1.0, dtype=dtype) - expanded_mask


In [4]:
def chunk_text(df, text_col):
    df[text_col] = df[text_col].str.strip()
    chunked_rows = []
    for _, row in df.iterrows():
        text = str(row[text_col])
        token_ids = tokenizer.encode(text, add_special_tokens=False)

        start = 0
        while start < len(token_ids):
            end = start + max_length
            chunk_ids = token_ids[start:end]
            chunk_text = tokenizer.decode(chunk_ids)

            chunk_row = row.copy()
            chunk_row[text_col] = chunk_text
            chunked_rows.append(chunk_row)

            if end >= len(token_ids):
                break
            start += max_length - overlap
    return pd.DataFrame(chunked_rows).reset_index(drop=True)

In [5]:
def sentiment_analysis(df, column):
    all_scores = []
    for b in range(0, len(df), batch_size):
        batch_texts = df[column].iloc[b:b + batch_size].tolist()

        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
            add_special_tokens=True
        )

        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

        for p in probs:
            score_dict = {}
            for i, label_key in enumerate(labels.keys()):
                emotion_name = labels[label_key]
                score_dict[emotion_name] = float(p[i])
            all_scores.append(score_dict)

    df_scores = pd.DataFrame(all_scores)
    return pd.concat([df.reset_index(drop=True), df_scores], axis=1)


In [6]:
def combine_chunks(df, text, group_col, metadata):
    df['chunk_tokens'] = df[text].apply(
        lambda x: len(tokenizer.encode(str(x), add_special_tokens=False))
    )

    weighted_rows = []
    for group_id, group_df in df.groupby(group_col):
        total_tokens = group_df['chunk_tokens'].sum()
        row = {group_col: group_id}

        for emo in emotions:
            row[emo] = (
                (group_df[emo] * group_df['chunk_tokens']).sum() / total_tokens
                if total_tokens > 0 else np.nan
            )

        weighted_rows.append(row)

    weighed_df = pd.DataFrame(weighted_rows)
    weighed_df['dominant_emotion'] = weighed_df[emotions].idxmax(axis=1)
    meta_df = df[metadata].drop_duplicates(subset=group_col)
    return weighed_df.merge(meta_df, on=group_col, how='left')

In [7]:
def descriptive_statistics(df):
    topic_dict = {}
    for topic, group in df.groupby('NewsroomTopic'):
        descriptives = {}
        for e in labels.values():
            descriptives[f"{e}_mean"] = group[e].mean()
            descriptives[f"{e}_std"] = group[e].std()
        topic_dict[topic] = descriptives
        
    table = pd.DataFrame.from_dict(topic_dict, orient='index')
    return table.reset_index().rename(columns={'index': 'NewsroomTopic'})

Articles

In [8]:
articles = pd.read_sql_query("SELECT * FROM Articles", con)
articles_meta = ['ID_Article', 'publishingDate', 'NewsroomTopic']
articles['body_text'] = articles.apply(
    lambda row: str(row['Title']) + " " +
    BeautifulSoup(str(row['Body']), "html.parser").get_text(),
    axis=1)
articles

Unnamed: 0,ID_Article,Path,publishingDate,Title,Body,body_text
0,1,Newsroom/User/Community,2012-05-26 03:00:19.23,Die Newsletter von derStandard.at,"<div class=""section"" id=""content-main"" itempro...",Die Newsletter von derStandard.at Abonnieren S...
1,2,Newsroom/User/Community/Regeln,2012-05-26 12:12:19.46,Werden Sie Teil von derStandard.at!,"<div class=""diashow"" id=""objectContent""><meta ...",Werden Sie Teil von derStandard.at! Werden Sie...
2,3,Diverses/mobil,2013-11-22 12:15:00.00,Die Android App von derStandard.at,"<div class=""section"" id=""content-main"" itempro...",Die Android App von derStandard.at Die Smartph...
3,4,Newsroom/User/mitmachen/Mitreden,2014-08-13 05:30:00.00,Welche Erfahrungen haben Sie als Linkshänder g...,"<div class=""section"" id=""content-main"" itempro...",Welche Erfahrungen haben Sie als Linkshänder g...
4,5,Newsroom/User/mitmachen/Mitreden,2014-08-27 12:27:01.09,Wie haben Sie das Jahr 1989 erlebt?,"<div class=""section"" id=""content-main"" itempro...",Wie haben Sie das Jahr 1989 erlebt? Erzählen S...
...,...,...,...,...,...,...
12082,12083,Newsroom/Kultur/Musikkultur,2016-05-31 16:14:13.00,Max Prosa: Junger Troubadour alter Schule,"<div class=""section"" id=""content-main"" itempro...",Max Prosa: Junger Troubadour alter Schule Von ...
12083,12084,Newsroom/Etat/PRINT/Springer,2016-05-31 17:39:29.00,"""Können Adblocker nicht einfach hinnehmen""","<div class=""section"" id=""content-main"" itempro...","""Können Adblocker nicht einfach hinnehmen"" Med..."
12084,12085,Meinung/Kolumnen/rau,2016-05-31 17:34:54.00,Die Rechten machen Facebook zum Hatebook,"<div class=""section"" id=""content-main"" itempro...",Die Rechten machen Facebook zum Hatebook Die F...
12085,12086,Newsroom/Kultur/Buehne,2016-05-31 18:08:20.00,"""Wrestling Rita"": Feministischer Punktsieg im ...","<div class=""section"" id=""content-main"" itempro...","""Wrestling Rita"": Feministischer Punktsieg im ..."


In [9]:
articles['MainPath'] = articles['Path'].apply(lambda x: str(x).split('/')[0])
articles = articles[articles['MainPath'] == 'Newsroom'].copy()
articles['NewsroomTopic'] = articles['Path'].apply(lambda x: str(x).split('/')[1])

articles = articles.loc[~articles['Title'].str.contains('derStandard.at', case=False, na=False)]

articles['num_tokens'] = articles['body_text'].apply(lambda x: len(tokenizer.encode(str(x), add_special_tokens=False)))
articles = articles[articles['num_tokens'] >= overlap]

articles = articles[['ID_Article', 'publishingDate', 'body_text', 'NewsroomTopic']]
articles

Token indices sequence length is longer than the specified maximum sequence length for this model (1550 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,ID_Article,publishingDate,body_text,NewsroomTopic
3,4,2014-08-13 05:30:00.00,Welche Erfahrungen haben Sie als Linkshänder g...,User
4,5,2014-08-27 12:27:01.09,Wie haben Sie das Jahr 1989 erlebt? Erzählen S...,User
5,6,2014-09-30 09:56:00.00,Ihre schlimmsten Hotelerlebnisse Der Sommer is...,User
6,7,2014-09-26 14:00:11.00,"Keine Scheu vor der Community! Diskutieren, wo...",User
7,8,2014-11-13 10:43:36.00,"Was wollten Sie als Kind werden? ""Ich will Feu...",User
...,...,...,...,...
12080,12081,2016-05-31 17:07:43.00,Niederösterreich: Zollfahnder beschlagnahmten ...,Panorama
12082,12083,2016-05-31 16:14:13.00,Max Prosa: Junger Troubadour alter Schule Von ...,Kultur
12083,12084,2016-05-31 17:39:29.00,"""Können Adblocker nicht einfach hinnehmen"" Med...",Etat
12085,12086,2016-05-31 18:08:20.00,"""Wrestling Rita"": Feministischer Punktsieg im ...",Kultur


In [10]:
articles = chunk_text(articles, 'body_text')
articles


Unnamed: 0,ID_Article,publishingDate,body_text,NewsroomTopic
0,4,2014-08-13 05:30:00.00,Welche Erfahrungen haben Sie als Linkshänder g...,User
1,5,2014-08-27 12:27:01.09,Wie haben Sie das Jahr 1989 erlebt? Erzählen S...,User
2,6,2014-09-30 09:56:00.00,Ihre schlimmsten Hotelerlebnisse Der Sommer is...,User
3,7,2014-09-26 14:00:11.00,"Keine Scheu vor der Community! Diskutieren, wo...",User
4,7,2014-09-26 14:00:11.00,Inhalten fluten (Bild 2). Doch wie reagiert ma...,User
...,...,...,...,...
19801,12084,2016-05-31 17:39:29.00,"nicht erwarten, dass sie weder Geld zahlen noc...",Etat
19802,12084,2016-05-31 17:39:29.00,igenden Ergebnissen hinter uns. Verwertungsges...,Etat
19803,12086,2016-05-31 18:08:20.00,"""Wrestling Rita"": Feministischer Punktsieg im ...",Kultur
19804,12086,2016-05-31 18:08:20.00,"seinen, und schon gar nicht unterstützt er ihr...",Kultur


In [11]:
articles = sentiment_analysis(articles, 'body_text')
articles

Unnamed: 0,ID_Article,publishingDate,body_text,NewsroomTopic,Anger,Fear,Disgust,Sadness,Joy,None
0,4,2014-08-13 05:30:00.00,Welche Erfahrungen haben Sie als Linkshänder g...,User,0.000047,0.999862,0.000017,0.000024,0.000017,0.000034
1,5,2014-08-27 12:27:01.09,Wie haben Sie das Jahr 1989 erlebt? Erzählen S...,User,0.000420,0.993392,0.000040,0.005122,0.000077,0.000949
2,6,2014-09-30 09:56:00.00,Ihre schlimmsten Hotelerlebnisse Der Sommer is...,User,0.000467,0.999213,0.000107,0.000086,0.000019,0.000109
3,7,2014-09-26 14:00:11.00,"Keine Scheu vor der Community! Diskutieren, wo...",User,0.000164,0.999707,0.000046,0.000018,0.000016,0.000049
4,7,2014-09-26 14:00:11.00,Inhalten fluten (Bild 2). Doch wie reagiert ma...,User,0.000050,0.999863,0.000022,0.000017,0.000014,0.000034
...,...,...,...,...,...,...,...,...,...,...
19801,12084,2016-05-31 17:39:29.00,"nicht erwarten, dass sie weder Geld zahlen noc...",Etat,0.000071,0.999852,0.000016,0.000014,0.000014,0.000034
19802,12084,2016-05-31 17:39:29.00,igenden Ergebnissen hinter uns. Verwertungsges...,Etat,0.003822,0.983958,0.000045,0.000407,0.000177,0.011590
19803,12086,2016-05-31 18:08:20.00,"""Wrestling Rita"": Feministischer Punktsieg im ...",Kultur,0.025573,0.969986,0.001066,0.001175,0.000113,0.002087
19804,12086,2016-05-31 18:08:20.00,"seinen, und schon gar nicht unterstützt er ihr...",Kultur,0.529184,0.293101,0.001731,0.014493,0.001042,0.160449


In [12]:
articles = combine_chunks(articles, 'body_text', 'ID_Article', articles_meta)
articles

Unnamed: 0,ID_Article,Anger,Fear,Disgust,Sadness,Joy,None,dominant_emotion,publishingDate,NewsroomTopic
0,4,0.000047,0.999862,0.000017,0.000024,0.000017,0.000034,Fear,2014-08-13 05:30:00.00,User
1,5,0.000420,0.993392,0.000040,0.005122,0.000077,0.000949,Fear,2014-08-27 12:27:01.09,User
2,6,0.000467,0.999213,0.000107,0.000086,0.000019,0.000109,Fear,2014-09-30 09:56:00.00,User
3,7,0.000093,0.999797,0.000035,0.000019,0.000016,0.000040,Fear,2014-09-26 14:00:11.00,User
4,8,0.000512,0.998840,0.000021,0.000208,0.000037,0.000383,Fear,2014-11-13 10:43:36.00,User
...,...,...,...,...,...,...,...,...,...,...
10177,12081,0.678981,0.106386,0.030008,0.004693,0.002463,0.177469,Anger,2016-05-31 17:07:43.00,Panorama
10178,12083,0.066811,0.495182,0.000749,0.072862,0.001473,0.362923,Fear,2016-05-31 16:14:13.00,Kultur
10179,12084,0.000669,0.997343,0.000017,0.000094,0.000043,0.001833,Fear,2016-05-31 17:39:29.00,Etat
10180,12086,0.198609,0.737415,0.001294,0.005751,0.000432,0.056499,Fear,2016-05-31 18:08:20.00,Kultur


In [13]:
articles[list(labels.values())].agg(['mean', 'std', 'min', 'max'])

Unnamed: 0,Anger,Fear,Disgust,Sadness,Joy,None
mean,0.110376,0.568691,0.006578,0.025298,0.017443,0.271614
std,0.178378,0.422843,0.042703,0.096364,0.088127,0.331586
min,2.7e-05,4.1e-05,8e-06,1.3e-05,1e-05,1.8e-05
max,0.972735,0.999898,0.970016,0.999526,0.999089,0.996737


In [14]:
articles['dominant_emotion'].value_counts()

dominant_emotion
Fear       6189
None       2907
Anger       766
Sadness     159
Joy         121
Disgust      40
Name: count, dtype: int64

In [15]:
descriptive_statistics(articles)

Unnamed: 0,NewsroomTopic,Anger_mean,Anger_std,Fear_mean,Fear_std,Disgust_mean,Disgust_std,Sadness_mean,Sadness_std,Joy_mean,Joy_std,None_mean,None_std
0,Etat,0.152833,0.223984,0.400803,0.424294,0.006679,0.050491,0.024742,0.093174,0.019134,0.089318,0.39581,0.380711
1,Inland,0.14268,0.199555,0.595482,0.397745,0.001333,0.010509,0.006866,0.046121,0.006895,0.04972,0.246744,0.313543
2,International,0.093335,0.143491,0.661256,0.392464,0.007856,0.039432,0.017167,0.069673,0.003099,0.020709,0.217288,0.292304
3,Kultur,0.095454,0.1826,0.599085,0.419259,0.005882,0.033076,0.08073,0.211247,0.019343,0.088952,0.199506,0.311741
4,Panorama,0.090984,0.143287,0.665377,0.385546,0.023785,0.086455,0.033464,0.110797,0.009763,0.075584,0.176626,0.247844
5,Sport,0.096744,0.134343,0.272194,0.364558,0.001486,0.010516,0.070188,0.138693,0.076198,0.177616,0.48319,0.336161
6,User,0.180953,0.325564,0.689739,0.414789,0.000535,0.001506,0.010331,0.05951,0.035801,0.161096,0.082642,0.204544
7,Web,0.109433,0.194587,0.574128,0.433709,0.002157,0.019769,0.00603,0.041845,0.012601,0.066163,0.295652,0.364018
8,Wirtschaft,0.14759,0.204827,0.570235,0.418626,0.000419,0.000911,0.00712,0.040092,0.003465,0.024614,0.271172,0.321876
9,Wissenschaft,0.045748,0.107194,0.725057,0.394501,0.001969,0.011736,0.012681,0.053146,0.012416,0.076932,0.202129,0.323118


Posts

In [None]:
posts = pd.read_sql_query("SELECT * FROM Posts", con)
posts_meta = ['ID_Post', 'ID_Article', 'CreatedAt', 'NewsroomTopic']
posts['NewsroomTopic'] = posts['ID_Article'].map(articles.set_index('ID_Article')['NewsroomTopic'])
posts

Unnamed: 0,ID_Post,ID_Parent_Post,ID_Article,ID_User,CreatedAt,Status,Headline,Body,PositiveVotes,NegativeVotes,NewsroomTopic
0,1,,1,9089,2003-04-23 14:52:41.870,deleted,,,0,0,
1,2,,1,29367,2003-11-04 16:21:57.850,online,"Newsletter ""DER STANDARD""",Ich bin begeistert von den STANDARD - Newslett...,0,0,
2,3,2.0,1,5095,2004-01-28 12:57:28.240,deleted,Auch begeistert...,... Aber momentan funktioniert das Abmelden od...,0,0,
3,4,3.0,1,1682,2004-02-03 20:32:39.123,deleted,Abmeldeprobleme,Es ist ganz einfach nervend!\r\nVor kurzem hab...,0,0,
4,5,,1,3343,2004-03-02 11:37:44.100,online,,und sie als mitarbeiter sind natuerlich objektiv,0,0,
...,...,...,...,...,...,...,...,...,...,...,...
1011768,1011769,1011764.0,12087,6355,2016-06-01 23:10:29.003,online,,zwischen der beendigung eines arbeitsverhältni...,0,0,Kultur
1011769,1011770,1011765.0,12087,6355,2016-06-01 23:11:14.790,online,,du sagst es ja im namen selbst: wegen eindicku...,1,0,Kultur
1011770,1011771,1011770.0,12087,27023,2016-06-02 08:16:56.690,online,,Was genau haben Sie denn nicht verstanden? Ich...,0,1,Kultur
1011771,1011772,1011769.0,12087,19159,2016-06-02 09:12:09.993,online,,irgendwie widersprechen Sie sich in Ihrem Post...,1,0,Kultur


In [17]:
posts = posts[posts['ID_Article'].isin(articles['ID_Article'])]
posts = posts.dropna()
posts = posts[posts['Body'].astype(str).str.strip() != '']
posts

Unnamed: 0,ID_Post,ID_Parent_Post,ID_Article,ID_User,CreatedAt,Status,Headline,Body,PositiveVotes,NegativeVotes,NewsroomTopic
190,191,190.0,4,27461,2014-08-13 06:29:59.987,online,,Eben diese Erfahrung wollte ich auch teilen. D...,6,0,User
196,197,194.0,4,10467,2014-08-13 07:23:13.860,online,Was hat der Stance...,...mit der Linkshändigkeit zu tun?,1,0,User
197,198,190.0,4,10467,2014-08-13 07:26:24.743,online,Abgesehen von der Griffergonomie ist eine reiß...,...der Klingen? Und präzises Schneiden mit ein...,0,0,User
200,201,197.0,4,15236,2014-08-13 07:33:33.407,online,"Nicht alles, aber einen kleinen Zusammenhang g...","Ich kenn deutlich mehr Linkshänder, die aufm S...",1,0,User
201,202,193.0,4,26471,2014-08-13 07:34:11.377,online,,"# Besteckhalten mach ich genauso, also ""einhän...",1,0,User
...,...,...,...,...,...,...,...,...,...,...,...
1010855,1010856,1010846.0,12078,6840,2016-06-02 08:20:08.213,online,"""das verdünnt doch die Fixkosten""",Ein Wirtschaftsstudium hätte Ihnen auch nicht ...,0,0,Inland
1010858,1010859,1010858.0,12078,12880,2016-06-02 13:55:13.073,online,Fortsetzung,Ich hasse mein dummes 16-jähriges Ich dafür. H...,0,0,Inland
1010870,1010871,1010868.0,12079,14509,2016-05-31 22:56:09.220,online,@ dark passenger,Was genau ist dein Problem?,0,0,Web
1011762,1011763,1011761.0,12087,4863,2016-05-31 23:47:32.010,online,"Trotzdem ist es nicht uninteressant, dass in d...","und dafür immer Argumente sucht, es nicht zu m...",10,1,Kultur


In [None]:
posts = posts[posts['ID_Article'].isin(articles['ID_Article'])]
posts = posts.dropna()
posts = posts[posts['Body'].astype(str).str.strip() != '']

posts = chunk_text(posts, 'Body')
posts = posts[['ID_Post','ID_Article','CreatedAt','Body', 'NewsroomTopic']]
posts

Unnamed: 0,ID_Post,ID_Article,CreatedAt,Body
0,191,4,2014-08-13 06:29:59.987,Eben diese Erfahrung wollte ich auch teilen. D...
1,197,4,2014-08-13 07:23:13.860,...mit der Linkshändigkeit zu tun?
2,198,4,2014-08-13 07:26:24.743,...der Klingen? Und präzises Schneiden mit ein...
3,201,4,2014-08-13 07:33:33.407,"Ich kenn deutlich mehr Linkshänder, die aufm S..."
4,202,4,2014-08-13 07:34:11.377,"# Besteckhalten mach ich genauso, also ""einhän..."
...,...,...,...,...
131543,1010856,12078,2016-06-02 08:20:08.213,Ein Wirtschaftsstudium hätte Ihnen auch nicht ...
131544,1010859,12078,2016-06-02 13:55:13.073,Ich hasse mein dummes 16-jähriges Ich dafür. H...
131545,1010871,12079,2016-05-31 22:56:09.220,Was genau ist dein Problem?
131546,1011763,12087,2016-05-31 23:47:32.010,"und dafür immer Argumente sucht, es nicht zu m..."


In [19]:
posts = sentiment_analysis(posts,'Body')
posts

Unnamed: 0,ID_Post,ID_Article,CreatedAt,Body,Anger,Fear,Disgust,Sadness,Joy,None
0,191,4,2014-08-13 06:29:59.987,Eben diese Erfahrung wollte ich auch teilen. D...,0.000121,0.999683,0.000045,0.000048,0.000020,0.000083
1,197,4,2014-08-13 07:23:13.860,...mit der Linkshändigkeit zu tun?,0.989465,0.000134,0.000689,0.000854,0.000463,0.008395
2,198,4,2014-08-13 07:26:24.743,...der Klingen? Und präzises Schneiden mit ein...,0.107656,0.002562,0.018382,0.005423,0.001437,0.864540
3,201,4,2014-08-13 07:33:33.407,"Ich kenn deutlich mehr Linkshänder, die aufm S...",0.432072,0.009020,0.002036,0.364196,0.002437,0.190239
4,202,4,2014-08-13 07:34:11.377,"# Besteckhalten mach ich genauso, also ""einhän...",0.982516,0.000362,0.000630,0.001408,0.000440,0.014644
...,...,...,...,...,...,...,...,...,...,...
131543,1010856,12078,2016-06-02 08:20:08.213,Ein Wirtschaftsstudium hätte Ihnen auch nicht ...,0.312902,0.000244,0.000468,0.005681,0.058506,0.622199
131544,1010859,12078,2016-06-02 13:55:13.073,Ich hasse mein dummes 16-jähriges Ich dafür. H...,0.071200,0.065593,0.003848,0.852698,0.000342,0.006319
131545,1010871,12079,2016-05-31 22:56:09.220,Was genau ist dein Problem?,0.391645,0.000826,0.000555,0.003912,0.001411,0.601651
131546,1011763,12087,2016-05-31 23:47:32.010,"und dafür immer Argumente sucht, es nicht zu m...",0.995226,0.000362,0.000358,0.000621,0.000283,0.003149


In [25]:
posts = combine_chunks(posts, 'Body', 'ID_Post', posts_meta)
posts

Unnamed: 0,ID_Post,Anger,Fear,Disgust,Sadness,Joy,None,dominant_emotion,ID_Article,CreatedAt,NewsroomTopic
0,191,0.000121,0.999683,0.000045,0.000048,0.000020,0.000083,Fear,4,2014-08-13 06:29:59.987,User
1,197,0.989465,0.000134,0.000689,0.000854,0.000463,0.008395,Anger,4,2014-08-13 07:23:13.860,User
2,198,0.107656,0.002562,0.018382,0.005423,0.001437,0.864540,,4,2014-08-13 07:26:24.743,User
3,201,0.432072,0.009020,0.002036,0.364196,0.002437,0.190239,Anger,4,2014-08-13 07:33:33.407,User
4,202,0.982516,0.000362,0.000630,0.001408,0.000440,0.014644,Anger,4,2014-08-13 07:34:11.377,User
...,...,...,...,...,...,...,...,...,...,...,...
131543,1010856,0.312902,0.000244,0.000468,0.005681,0.058506,0.622199,,12078,2016-06-02 08:20:08.213,Inland
131544,1010859,0.071200,0.065593,0.003848,0.852698,0.000342,0.006319,Sadness,12078,2016-06-02 13:55:13.073,Inland
131545,1010871,0.391645,0.000826,0.000555,0.003912,0.001411,0.601651,,12079,2016-05-31 22:56:09.220,Web
131546,1011763,0.995226,0.000362,0.000358,0.000621,0.000283,0.003149,Anger,12087,2016-05-31 23:47:32.010,Kultur


In [26]:
posts[list(labels.values())].agg(['mean', 'std', 'min', 'max'])

Unnamed: 0,Anger,Fear,Disgust,Sadness,Joy,None
mean,0.490654,0.214464,0.013468,0.023225,0.030886,0.227303
std,0.426129,0.391192,0.097253,0.109986,0.151105,0.343132
min,2.2e-05,6e-06,7e-06,1.1e-05,8e-06,1.4e-05
max,0.997273,0.99991,0.999891,0.999748,0.999762,0.998009


In [27]:
posts['dominant_emotion'].value_counts()

dominant_emotion
Anger      67382
None       28326
Fear       28255
Joy         3764
Sadness     2260
Disgust     1561
Name: count, dtype: int64

In [28]:
descriptive_statistics(posts)

Unnamed: 0,NewsroomTopic,Anger_mean,Anger_std,Fear_mean,Fear_std,Disgust_mean,Disgust_std,Sadness_mean,Sadness_std,Joy_mean,Joy_std,None_mean,None_std
0,Etat,0.504483,0.427121,0.187954,0.371552,0.025028,0.138837,0.023103,0.114507,0.032854,0.160862,0.226578,0.346374
1,Inland,0.515198,0.428397,0.208383,0.387301,0.01152,0.090442,0.021169,0.104062,0.025712,0.136281,0.218018,0.33913
2,International,0.497382,0.426739,0.230975,0.401547,0.012227,0.08937,0.021544,0.105336,0.021172,0.124086,0.2167,0.337523
3,Kultur,0.425151,0.424599,0.180436,0.367984,0.017941,0.109199,0.05134,0.177846,0.080054,0.244554,0.245077,0.356943
4,Panorama,0.472868,0.426056,0.242127,0.409675,0.017538,0.111084,0.025079,0.115714,0.025525,0.138055,0.216863,0.336992
5,Sport,0.48624,0.418473,0.131237,0.319695,0.0099,0.078791,0.034553,0.129373,0.059907,0.206462,0.278164,0.362573
6,User,0.437847,0.4239,0.237139,0.407333,0.018975,0.12048,0.039673,0.155151,0.051437,0.200072,0.214929,0.335287
7,Web,0.458668,0.421836,0.204901,0.383174,0.015349,0.106505,0.018557,0.095068,0.043257,0.179126,0.259267,0.360232
8,Wirtschaft,0.523967,0.424878,0.204658,0.382364,0.007967,0.073559,0.017671,0.093953,0.026469,0.138969,0.219268,0.337036
9,Wissenschaft,0.405912,0.416099,0.258271,0.421806,0.019032,0.118663,0.026585,0.116373,0.036306,0.162802,0.253893,0.356568


Prepare File

In [29]:
articles.rename(columns={
    "Anger": "anger_a",
    "Fear": "fear_a",
    "Disgust": "disgust_a",
    "Sadness": "sadness_a",
    "Joy": "joy_a",
    "None": "none_a",
    "dominant_emotion": "dominant_emotion_a"
}, inplace=True)

posts.rename(columns={
    "Anger": "anger_p",
    "Fear": "fear_p",
    "Disgust": "disgust_p",
    "Sadness": "sadness_p",
    "Joy": "joy_p",
    "None": "none_p",
    "dominant_emotion": "dominant_emotion_p"
}, inplace=True)

data = pd.merge(articles, posts, on=["ID_Article", "NewsroomTopic"], how="right")

data["publishingDate"] = pd.to_datetime(data['publishingDate'])
data["CreatedAt"] = pd.to_datetime(data['CreatedAt'])


data = data[["ID_Post" ,"CreatedAt", "anger_p", "fear_p", "disgust_p", "sadness_p", "joy_p", "none_p", "dominant_emotion_p", "NewsroomTopic",
             "ID_Article", "publishingDate", "anger_a", "fear_a", "disgust_a", "sadness_a", "joy_a", "none_a", "dominant_emotion_a"]].copy()

data

Unnamed: 0,ID_Post,CreatedAt,anger_p,fear_p,disgust_p,sadness_p,joy_p,none_p,dominant_emotion_p,NewsroomTopic,ID_Article,publishingDate,anger_a,fear_a,disgust_a,sadness_a,joy_a,none_a,dominant_emotion_a
0,191,2014-08-13 06:29:59.987,0.000121,0.999683,0.000045,0.000048,0.000020,0.000083,Fear,User,4,2014-08-13 05:30:00,0.000047,0.999862,0.000017,0.000024,0.000017,0.000034,Fear
1,197,2014-08-13 07:23:13.860,0.989465,0.000134,0.000689,0.000854,0.000463,0.008395,Anger,User,4,2014-08-13 05:30:00,0.000047,0.999862,0.000017,0.000024,0.000017,0.000034,Fear
2,198,2014-08-13 07:26:24.743,0.107656,0.002562,0.018382,0.005423,0.001437,0.864540,,User,4,2014-08-13 05:30:00,0.000047,0.999862,0.000017,0.000024,0.000017,0.000034,Fear
3,201,2014-08-13 07:33:33.407,0.432072,0.009020,0.002036,0.364196,0.002437,0.190239,Anger,User,4,2014-08-13 05:30:00,0.000047,0.999862,0.000017,0.000024,0.000017,0.000034,Fear
4,202,2014-08-13 07:34:11.377,0.982516,0.000362,0.000630,0.001408,0.000440,0.014644,Anger,User,4,2014-08-13 05:30:00,0.000047,0.999862,0.000017,0.000024,0.000017,0.000034,Fear
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131543,1010856,2016-06-02 08:20:08.213,0.312902,0.000244,0.000468,0.005681,0.058506,0.622199,,Inland,12078,2016-05-31 14:54:45,0.000040,0.999833,0.000011,0.000065,0.000021,0.000029,Fear
131544,1010859,2016-06-02 13:55:13.073,0.071200,0.065593,0.003848,0.852698,0.000342,0.006319,Sadness,Inland,12078,2016-05-31 14:54:45,0.000040,0.999833,0.000011,0.000065,0.000021,0.000029,Fear
131545,1010871,2016-05-31 22:56:09.220,0.391645,0.000826,0.000555,0.003912,0.001411,0.601651,,Web,12079,2016-05-31 09:43:18,0.000158,0.999668,0.000029,0.000017,0.000025,0.000101,Fear
131546,1011763,2016-05-31 23:47:32.010,0.995226,0.000362,0.000358,0.000621,0.000283,0.003149,Anger,Kultur,12087,2016-05-31 15:46:08,0.498800,0.109251,0.000761,0.007930,0.001188,0.382070,Anger


In [31]:
data.to_csv("combined_sentiments.csv", index=False)