In [1]:
import sqlite3
import pandas as pd
import numpy as np
import torch
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
con = sqlite3.connect('../dataset/corpus.sqlite3')
model_path = "visegradmedia-emotion/Emotion_RoBERTa_german6_v7"

if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps" # For Mac M1/M2/M3
else:
    device = "cpu"

print(f"Using device: {device}")

batch_size = 32 if device != 'cpu' else 16
overlap = 64

Using device: cuda


In [3]:
max_length = 512
labels = {
    "LABEL_0": "Anger",
    "LABEL_1": "Fear",
    "LABEL_2": "Disgust",
    "LABEL_3": "Sadness",
    "LABEL_4": "Joy",
    "LABEL_5": "None"
}
emotions = list(labels.values())
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.to(device)
model.eval()

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification(
  (classifier): XLMRobertaClassificationHead(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (out_proj): Linear(in_features=768, out_features=6, bias=True)
  )
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Li

In [4]:
def chunk_text(df, text_col):
    df[text_col] = df[text_col].str.strip()
    chunked_rows = []
    for _, row in df.iterrows():
        text = str(row[text_col])
        token_ids = tokenizer.encode(text, add_special_tokens=False)

        start = 0
        while start < len(token_ids):
            end = start + max_length
            chunk_ids = token_ids[start:end]
            chunk_text = tokenizer.decode(chunk_ids)

            chunk_row = row.copy()
            chunk_row[text_col] = chunk_text
            chunked_rows.append(chunk_row)

            if end >= len(token_ids):
                break
            start += max_length - overlap
    return pd.DataFrame(chunked_rows).reset_index(drop=True)

In [5]:
def sentiment_analysis(df, column):
    all_scores = []
    model.to(device)
    label_names = [labels[f"LABEL_{i}"] for i in range(len(labels))]
    
    for b in range(0, len(df), batch_size):
        batch_texts = df[column].iloc[b:b + batch_size].tolist()

        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
            add_special_tokens=True
        ).to(device)

        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        
        all_scores.append(probs.cpu().detach().numpy())

    if all_scores:
        all_scores_np = np.vstack(all_scores)
        df_scores = pd.DataFrame(all_scores_np, columns=label_names)
        return pd.concat([df.reset_index(drop=True), df_scores], axis=1)
    else:
        return df

In [13]:
def combine_chunks(df, text_col, group_col, metadata_cols):
    df['chunk_tokens'] = df[text_col].apply(
        lambda x: len(tokenizer.encode(str(x), add_special_tokens=False))
    )

    # Multiply scores by weights
    weighted_cols = []
    for emo in emotions:
        df[f'{emo}_weighted'] = df[emo] * df['chunk_tokens']
        weighted_cols.append(f'{emo}_weighted')
    
    # Group and sum
    grouped = df.groupby(group_col)
    sums = grouped[weighted_cols + ['chunk_tokens']].sum()
    
    # Divide by total tokens to get weighted average
    result = pd.DataFrame(index=sums.index)
    for emo in emotions:
        result[emo] = sums[f'{emo}_weighted'] / sums['chunk_tokens']

    # Determine dominant emotion
    result['dominant_emotion'] = result[emotions].idxmax(axis=1)
    
    # Merge back
    # Get metadata from the original df (take the first occurrence per group)
    cols_to_select = metadata_cols.copy()
    if group_col not in cols_to_select:
        cols_to_select.append(group_col)
        
    meta_df = df[cols_to_select].drop_duplicates(subset=group_col).set_index(group_col)
    
    return result.join(meta_df).reset_index()

In [7]:
def descriptive_statistics(df):
    topic_dict = {}
    for topic, group in df.groupby('NewsroomTopic'):
        descriptives = {}
        for e in labels.values():
            descriptives[f"{e}_mean"] = group[e].mean()
            descriptives[f"{e}_std"] = group[e].std()
        topic_dict[topic] = descriptives
        
    table = pd.DataFrame.from_dict(topic_dict, orient='index')
    return table.reset_index().rename(columns={'index': 'NewsroomTopic'})

# Articles preprocessing

In [27]:
articles = pd.read_sql_query("SELECT * FROM Articles", con)
articles_meta = ['ID_Article', 'publishingDate', 'NewsroomTopic']
articles['body_text'] = articles.apply(
    lambda row: str(row['Title']) + " " +
    BeautifulSoup(str(row['Body']), "html.parser").get_text(),
    axis=1)
articles

Unnamed: 0,ID_Article,Path,publishingDate,Title,Body,body_text
0,1,Newsroom/User/Community,2012-05-26 03:00:19.23,Die Newsletter von derStandard.at,"<div class=""section"" id=""content-main"" itempro...",Die Newsletter von derStandard.at Abonnieren S...
1,2,Newsroom/User/Community/Regeln,2012-05-26 12:12:19.46,Werden Sie Teil von derStandard.at!,"<div class=""diashow"" id=""objectContent""><meta ...",Werden Sie Teil von derStandard.at! Werden Sie...
2,3,Diverses/mobil,2013-11-22 12:15:00.00,Die Android App von derStandard.at,"<div class=""section"" id=""content-main"" itempro...",Die Android App von derStandard.at Die Smartph...
3,4,Newsroom/User/mitmachen/Mitreden,2014-08-13 05:30:00.00,Welche Erfahrungen haben Sie als Linkshänder g...,"<div class=""section"" id=""content-main"" itempro...",Welche Erfahrungen haben Sie als Linkshänder g...
4,5,Newsroom/User/mitmachen/Mitreden,2014-08-27 12:27:01.09,Wie haben Sie das Jahr 1989 erlebt?,"<div class=""section"" id=""content-main"" itempro...",Wie haben Sie das Jahr 1989 erlebt? Erzählen S...
...,...,...,...,...,...,...
12082,12083,Newsroom/Kultur/Musikkultur,2016-05-31 16:14:13.00,Max Prosa: Junger Troubadour alter Schule,"<div class=""section"" id=""content-main"" itempro...",Max Prosa: Junger Troubadour alter Schule Von ...
12083,12084,Newsroom/Etat/PRINT/Springer,2016-05-31 17:39:29.00,"""Können Adblocker nicht einfach hinnehmen""","<div class=""section"" id=""content-main"" itempro...","""Können Adblocker nicht einfach hinnehmen"" Med..."
12084,12085,Meinung/Kolumnen/rau,2016-05-31 17:34:54.00,Die Rechten machen Facebook zum Hatebook,"<div class=""section"" id=""content-main"" itempro...",Die Rechten machen Facebook zum Hatebook Die F...
12085,12086,Newsroom/Kultur/Buehne,2016-05-31 18:08:20.00,"""Wrestling Rita"": Feministischer Punktsieg im ...","<div class=""section"" id=""content-main"" itempro...","""Wrestling Rita"": Feministischer Punktsieg im ..."


In [29]:
articles['MainPath'] = articles['Path'].apply(lambda x: str(x).split('/')[0])
articles = articles[articles['MainPath'] == 'Newsroom'].copy()
articles['NewsroomTopic'] = articles['Path'].apply(lambda x: str(x).split('/')[1])

#We dont care about user posts, only real jouranlists
articles = articles.drop(articles[articles['NewsroomTopic']=="User"].index)

articles = articles.loc[~articles['Title'].str.contains('derStandard.at', case=False, na=False)]

articles['num_tokens'] = articles['body_text'].apply(lambda x: len(tokenizer.encode(str(x), add_special_tokens=False)))
articles = articles[articles['num_tokens'] >= overlap]

articles = articles[['ID_Article', 'publishingDate', 'body_text', 'NewsroomTopic']]
articles

Unnamed: 0,ID_Article,publishingDate,body_text,NewsroomTopic
9,10,2015-02-07 17:00:00.00,Community-Quiz #2: Können Sie die Postings in ...,Wissenschaft
11,12,2015-03-20 15:11:50.00,Android Auto geht in den USA an den Start Zuge...,Web
14,15,2015-05-31 12:17:05.01,US-Außenminister Kerry nach Fahrradunfall im K...,International
15,16,2015-05-31 12:09:51.00,Hundefleisch wird in Südkorea immer unbeliebte...,Panorama
17,18,2015-05-31 20:58:40.00,FPÖ bei pessimistischen Steirern am stärksten ...,Inland
...,...,...,...,...
12080,12081,2016-05-31 17:07:43.00,Niederösterreich: Zollfahnder beschlagnahmten ...,Panorama
12082,12083,2016-05-31 16:14:13.00,Max Prosa: Junger Troubadour alter Schule Von ...,Kultur
12083,12084,2016-05-31 17:39:29.00,"""Können Adblocker nicht einfach hinnehmen"" Med...",Etat
12085,12086,2016-05-31 18:08:20.00,"""Wrestling Rita"": Feministischer Punktsieg im ...",Kultur


In [30]:
articles = chunk_text(articles, 'body_text')
articles

Unnamed: 0,ID_Article,publishingDate,body_text,NewsroomTopic
0,10,2015-02-07 17:00:00.00,Community-Quiz #2: Können Sie die Postings in ...,Wissenschaft
1,12,2015-03-20 15:11:50.00,Android Auto geht in den USA an den Start Zuge...,Web
2,15,2015-05-31 12:17:05.01,US-Außenminister Kerry nach Fahrradunfall im K...,International
3,16,2015-05-31 12:09:51.00,Hundefleisch wird in Südkorea immer unbeliebte...,Panorama
4,16,2015-05-31 12:09:51.00,nen. Für Kang Dae-in ist das koreanische Leibg...,Panorama
...,...,...,...,...
18742,12084,2016-05-31 17:39:29.00,Hier ist ein signifikanter sechsstelliger Betr...,Etat
18743,12084,2016-05-31 17:39:29.00,"auf rund 3,3 Milliarden Euro. Das digitale Ges...",Etat
18744,12086,2016-05-31 18:08:20.00,"""Wrestling Rita"": Feministischer Punktsieg im ...",Kultur
18745,12086,2016-05-31 18:08:20.00,"""Humungus"" Hradil, Mitbegründer der Wrestling ...",Kultur


In [31]:
articles = sentiment_analysis(articles, 'body_text')
articles

Unnamed: 0,ID_Article,publishingDate,body_text,NewsroomTopic,Anger,Fear,Disgust,Sadness,Joy,None
0,10,2015-02-07 17:00:00.00,Community-Quiz #2: Können Sie die Postings in ...,Wissenschaft,0.026524,0.019860,0.000412,0.001565,0.016253,0.935386
1,12,2015-03-20 15:11:50.00,Android Auto geht in den USA an den Start Zuge...,Web,0.011033,0.000790,0.000114,0.000590,0.009938,0.977536
2,15,2015-05-31 12:17:05.01,US-Außenminister Kerry nach Fahrradunfall im K...,International,0.112814,0.004223,0.001938,0.058318,0.003376,0.819331
3,16,2015-05-31 12:09:51.00,Hundefleisch wird in Südkorea immer unbeliebte...,Panorama,0.000044,0.999874,0.000018,0.000027,0.000012,0.000025
4,16,2015-05-31 12:09:51.00,nen. Für Kang Dae-in ist das koreanische Leibg...,Panorama,0.000063,0.999839,0.000033,0.000020,0.000014,0.000031
...,...,...,...,...,...,...,...,...,...,...
18742,12084,2016-05-31 17:39:29.00,Hier ist ein signifikanter sechsstelliger Betr...,Etat,0.000188,0.999511,0.000014,0.000031,0.000024,0.000232
18743,12084,2016-05-31 17:39:29.00,"auf rund 3,3 Milliarden Euro. Das digitale Ges...",Etat,0.046454,0.016544,0.000133,0.005761,0.005960,0.925147
18744,12086,2016-05-31 18:08:20.00,"""Wrestling Rita"": Feministischer Punktsieg im ...",Kultur,0.026282,0.969194,0.001090,0.001196,0.000115,0.002123
18745,12086,2016-05-31 18:08:20.00,"""Humungus"" Hradil, Mitbegründer der Wrestling ...",Kultur,0.695503,0.030447,0.002946,0.012074,0.001049,0.257980


In [32]:
articles = combine_chunks(articles, 'body_text', 'ID_Article', articles_meta)
articles

Unnamed: 0,ID_Article,Anger,Fear,Disgust,Sadness,Joy,None,dominant_emotion,publishingDate,NewsroomTopic
0,10,0.026524,0.019860,0.000412,0.001565,0.016253,0.935386,,2015-02-07 17:00:00.00,Wissenschaft
1,12,0.011033,0.000790,0.000114,0.000590,0.009938,0.977536,,2015-03-20 15:11:50.00,Web
2,15,0.112814,0.004223,0.001938,0.058318,0.003376,0.819331,,2015-05-31 12:17:05.01,International
3,16,0.000049,0.999857,0.000029,0.000023,0.000014,0.000027,Fear,2015-05-31 12:09:51.00,Panorama
4,18,0.000110,0.999350,0.000016,0.000111,0.000052,0.000362,Fear,2015-05-31 20:58:40.00,Inland
...,...,...,...,...,...,...,...,...,...,...
10130,12081,0.679480,0.104902,0.029647,0.004698,0.002477,0.178796,Anger,2016-05-31 17:07:43.00,Panorama
10131,12083,0.071449,0.541087,0.000829,0.081418,0.002716,0.302501,Fear,2016-05-31 16:14:13.00,Kultur
10132,12084,0.002525,0.948069,0.000018,0.000341,0.000330,0.048716,Fear,2016-05-31 17:39:29.00,Etat
10133,12086,0.216954,0.701730,0.001619,0.004295,0.000381,0.075021,Fear,2016-05-31 18:08:20.00,Kultur


In [33]:
articles[list(labels.values())].agg(['mean', 'std', 'min', 'max'])

Unnamed: 0,Anger,Fear,Disgust,Sadness,Joy,None
mean,0.113015,0.554797,0.006669,0.026339,0.017419,0.28176
std,0.179113,0.42364,0.042703,0.098663,0.087125,0.33372
min,2.4e-05,3.5e-05,8e-06,1.3e-05,1e-05,1.8e-05
max,0.973096,0.999905,0.969769,0.999524,0.999088,0.996742


In [34]:
articles['dominant_emotion'].value_counts()

dominant_emotion
Fear       6038
None       2986
Anger       784
Sadness     170
Joy         119
Disgust      38
Name: count, dtype: int64

In [36]:
descriptive_statistics(articles)

Unnamed: 0,NewsroomTopic,Anger_mean,Anger_std,Fear_mean,Fear_std,Disgust_mean,Disgust_std,Sadness_mean,Sadness_std,Joy_mean,Joy_std,None_mean,None_std
0,Etat,0.155839,0.22716,0.387728,0.419714,0.006657,0.049896,0.026404,0.096824,0.020487,0.09272,0.402885,0.37878
1,Inland,0.148231,0.205762,0.580644,0.40262,0.001147,0.0075,0.006992,0.04638,0.007355,0.050355,0.255631,0.317454
2,International,0.096699,0.147509,0.650875,0.39493,0.007968,0.03955,0.018428,0.073209,0.003293,0.021249,0.222738,0.292611
3,Kultur,0.097512,0.187936,0.589523,0.424829,0.006045,0.035044,0.086073,0.218834,0.018774,0.089559,0.202074,0.315493
4,Panorama,0.093293,0.14629,0.657509,0.387529,0.023849,0.085822,0.034348,0.112037,0.009914,0.075088,0.181086,0.250251
5,Sport,0.099148,0.136696,0.262946,0.360811,0.001497,0.010355,0.070583,0.138174,0.076379,0.177909,0.489448,0.33643
6,Web,0.112335,0.195938,0.565411,0.431577,0.00216,0.019601,0.00604,0.041599,0.01255,0.066053,0.301503,0.363543
7,Wirtschaft,0.154349,0.210733,0.550174,0.421505,0.000431,0.000928,0.006895,0.03879,0.003407,0.024152,0.284745,0.325786
8,Wissenschaft,0.049203,0.112292,0.713364,0.395246,0.002069,0.01193,0.013024,0.055552,0.012303,0.076752,0.210036,0.32182


# Comments

In [37]:
posts = pd.read_sql_query("SELECT * FROM Posts", con)
posts_meta = ['ID_Post', 'ID_Article', 'CreatedAt', 'NewsroomTopic']
posts['NewsroomTopic'] = posts['ID_Article'].map(articles.set_index('ID_Article')['NewsroomTopic'])
posts

Unnamed: 0,ID_Post,ID_Parent_Post,ID_Article,ID_User,CreatedAt,Status,Headline,Body,PositiveVotes,NegativeVotes,NewsroomTopic
0,1,,1,9089,2003-04-23 14:52:41.870,deleted,,,0,0,
1,2,,1,29367,2003-11-04 16:21:57.850,online,"Newsletter ""DER STANDARD""",Ich bin begeistert von den STANDARD - Newslett...,0,0,
2,3,2.0,1,5095,2004-01-28 12:57:28.240,deleted,Auch begeistert...,... Aber momentan funktioniert das Abmelden od...,0,0,
3,4,3.0,1,1682,2004-02-03 20:32:39.123,deleted,Abmeldeprobleme,Es ist ganz einfach nervend!\r\nVor kurzem hab...,0,0,
4,5,,1,3343,2004-03-02 11:37:44.100,online,,und sie als mitarbeiter sind natuerlich objektiv,0,0,
...,...,...,...,...,...,...,...,...,...,...,...
1011768,1011769,1011764.0,12087,6355,2016-06-01 23:10:29.003,online,,zwischen der beendigung eines arbeitsverhältni...,0,0,Kultur
1011769,1011770,1011765.0,12087,6355,2016-06-01 23:11:14.790,online,,du sagst es ja im namen selbst: wegen eindicku...,1,0,Kultur
1011770,1011771,1011770.0,12087,27023,2016-06-02 08:16:56.690,online,,Was genau haben Sie denn nicht verstanden? Ich...,0,1,Kultur
1011771,1011772,1011769.0,12087,19159,2016-06-02 09:12:09.993,online,,irgendwie widersprechen Sie sich in Ihrem Post...,1,0,Kultur


In [38]:
posts = posts[posts['ID_Article'].isin(articles['ID_Article'])]
posts = posts.dropna(subset=['Body','NewsroomTopic'])
posts = posts[posts['Body'].astype(str).str.strip() != '']
posts = posts[posts['ID_Parent_Post'].isna()]
posts = posts[['ID_Post','ID_Article','CreatedAt','Body', 'NewsroomTopic']]
posts

Unnamed: 0,ID_Post,ID_Article,CreatedAt,Body,NewsroomTopic
2083,2084,10,2015-02-07 17:11:43.933,ich soll also mehr mitposten.,Wissenschaft
2085,2086,10,2015-02-07 17:29:14.757,Höhö 7 von 10,Wissenschaft
2087,2088,10,2015-02-07 17:30:52.763,"Buh, 1 Prozent unter dem Durchschnitt. Ein Glü...",Wissenschaft
2090,2091,10,2015-02-07 17:44:51.023,8 von 10. Alles geraten.,Wissenschaft
2091,2092,10,2015-02-07 17:55:30.990,die fpö hat mich rausgerissen,Wissenschaft
...,...,...,...,...,...
1010993,1010994,12083,2016-05-31 17:36:21.730,"""Singer-Songwriter"" ist keine Neuerfindung des...",Kultur
1010994,1010995,12083,2016-06-01 05:24:53.540,Bei ordentlichen medien gibts dann immer einen...,Kultur
1010996,1010997,12084,2016-05-31 17:54:24.000,"Die User haben hier, haben es doch oft formuli...",Etat
1011759,1011760,12086,2016-05-31 21:10:42.487,Na ja. Und wozu?,Kultur


In [39]:
posts = sentiment_analysis(posts,'Body')
posts['dominant_emotion'] = posts[emotions].idxmax(axis=1)
posts

KeyboardInterrupt: 

In [18]:
posts[list(labels.values())].agg(['mean', 'std', 'min', 'max'])

Unnamed: 0,Anger,Fear,Disgust,Sadness,Joy,None
mean,0.491233,0.213216,0.013411,0.023097,0.030817,0.228227
std,0.42612,0.390424,0.097055,0.109475,0.150872,0.343662
min,2.2e-05,6e-06,7e-06,1.1e-05,8e-06,1.4e-05
max,0.997274,0.99991,0.99989,0.999747,0.999763,0.998005


In [19]:
posts['dominant_emotion'].value_counts()

dominant_emotion
Anger      67760
None       28543
Fear       28233
Joy         3767
Sadness     2251
Disgust     1567
Name: count, dtype: int64

In [20]:
descriptive_statistics(posts)

Unnamed: 0,NewsroomTopic,Anger_mean,Anger_std,Fear_mean,Fear_std,Disgust_mean,Disgust_std,Sadness_mean,Sadness_std,Joy_mean,Joy_std,None_mean,None_std
0,Etat,0.504402,0.426727,0.186374,0.370279,0.025247,0.139414,0.023276,0.11513,0.032983,0.16098,0.227719,0.346766
1,Inland,0.516361,0.428303,0.206627,0.38621,0.01143,0.089975,0.021041,0.103512,0.02575,0.136413,0.21879,0.339531
2,International,0.49749,0.426647,0.229763,0.400815,0.012268,0.089718,0.021421,0.104662,0.02132,0.124565,0.217739,0.338059
3,Kultur,0.42592,0.424721,0.178754,0.366872,0.017802,0.108524,0.051453,0.177733,0.079928,0.244519,0.246143,0.35744
4,Panorama,0.473676,0.426083,0.240642,0.408875,0.017516,0.11109,0.025044,0.115429,0.0255,0.137973,0.217623,0.337459
5,Sport,0.485871,0.418484,0.130685,0.318983,0.009695,0.077482,0.034133,0.12804,0.059614,0.205682,0.280003,0.363363
6,User,0.438504,0.424003,0.235944,0.40649,0.018902,0.120365,0.039579,0.15524,0.051317,0.199677,0.215754,0.335665
7,Web,0.459208,0.422017,0.203926,0.382629,0.015293,0.106449,0.018432,0.094796,0.042992,0.178551,0.260149,0.360851
8,Wirtschaft,0.52412,0.424994,0.204208,0.38221,0.007894,0.073192,0.01755,0.09349,0.02622,0.13809,0.220008,0.337619
9,Wissenschaft,0.40658,0.415734,0.256275,0.420847,0.018971,0.118306,0.026145,0.114907,0.036376,0.163246,0.255653,0.357227


Prepare File

In [21]:
articles.rename(columns={
    "Anger": "anger_a",
    "Fear": "fear_a",
    "Disgust": "disgust_a",
    "Sadness": "sadness_a",
    "Joy": "joy_a",
    "None": "none_a",
    "dominant_emotion": "dominant_emotion_a"
}, inplace=True)

posts.rename(columns={
    "Anger": "anger_p",
    "Fear": "fear_p",
    "Disgust": "disgust_p",
    "Sadness": "sadness_p",
    "Joy": "joy_p",
    "None": "none_p",
    "dominant_emotion": "dominant_emotion_p"
}, inplace=True)

data = pd.merge(articles, posts, on=["ID_Article", "NewsroomTopic"], how="right")
data["publishingDate"] = pd.to_datetime(data['publishingDate'])
data["CreatedAt"] = pd.to_datetime(data['CreatedAt'])

data = data[["ID_Post" ,"CreatedAt", "anger_p", "fear_p", "disgust_p", "sadness_p", "joy_p", "none_p", "dominant_emotion_p", "NewsroomTopic",
             "ID_Article", "publishingDate", "anger_a", "fear_a", "disgust_a", "sadness_a", "joy_a", "none_a", "dominant_emotion_a"]]
data

Unnamed: 0,ID_Post,CreatedAt,anger_p,fear_p,disgust_p,sadness_p,joy_p,none_p,dominant_emotion_p,NewsroomTopic,ID_Article,publishingDate,anger_a,fear_a,disgust_a,sadness_a,joy_a,none_a,dominant_emotion_a
0,191,2014-08-13 06:29:59.987,0.000120,0.999689,0.000045,0.000047,0.000020,0.000079,Fear,User,4,2014-08-13 05:30:00,0.000047,0.999862,0.000017,0.000024,0.000017,0.000033,Fear
1,197,2014-08-13 07:23:13.860,0.989551,0.000133,0.000688,0.000847,0.000460,0.008320,Anger,User,4,2014-08-13 05:30:00,0.000047,0.999862,0.000017,0.000024,0.000017,0.000033,Fear
2,198,2014-08-13 07:26:24.743,0.108701,0.002560,0.018449,0.005487,0.001450,0.863353,,User,4,2014-08-13 05:30:00,0.000047,0.999862,0.000017,0.000024,0.000017,0.000033,Fear
3,201,2014-08-13 07:33:33.407,0.430212,0.008964,0.002008,0.366200,0.002447,0.190168,Anger,User,4,2014-08-13 05:30:00,0.000047,0.999862,0.000017,0.000024,0.000017,0.000033,Fear
4,202,2014-08-13 07:34:11.377,0.981684,0.000288,0.000668,0.001571,0.000484,0.015305,Anger,User,4,2014-08-13 05:30:00,0.000047,0.999862,0.000017,0.000024,0.000017,0.000033,Fear
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132116,1010856,2016-06-02 08:20:08.213,0.325956,0.000242,0.000464,0.005642,0.053380,0.614316,,Inland,12078,2016-05-31 14:54:45,0.000040,0.999832,0.000011,0.000066,0.000021,0.000029,Fear
132117,1010859,2016-06-02 13:55:13.073,0.072507,0.066935,0.003864,0.849980,0.000343,0.006371,Sadness,Inland,12078,2016-05-31 14:54:45,0.000040,0.999832,0.000011,0.000066,0.000021,0.000029,Fear
132118,1010871,2016-05-31 22:56:09.220,0.386056,0.000832,0.000550,0.003874,0.001406,0.607282,,Web,12079,2016-05-31 09:43:18,0.000158,0.999671,0.000029,0.000017,0.000025,0.000100,Fear
132119,1011763,2016-05-31 23:47:32.010,0.995209,0.000364,0.000361,0.000621,0.000286,0.003159,Anger,Kultur,12087,2016-05-31 15:46:08,0.499322,0.110606,0.000751,0.007877,0.001192,0.380252,Anger


In [None]:
data.to_csv("../dataset/combined_sentiments.csv", index=False)