Sentiment Analysis

In [1]:
import sqlite3
import pandas as pd
from bs4 import BeautifulSoup
from transformers import AutoTokenizer
from optimum.intel.openvino import OVModelForSequenceClassification
import torch

In [12]:
model_path = "RobertaGerman"
con = sqlite3.connect('corpus.sqlite3')
batch_size = 8
device = "GPU"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = OVModelForSequenceClassification.from_pretrained(model_path, device=device)
labels = {
    "LABEL_0": "Anger",
    "LABEL_1": "Fear",
    "LABEL_2": "Disgust",
    "LABEL_3": "Sadness",
    "LABEL_4": "Joy",
    "LABEL_5": "None"
}

def sentiment_analysis(df, column):
    df[column] = df[column].astype(str).str.strip()
    all_scores = []
    for b in range(0, len(df), batch_size):
        batch_texts = df[column].iloc[b:b + batch_size].tolist()
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"]
        )
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        score_list = []
        for p in range(len(probs)):
            probabilities = probs[p]
            score_dict = {}
            for i, l in enumerate(labels.keys()):
                score_dict[labels[l]] = float(probabilities[i])
            score_list.append(score_dict)
        all_scores.extend(score_list)
    df_scores = pd.DataFrame(all_scores)
    df = pd.concat([df.reset_index(drop=True), df_scores], axis=1)
    return df

def descriptive_statistics(df):
    topic_dict = {}
    for topic, group in df.groupby('NewsroomTopic'):
        descriptives = {}
        for e in labels.values():
            descriptives[f"{e}_mean"] = group[e].mean()
            descriptives[f"{e}_std"] = group[e].std()
        topic_dict[topic] = descriptives
    table = pd.DataFrame.from_dict(topic_dict, orient='index')
    table = table.reset_index().rename(columns={'index': 'NewsroomTopic'})
    return table

No OpenVINO files were found for RobertaGerman, setting `export=True` to convert the model to the OpenVINO IR. Don't forget to save the resulting model with `.save_pretrained()`
  inverted_mask = torch.tensor(1.0, dtype=dtype) - expanded_mask


Articles

In [4]:
articles = pd.read_sql_query("SELECT * FROM Articles", con)
articles.head(1)

Unnamed: 0,ID_Article,Path,publishingDate,Title,Body
0,1,Newsroom/User/Community,2012-05-26 03:00:19.23,Die Newsletter von derStandard.at,"<div class=""section"" id=""content-main"" itempro..."


In [5]:
articles['body_text'] = articles['Body'].apply(lambda x: BeautifulSoup(str(x), 'html.parser').get_text())
articles['MainPath'] = articles['Path'].apply(lambda x: str(x).split('/')[0])
articles = articles[articles['MainPath'] == 'Newsroom'].copy()
articles['NewsroomTopic'] = articles['Path'].apply(lambda x: str(x).split('/')[1])
articles = articles[['ID_Article', 'publishingDate', 'Title', 'body_text', 'NewsroomTopic']].copy()
display(articles.iloc[1234]['body_text'])

'80 bis 90 Menschen vorübergehend evakuiert – Kriegsgerät entschärftMelk – In Melk ist am frühen Mittwochnachmittag eine 50-Kilogramm-Fliegerbombe entdeckt  worden. Ehe der Entminungsdienst das Kriegsgerät entschärfte, mussten 80 bis 90  Menschen vorübergehend evakuiert werden, teilte die Landespolizeidirektion Niederösterreich mit. Der Fundort sei im Nahbereich der Polizeiinspektion Melk  gelegen. Die Evakuierungsmaßnahmen wurden nach etwa 90 Minuten wieder  aufgehoben.\nDer Fund war der dritte einer Fliegerbombe in Niederösterreich binnen  acht Tagen. Am vergangenen Mittwoch war in St. Pölten ein 100 Kilo schweres  Kriegsgerät entdeckt worden, am Montag in Amstetten ein 500 Kilo schweres. (APA, 8.7.2015) '

In [6]:
articles = sentiment_analysis(df=articles, column='body_text')
articles.head()

Unnamed: 0,ID_Article,publishingDate,Title,body_text,NewsroomTopic,Anger,Fear,Disgust,Sadness,Joy,None
0,1,2012-05-26 03:00:19.23,Die Newsletter von derStandard.at,Abonnieren Sie einen unserer Newsletter und la...,User,0.003462,0.000474,0.000194,0.000845,0.00842,0.986605
1,2,2012-05-26 12:12:19.46,Werden Sie Teil von derStandard.at!,Werden Sie Teil von derStandard.at!10. Dezembe...,User,0.005674,0.017478,0.00026,0.001113,0.021206,0.954268
2,4,2014-08-13 05:30:00.00,Welche Erfahrungen haben Sie als Linkshänder g...,Wie sieht Ihr Alltag als Linkshänder aus? Erle...,User,4.7e-05,0.999864,1.7e-05,2.2e-05,1.7e-05,3.3e-05
3,5,2014-08-27 12:27:01.09,Wie haben Sie das Jahr 1989 erlebt?,Erzählen Sie uns von Ihren Erlebnissen rund um...,User,0.000322,0.99557,3.4e-05,0.003321,6.1e-05,0.000692
4,6,2014-09-30 09:56:00.00,Ihre schlimmsten Hotelerlebnisse,"Der Sommer ist vorbei, und für die meisten von...",User,0.0004,0.99933,8.7e-05,6.8e-05,1.7e-05,9.8e-05


In [7]:
articles[list(labels.values())].agg(['mean', 'std', 'min', 'max'])

Unnamed: 0,Anger,Fear,Disgust,Sadness,Joy,None
mean,0.094497,0.591266,0.004976,0.023629,0.018213,0.267419
std,0.177733,0.447491,0.03958,0.098098,0.09714,0.352347
min,2.3e-05,3.7e-05,8e-06,1.3e-05,9e-06,1.7e-05
max,0.975382,0.999906,0.975875,0.999537,0.998976,0.997223


In [8]:
descriptive_statistics(articles)

Unnamed: 0,NewsroomTopic,Anger_mean,Anger_std,Fear_mean,Fear_std,Disgust_mean,Disgust_std,Sadness_mean,Sadness_std,Joy_mean,Joy_std,None_mean,None_std
0,Etat,0.14343,0.237387,0.406217,0.455393,0.005058,0.046711,0.02169,0.092309,0.019254,0.096325,0.404351,0.402947
1,Inland,0.137242,0.220266,0.593662,0.439469,0.001142,0.007812,0.008124,0.054927,0.007148,0.059784,0.252681,0.342998
2,International,0.072969,0.135488,0.697967,0.399398,0.006475,0.035495,0.015072,0.06738,0.003032,0.024192,0.204486,0.306951
3,Kultur,0.075244,0.174819,0.636156,0.446905,0.005142,0.04363,0.076649,0.212282,0.018125,0.096066,0.188685,0.323384
4,Panorama,0.068119,0.135357,0.716853,0.392634,0.017622,0.081458,0.02572,0.100432,0.009326,0.074992,0.16236,0.270381
5,Sport,0.090436,0.142719,0.270982,0.407466,0.001312,0.008177,0.073426,0.15728,0.08493,0.203081,0.478915,0.364381
6,User,0.163283,0.323752,0.65187,0.460887,0.000671,0.004062,0.012619,0.075804,0.051163,0.203812,0.120394,0.26726
7,Web,0.084259,0.172779,0.614061,0.446231,0.001162,0.01137,0.00492,0.034449,0.010814,0.062058,0.284785,0.378412
8,Wirtschaft,0.135695,0.208267,0.56901,0.446792,0.00039,0.00094,0.00728,0.039023,0.003331,0.025235,0.284294,0.348087
9,Wissenschaft,0.032206,0.091646,0.753691,0.400655,0.001816,0.01294,0.013608,0.07241,0.01232,0.078315,0.186359,0.330282


posts

In [9]:
posts = pd.read_sql_query("SELECT * FROM Posts", con)
posts.head(1)

Unnamed: 0,ID_Post,ID_Parent_Post,ID_Article,ID_User,CreatedAt,Status,Headline,Body,PositiveVotes,NegativeVotes
0,1,,1,9089,2003-04-23 14:52:41.870,deleted,,,0,0


In [10]:
posts = posts[['ID_Article', 'CreatedAt', 'Body']].copy()
posts['NewsroomTopic'] = posts['ID_Article'].map(articles.set_index('ID_Article')['NewsroomTopic'])
display(posts.iloc[56789]['Body'])

'"dezente" hinweise auf dem bild erkennbar ;)'

In [14]:
posts = sentiment_analysis(df=posts, column='Body')
posts.head()

Unnamed: 0,ID_Article,CreatedAt,Body,NewsroomTopic,Anger,Fear,Disgust,Sadness,Joy,None
0,1,2003-04-23 14:52:41.870,,User,0.207347,0.06392,0.140161,0.161009,0.147318,0.280244
1,1,2003-11-04 16:21:57.850,Ich bin begeistert von den STANDARD - Newslett...,User,0.000122,5.6e-05,5.4e-05,8.2e-05,0.999291,0.000395
2,1,2004-01-28 12:57:28.240,... Aber momentan funktioniert das Abmelden od...,User,0.145751,0.000249,0.00027,0.004519,0.0323,0.816911
3,1,2004-02-03 20:32:39.123,Es ist ganz einfach nervend!\r\nVor kurzem hab...,User,0.000122,0.999358,2.5e-05,0.00043,2.1e-05,4.4e-05
4,1,2004-03-02 11:37:44.100,und sie als mitarbeiter sind natuerlich objektiv,User,0.146457,4.1e-05,0.001272,0.005461,0.027897,0.818871


In [15]:
posts[list(labels.values())].agg(['mean', 'std', 'min', 'max'])

Unnamed: 0,Anger,Fear,Disgust,Sadness,Joy,None
mean,0.485507,0.180128,0.015334,0.032069,0.044356,0.242605
std,0.416307,0.366255,0.103841,0.120348,0.182674,0.338025
min,2.1e-05,4e-06,7e-06,1.1e-05,7e-06,1.4e-05
max,0.997326,0.999911,0.999898,0.99976,0.999762,0.998009


In [16]:
descriptive_statistics(posts)

Unnamed: 0,NewsroomTopic,Anger_mean,Anger_std,Fear_mean,Fear_std,Disgust_mean,Disgust_std,Sadness_mean,Sadness_std,Joy_mean,Joy_std,None_mean,None_std
0,Etat,0.506187,0.416396,0.146678,0.335943,0.023099,0.129029,0.035015,0.126934,0.053608,0.203371,0.235412,0.333628
1,Inland,0.51936,0.416139,0.171624,0.359131,0.013447,0.097735,0.029434,0.111263,0.033822,0.157249,0.232312,0.331188
2,International,0.509242,0.415038,0.188771,0.372763,0.014199,0.097437,0.029442,0.11122,0.027155,0.140587,0.231191,0.329776
3,Kultur,0.383903,0.407322,0.133085,0.324198,0.021063,0.119093,0.073721,0.208308,0.12355,0.303809,0.264679,0.355554
4,Panorama,0.487922,0.41647,0.193276,0.377209,0.018687,0.115211,0.032872,0.122016,0.034112,0.15972,0.233131,0.332541
5,Sport,0.450376,0.409002,0.09706,0.280275,0.012545,0.091025,0.046634,0.149706,0.093265,0.260954,0.30012,0.361046
6,User,0.411534,0.415517,0.240303,0.408671,0.017225,0.107661,0.046812,0.162233,0.062577,0.219842,0.221549,0.333098
7,Web,0.462211,0.411478,0.163256,0.350438,0.014898,0.103491,0.025978,0.104329,0.054687,0.202109,0.27897,0.355782
8,Wirtschaft,0.517041,0.416246,0.190077,0.372728,0.008518,0.076049,0.024391,0.099758,0.030003,0.147692,0.22997,0.330072
9,Wissenschaft,0.41404,0.407111,0.196434,0.380985,0.02116,0.123456,0.035225,0.127431,0.053532,0.202208,0.279609,0.358449


Prepare files for analysis

In [None]:
articles.rename(columns={
    "Anger": "anger_a",
    "Fear": "fear_a",
    "Disgust": "disgust_a",
    "Sadness": "sadness_a",
    "Joy": "joy_a",
    "None": "none_a"
}, inplace=True)

posts.rename(columns={
    "Anger": "anger_p",
    "Fear": "fear_p",
    "Disgust": "disgust_p",
    "Sadness": "sadness_p",
    "Joy": "joy_p",
    "None": "none_p"
}, inplace=True)

data = pd.merge(articles, posts, on=["ID_Article", "NewsroomTopic"], how="right")

data["publishingDate"] = pd.to_datetime(data['publishingDate'])
data["CreatedAt"] = pd.to_datetime(data['CreatedAt'])

internal_articles = data['Title'].str.contains('derStandard.at', case=False, na=False)
data = data.dropna().loc[~internal_articles].reset_index(drop=True)

data = data[["CreatedAt", "anger_p", "fear_p", "disgust_p", "sadness_p", "joy_p", "none_p", "NewsroomTopic",
             "ID_Article", "publishingDate", "anger_a", "fear_a", "disgust_a", "sadness_a", "joy_a", "none_a"]].copy()

data


In [None]:
data.to_csv("combined_sentiments.csv", index=False)