In [1]:
!pip install datasets transformers

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [2]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
from google.colab import files
import shutil

In [3]:
# Read the evaluation subset (2000 posts) from the Excel workbook.
dataframe_posts = pd.read_excel(io="Evaluationsdatensatz.xlsx")

In [4]:
# Show the first five rows as a quick sanity check of the loaded data.
print(dataframe_posts.head(n=5))

   Unnamed: 0           id                                               body  \
0       26492  DB8seh6tHE4  Ich habe großen Respekt davor, wenn Menschen a...   
1      304154  BTY30m2luaB  #streetart #Frankfurt am Main. \n#occupy #occu...   
2       34974  C-C715BoPFa  Sigmar #Gabriel war Ministerpräsident von Nied...   
3      528626  CbzjzXivCiX  Das ist Einsatz: Frans Timmermans ist heute zu...   
4      299924  B_2l6BogRvd  Ausschusssitzung aus dem #Büro in #Berlin Gut...   

             author_fullname                                    hashtags  \
0               Ricarda Lang                                         NaN   
1        Anke Domscheit-Berg  streetart,Frankfurt,occupy,occupyfrankfurt   
2  Julia Klöckner (she/her)  Gabriel,Atlantik-,Brücke,USA,Transatlantik   
3            Martin Rosemann                  Verpackungsteuer,Euhandelt   
4            Jens Zimmermann                 Büro,Berlin,Kaffeemaschine   

   annotator_1  annotator_2  Unnamed: 7  Unnamed: 8  \
0

In [5]:
# Extract the zipped GBERT1 checkpoint into the directory it is loaded from.
shutil.unpack_archive(
    filename="gbert_finetuned_twitter.zip",
    extract_dir="/content/gbert_finetuned_twitter",
)

In [7]:
# Load GBERT1: the tokenizer and the sequence-classification model are both
# read from the directory the "gbert_finetuned_twitter" archive was unpacked to.
model_path_gbert1 = "/content/gbert_finetuned_twitter"
tokenizer1 = AutoTokenizer.from_pretrained(model_path_gbert1)
model1 = AutoModelForSequenceClassification.from_pretrained(model_path_gbert1)

In [8]:
# Extract the zipped GBERT2 checkpoint into the directory it is loaded from.
shutil.unpack_archive(
    filename="gbert_finetuned_twitter&germeval17.zip",
    extract_dir="/content/gbert_finetuned_twitter&germeval17",
)

In [9]:
# Load GBERT2: the tokenizer and the sequence-classification model are both
# read from the directory the "gbert_finetuned_twitter&germeval17" archive was
# unpacked to.
model_path_gbert2 = "/content/gbert_finetuned_twitter&germeval17"
tokenizer2 = AutoTokenizer.from_pretrained(model_path_gbert2)
model2 = AutoModelForSequenceClassification.from_pretrained(model_path_gbert2)

In [13]:
#determine sentiment of subset posts with both models
def predict_sentiment(model, tokenizer, text):
    """Classify the sentiment of one text, or of every text in a list.

    Args:
        model: Sequence-classification model. It is called as
            ``model(**inputs)`` and must return an object with a ``logits``
            tensor whose last dimension indexes the classes.
        tokenizer: Tokenizer belonging to ``model``; it is called with
            ``return_tensors="pt"`` and truncates to 128 tokens.
        text: A single string or a list of strings. Any other value (for
            example the NaN of an empty spreadsheet cell) is converted with
            ``str()`` first, so a NaN is classified as the literal text "nan".

    Returns:
        For a string (or non-list) input: one of "positive", "negative",
        "neutral". For a list input: a list with one such label per text
        (an empty list yields an empty list).
    """
    is_batch = isinstance(text, list)
    if is_batch and not text:
        # Nothing to tokenize; avoid handing an empty batch to the tokenizer.
        return []
    if not isinstance(text, (str, list)):
        text = str(text)

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Put the encoded inputs on the same device as the model so the call also
    # works after the model has been moved to a GPU.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {
            name: value.to(device) if torch.is_tensor(value) else value
            for name, value in inputs.items()
        }

    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax is monotonic, so the argmax of the logits is already the
    # predicted class; tolist() handles any batch size (item() only handles 1).
    predicted_ids = torch.argmax(outputs.logits, dim=-1).tolist()

    # NOTE(review): this index -> label order is assumed to match the one used
    # during fine-tuning; confirm against the checkpoints' config (id2label).
    label_mapping = {0: "positive", 1: "negative", 2: "neutral"}
    labels = [label_mapping[class_id] for class_id in predicted_ids]
    return labels if is_batch else labels[0]

In [14]:
# Label every post body with GBERT1 and store the labels in a new column.
dataframe_posts["GBERT1"] = [
    predict_sentiment(model1, tokenizer1, post_text)
    for post_text in dataframe_posts["body"]
]

In [15]:
# Label every post body with GBERT2 and store the labels in a new column.
dataframe_posts["GBERT2"] = [
    predict_sentiment(model2, tokenizer2, post_text)
    for post_text in dataframe_posts["body"]
]

In [16]:
# Show the first five rows again, now including the two model label columns.
print(dataframe_posts.head(n=5))

   Unnamed: 0           id                                               body  \
0       26492  DB8seh6tHE4  Ich habe großen Respekt davor, wenn Menschen a...   
1      304154  BTY30m2luaB  #streetart #Frankfurt am Main. \n#occupy #occu...   
2       34974  C-C715BoPFa  Sigmar #Gabriel war Ministerpräsident von Nied...   
3      528626  CbzjzXivCiX  Das ist Einsatz: Frans Timmermans ist heute zu...   
4      299924  B_2l6BogRvd  Ausschusssitzung aus dem #Büro in #Berlin Gut...   

             author_fullname                                    hashtags  \
0               Ricarda Lang                                         NaN   
1        Anke Domscheit-Berg  streetart,Frankfurt,occupy,occupyfrankfurt   
2  Julia Klöckner (she/her)  Gabriel,Atlantik-,Brücke,USA,Transatlantik   
3            Martin Rosemann                  Verpackungsteuer,Euhandelt   
4            Jens Zimmermann                 Büro,Berlin,Kaffeemaschine   

   annotator_1  annotator_2  Unnamed: 7  Unnamed: 8  \
0

In [17]:
# Write the posts together with the model labels to a CSV file, leaving out
# the DataFrame index.
dataframe_posts.to_csv(
    path_or_buf="Evaluationstexte_mit_Modellergebnissen.csv",
    index=False,
)

In [18]:
# Hand the CSV file written above to google.colab's download helper.
files.download("Evaluationstexte_mit_Modellergebnissen.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:
pip install openpyxl



In [22]:
# Write the same table to an Excel workbook via openpyxl, leaving out the
# DataFrame index.
dataframe_posts.to_excel(
    excel_writer="Evaluationstexte_mit_Modellergebnissen.xlsx",
    index=False,
    engine="openpyxl",
)

In [23]:
# Hand the Excel workbook written above to google.colab's download helper.
files.download("Evaluationstexte_mit_Modellergebnissen.xlsx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>