In [None]:
!pip install --q flair

In [None]:
# Colab-only: mount the user's Google Drive into the runtime's file system
# under /content/drive (prompts for authorisation on first use).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from flair.data import Sentence
from flair.embeddings import StackedEmbeddings, TransformerDocumentEmbeddings, DocumentPoolEmbeddings, FlairEmbeddings
import pandas as pd
import numpy as np
from sklearn import model_selection, svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [None]:
# Load the training CSV into `data`.
# NOTE(review): Drive is mounted under /content/drive, but this path starts at
# the file-system root -- confirm the file is really reachable at this location.
filename =r'/Dataset/GermEval21_Toxic_Train/GermEval21_Toxic_Train.csv'
data=pd.read_csv(filename)

# Working frame for subtask 1 (comment text + toxic label). Every later cell
# reads and extends `df1`, so it has to be defined here. `.copy()` makes it an
# independent frame, so adding columns later neither warns nor touches `data`.
df1=data[['comment_text','Sub1_Toxic']].copy()

# Frames for the other two subtasks (currently unused in this notebook):
# df2=data[['comment_text','Sub2_Engaging']]
# df3=data[['comment_text','Sub3_FactClaiming']]

In [None]:
# Document-level transformer embeddings: multilingual XLM-RoBERTa (base).
# The large XLM-R checkpoint did not fit into memory.
transformer_embedding = TransformerDocumentEmbeddings('xlm-roberta-base')

# Document-level transformer embeddings: German BERT (cased, base size).
transformer_embedding_1 = TransformerDocumentEmbeddings('bert-base-german-cased')

# German Flair language models (forward and backward), combined into a single
# document vector by pooling over the token embeddings.
flair_embedding_forward = FlairEmbeddings('de-forward')
flair_embedding_backward = FlairEmbeddings('de-backward')
document_embeddings = DocumentPoolEmbeddings(
    [flair_embedding_forward, flair_embedding_backward]
)


In [None]:
# Create the pooled Flair document vector for every comment (Model 2.2).
# The vectors are collected in a plain list and written to the frame in ONE
# assignment. The previous per-row chained assignment (`df1[col][i] = ...`)
# is unreliable in pandas and is a no-op under Copy-on-Write; it also relied
# on the index being 0..n-1.
flair_vectors = []
for text in df1['comment_text']:
    # wrap the raw text so flair can tokenise and embed it
    sentence = Sentence(text)
    document_embeddings.embed(sentence)

    # detach from the autograd graph, move to CPU and convert the 1-D tensor
    # into a list of Python floats
    flair_vectors.append(sentence.embedding.detach().cpu().tolist())

# object-dtype column: one list of floats per row, aligned on df1's index
df1['flair_embeddings'] = pd.Series(flair_vectors, index=df1.index, dtype=object)

In [None]:
# Create the XLM-RoBERTa document vector for every comment.
# The vectors are collected in a plain list and written to the frame in ONE
# assignment. The previous per-row chained assignment (`df1[col][i] = ...`)
# is unreliable in pandas and is a no-op under Copy-on-Write; it also relied
# on the index being 0..n-1.
roberta_vectors = []
for text in df1['comment_text']:
    # wrap the raw text so flair can tokenise and embed it
    sentence = Sentence(text)
    transformer_embedding.embed(sentence)

    # detach from the autograd graph, move to CPU and convert the 1-D tensor
    # into a list of Python floats
    roberta_vectors.append(sentence.embedding.detach().cpu().tolist())

# object-dtype column: one list of floats per row, aligned on df1's index
df1['roberta_embeddings'] = pd.Series(roberta_vectors, index=df1.index, dtype=object)

In [None]:
# Create the German-BERT document vector for every comment.
# The column is written to `df1` (not `train`, which is never defined in this
# notebook) because the fit/predict cells read df1['gbert_embeddings'].
# The vectors are collected in a plain list and assigned once; the previous
# per-row chained `.iloc[i] = ...` assignment is unreliable in pandas and is a
# no-op under Copy-on-Write.
gbert_vectors = []
for text in df1['comment_text']:
    # wrap the raw text so flair can tokenise and embed it
    sentence = Sentence(text)
    transformer_embedding_1.embed(sentence)

    # detach from the autograd graph, move to CPU and convert the 1-D tensor
    # into a list of Python floats
    gbert_vectors.append(sentence.embedding.detach().cpu().tolist())

# object-dtype column: one list of floats per row, aligned on df1's index
df1['gbert_embeddings'] = pd.Series(gbert_vectors, index=df1.index, dtype=object)

In [None]:
# One linear-kernel SVM per embedding type (flair, roberta, gbert), all built
# from the same hyper-parameters.
# NOTE(review): scikit-learn documents `gamma` as the coefficient for the
# rbf/poly/sigmoid kernels, so it should have no effect with kernel='linear'.
svm_params = dict(C=1, kernel='linear', gamma=1)
model1 = svm.SVC(**svm_params)
model2 = svm.SVC(**svm_params)
model3 = svm.SVC(**svm_params)

In [None]:
# Train each SVM on its own embedding column against the Sub1_Toxic label.
# Every column holds one list of floats per row, so turning the column into a
# list produces the 2-D feature matrix that scikit-learn expects.
labels = df1['Sub1_Toxic']
model1.fit(df1['flair_embeddings'].tolist(), labels)
model2.fit(df1['roberta_embeddings'].tolist(), labels)
model3.fit(df1['gbert_embeddings'].tolist(), labels)

In [None]:
# Predict with each trained SVM on its embedding column.
# NOTE(review): these are the same df1 rows the models were fitted on, so the
# metrics computed from these predictions are training-set scores, not
# held-out performance.
model1_pred = model1.predict(df1['flair_embeddings'].tolist())
model2_pred = model2.predict(df1['roberta_embeddings'].tolist())
model3_pred = model3.predict(df1['gbert_embeddings'].tolist())

In [None]:
# Attach every model's predictions to df1 as a separate column so the metric
# cells can compare them against the Sub1_Toxic label.
for column, predictions in (
    ('flair_predictions', model1_pred),
    ('roberta_predictions', model2_pred),
    ('gbert_predictions', model3_pred),
):
    df1[column] = predictions

In [None]:
# Report accuracy, macro precision, macro recall and F1 for the flair model.
print('Metrics for flair\n')
y_true = df1['Sub1_Toxic']
y_pred = df1['flair_predictions']

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_true, y_pred)
print('Accuracy: %f' % accuracy)
# precision: tp / (tp + fp), macro-averaged over the classes
precision = precision_score(y_true, y_pred, average='macro')
print('Precision: %f' % precision)
# recall: tp / (tp + fn), macro-averaged over the classes
recall = recall_score(y_true, y_pred, average='macro')
print('Recall: %f' % recall)

# F1 as the harmonic mean of macro precision and macro recall. This is NOT the
# same number as sklearn's f1_score(average='macro'), which averages the
# per-class F1 values.
# The result is stored in `f1` so the imported sklearn `f1_score` function is
# not overwritten, and it falls back to 0.0 when precision + recall == 0
# instead of printing a stale or undefined value.
f1 = 2*precision*recall/(precision+recall) if precision+recall > 0 else 0.0
print('F1 score: %f' % f1)

In [None]:
# Report accuracy, macro precision, macro recall and F1 for the roberta model.
print('Metrics for roberta\n')
y_true = df1['Sub1_Toxic']
y_pred = df1['roberta_predictions']

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_true, y_pred)
print('Accuracy: %f' % accuracy)
# precision: tp / (tp + fp), macro-averaged over the classes
precision = precision_score(y_true, y_pred, average='macro')
print('Precision: %f' % precision)
# recall: tp / (tp + fn), macro-averaged over the classes
recall = recall_score(y_true, y_pred, average='macro')
print('Recall: %f' % recall)

# F1 as the harmonic mean of macro precision and macro recall. This is NOT the
# same number as sklearn's f1_score(average='macro'), which averages the
# per-class F1 values.
# The result is stored in `f1` so the imported sklearn `f1_score` function is
# not overwritten, and it falls back to 0.0 when precision + recall == 0
# instead of printing a stale or undefined value.
f1 = 2*precision*recall/(precision+recall) if precision+recall > 0 else 0.0
print('F1 score: %f' % f1)

In [None]:
# Report accuracy, macro precision, macro recall and F1 for the gbert model.
print('Metrics for gbert\n')
y_true = df1['Sub1_Toxic']
y_pred = df1['gbert_predictions']

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_true, y_pred)
print('Accuracy: %f' % accuracy)
# precision: tp / (tp + fp), macro-averaged over the classes
precision = precision_score(y_true, y_pred, average='macro')
print('Precision: %f' % precision)
# recall: tp / (tp + fn), macro-averaged over the classes
recall = recall_score(y_true, y_pred, average='macro')
print('Recall: %f' % recall)

# F1 as the harmonic mean of macro precision and macro recall. This is NOT the
# same number as sklearn's f1_score(average='macro'), which averages the
# per-class F1 values.
# The result is stored in `f1` so the imported sklearn `f1_score` function is
# not overwritten, and it falls back to 0.0 when precision + recall == 0
# instead of printing a stale or undefined value.
f1 = 2*precision*recall/(precision+recall) if precision+recall > 0 else 0.0
print('F1 score: %f' % f1)


In [None]:
# Report accuracy, macro precision, macro recall and F1 for the ensemble.
print('Metrics for Ensemble\n')

# No cell in this notebook creates df1['Ensemble'], so build it here when it
# is missing. An existing column is left untouched.
# NOTE(review): majority vote over the three models is an assumption about how
# the ensemble was meant to be formed -- TODO confirm.
if 'Ensemble' not in df1.columns:
    votes = df1[['flair_predictions', 'roberta_predictions', 'gbert_predictions']]
    # row-wise mode = most frequent label among the three predictions;
    # column 0 of the result always holds a value for every row
    df1['Ensemble'] = votes.mode(axis=1)[0].astype(df1['Sub1_Toxic'].dtype)

y_true = df1['Sub1_Toxic']
y_pred = df1['Ensemble']

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_true, y_pred)
print('Accuracy: %f' % accuracy)
# precision: tp / (tp + fp), macro-averaged over the classes
precision = precision_score(y_true, y_pred, average='macro')
print('Precision: %f' % precision)
# recall: tp / (tp + fn), macro-averaged over the classes
recall = recall_score(y_true, y_pred, average='macro')
print('Recall: %f' % recall)

# F1 as the harmonic mean of macro precision and macro recall. This is NOT the
# same number as sklearn's f1_score(average='macro'), which averages the
# per-class F1 values.
# The result is stored in `f1` so the imported sklearn `f1_score` function is
# not overwritten, and it falls back to 0.0 when precision + recall == 0
# instead of printing a stale or undefined value.
f1 = 2*precision*recall/(precision+recall) if precision+recall > 0 else 0.0
print('F1 score: %f' % f1)