In [2]:
import pandas as pd
import numpy as np
from transformers import pipeline

# TqdmWarning: IProgress not found.

from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

In [6]:
# Select the classification task; `task` is interpolated into both the model id
# below and the label-mapping URL built in the next cell.
task='irony'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# Tokenizer matching MODEL; reused by every prediction until it is reassigned.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [7]:
# Download the class-id -> label-name mapping for the current `task`.
# Resetting `labels` first means a failed download leaves an empty list rather
# than a value from an earlier run.
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    # Read the whole file into a list of lines so the reader below can be
    # consumed after the connection is closed.
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
# Keep the second tab-separated field of every row that has one; rows with a
# single field (e.g. a trailing empty line) are skipped.
labels = [row[1] for row in csvreader if len(row) > 1]

In [10]:
# Load the fine-tuned sequence-classification model for the current task.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained(MODEL) writes into a local directory whose relative path is
# identical to the hub id. Save the tokenizer next to the weights as well:
# otherwise a later AutoTokenizer.from_pretrained(MODEL) executed from the same
# working directory resolves MODEL to that directory and fails because it
# contains no tokenizer files.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

# Smoke test on a single sentence: logits of the only batch row -> probabilities.
text = "Great, it broke the first day..."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

In [22]:
# Class indices ordered from the highest score to the lowest
# (argsort is ascending, hence the reversal).
ranking = np.argsort(scores)[::-1]

In [45]:
def predict(model, tokenizer, text, label_names=None):
    """Score one text with a sequence-classification model.

    Parameters
    ----------
    model : callable
        Called as ``model(**encoded_input)``; element ``[0][0]`` of its result
        is taken as the logits for the single input text.
    tokenizer : callable
        Called as ``tokenizer(text, return_tensors='pt')``; its result is
        unpacked into keyword arguments for ``model``.
    text
        The input handed to the tokenizer (one post in this notebook).
    label_names : sequence of str, optional
        Class names indexed by class id. When omitted, the notebook-level
        ``labels`` list is used, exactly as before. Passing it explicitly
        removes the dependency on whichever task's mapping cell ran last.

    Returns
    -------
    list of dict
        One ``{'label': ..., 'score': ...}`` entry per class, ordered from the
        highest softmax score to the lowest.
    """
    names = labels if label_names is None else label_names
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    # First output element, first (only) row of the batch -> class probabilities.
    scores = softmax(output[0][0].detach().numpy())
    # argsort is ascending; reverse it so the most likely class comes first.
    ranking = np.argsort(scores)[::-1]
    return [{'label': names[idx], 'score': scores[idx]} for idx in ranking]

In [46]:
# Location of the input CSV, relative to the notebook's working directory.
data_path = "../data/reporting/emotion_data.csv"

# Load the survey/post table; `data` is extended with new prediction columns
# by later cells. The trailing expression previews the first rows.
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,post_travel,post_abortion,adm,riv,gender,gender_3_text,age,ethnic_background,ethnic_background_8_text,education,...,instagram,tiktok,linkedin,pinterest,other,other_portals_7_text,post_ai,tr_emotion,ab_emotion,ai_emotion
0,went on a sick trip somewhere really fun and c...,anyone know where to buy new hangers?,1.222,1.667,2.0,,38,1.0,,4.0,...,1.0,1.0,,,,,ai is weird. i want it to do menial tasks not ...,"[[{'label': 'joy', 'score': 0.950767457485199}...","[[{'label': 'neutral', 'score': 0.701108157634...","[[{'label': 'disgust', 'score': 0.564288496971..."
1,I went to the beach this weekend and had a gre...,Banning abortion is telling someone else what ...,5.778,1.222,1.0,,45,1.0,,2.0,...,1.0,1.0,1.0,,,,OpenAI will contribute many great things to ou...,"[[{'label': 'joy', 'score': 0.99103844165802},...","[[{'label': 'disgust', 'score': 0.591954231262...","[[{'label': 'neutral', 'score': 0.799880325794..."
2,I had a beautiful time in Croatia with my fami...,I'm deeply disappointed to learn that the gove...,4.333,1.889,2.0,,34,5.0,,1.0,...,1.0,,1.0,,,,I've found AI to be super helpful in my recent...,"[[{'label': 'joy', 'score': 0.9921907782554626...","[[{'label': 'sadness', 'score': 0.884934008121...","[[{'label': 'joy', 'score': 0.6637662649154663..."
3,On vacation - this place is amazing! I definit...,I’m devastated that this has happened - women ...,2.444,2.222,2.0,,41,1.0,,2.0,...,1.0,,1.0,1.0,,,Used an AI service to make some cute and funny...,"[[{'label': 'joy', 'score': 0.7306950092315674...","[[{'label': 'sadness', 'score': 0.951866924762...","[[{'label': 'joy', 'score': 0.5088626146316528..."
4,Last week I had the pleasure of traveling up a...,It is unconstitutional to deny a woman's right...,2.889,2.222,2.0,,36,4.0,,2.0,...,,,1.0,1.0,,,AI should be closely monitored and ensured tha...,"[[{'label': 'joy', 'score': 0.9787710309028625...","[[{'label': 'anger', 'score': 0.47481924295425...","[[{'label': 'neutral', 'score': 0.853263556957..."


In [47]:
# Manual spot check: run the currently loaded model on one abortion post
# (row label 2). Same steps as predict() up to the softmax; the probabilities
# are left in `scores` and not displayed.
text = data["post_abortion"][2]
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

In [48]:
# Add one prediction column per post type, using the currently loaded model.
# predict() returns the full ranked list of {'label', 'score'} dicts, so every
# cell of the new columns holds a list whose element 0 is the top prediction.
data["tr_irony"] = data["post_travel"].apply(lambda x: predict(model, tokenizer, x))
data["ab_irony"] = data["post_abortion"].apply(lambda x: predict(model, tokenizer, x))
data["ai_irony"] = data["post_ai"].apply(lambda x: predict(model, tokenizer, x))

In [49]:
# Save the table, now including the *_irony columns, into the reporting folder.
# index=False leaves the row index out of the file. The list-valued prediction
# columns are written as their text representation, so they come back as
# strings when the CSV is reloaded.
data.to_csv("../data/reporting/em_ir_data.csv", index=False)

In [51]:
def get_column_narc_count(data, narc_type, column_type, low = True, n = 100):
    """Count top-ranked labels among the rows with the most extreme `narc_type`.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both `narc_type` and `column_type`.
    narc_type : str
        Column to rank rows by.
    column_type : str
        Column whose cells are sequences of ``{'label': ..., ...}`` dicts; only
        the first entry of each cell (the top prediction) is counted.
    low : bool, default True
        True selects the rows with the lowest `narc_type` values, False the
        rows with the highest.
    n : int, default 100
        Maximum number of rows to select. If the table has fewer rows, all of
        them are used.

    Returns
    -------
    tuple
        ``(counts, minimum, maximum)``: ``counts`` maps label -> occurrences,
        ordered by descending count (ties keep first-seen order); ``minimum``
        and ``maximum`` are the extremes of `narc_type` within the selection.
    """
    from collections import Counter  # local import keeps this cell self-contained

    selected = data.sort_values(narc_type, ascending=low).head(n)
    lowest = selected[narc_type].min()
    highest = selected[narc_type].max()
    # Iterate over the rows actually selected: head(n) yields fewer than n rows
    # for a short table, and indexing positions 0..n-1 would raise IndexError.
    top_labels = [cell[0]['label'] for cell in selected[column_type]]
    # most_common() sorts by count, descending; equal counts keep insertion order.
    emotion_count = dict(Counter(top_labels).most_common())

    return emotion_count, lowest, highest

def print_column_narc_count(data, narc_type, column_type, n = 100):
    """Print label counts for the n highest and the n lowest `narc_type` rows.

    For each of the two selections, prints a header line, the minimum and
    maximum `narc_type` value inside the selection, and the label-count dict
    returned by get_column_narc_count. The two reports are separated by a
    printed newline string. Returns None.
    """
    def _summarise(ascending):
        # Body lines shared by both reports: min line, max line, counts dict.
        counts, lowest, highest = get_column_narc_count(
            data, narc_type, column_type, low = ascending, n = n
        )
        return [f"Min {narc_type}: {lowest}", f"Max {narc_type}: {highest}", counts]

    top_lines = _summarise(False)
    print(f"Emotion count for {n} highest {narc_type} {column_type} posts")
    for line in top_lines:
        print(line)

    bottom_lines = _summarise(True)
    print("\n")
    print(f"Emotion count for {n} lowest {narc_type} {column_type} posts")
    for line in bottom_lines:
        print(line)

In [56]:
# Top irony label counts on travel posts for the 100 highest and 100 lowest `adm` rows.
print_column_narc_count(data,'adm','tr_irony', n = 100)

Emotion count for 100 highest adm tr_irony posts
Min adm: 4.0
Max adm: 5.889
{'non_irony': 86, 'irony': 14}


Emotion count for 100 lowest adm tr_irony posts
Min adm: 1.111
Max adm: 2.556
{'non_irony': 93, 'irony': 7}


In [58]:
# Same `adm` split, for the abortion posts and then the AI posts; the print
# in between only separates the two reports.
print_column_narc_count(data,'adm','ab_irony', n = 100)
print("\n")
print_column_narc_count(data,'adm','ai_irony', n = 100)

Emotion count for 100 highest adm ab_irony posts
Min adm: 4.0
Max adm: 5.889
{'non_irony': 69, 'irony': 31}


Emotion count for 100 lowest adm ab_irony posts
Min adm: 1.111
Max adm: 2.556
{'non_irony': 70, 'irony': 30}


Emotion count for 100 highest adm ai_irony posts
Min adm: 4.0
Max adm: 5.889
{'non_irony': 58, 'irony': 42}


Emotion count for 100 lowest adm ai_irony posts
Min adm: 1.111
Max adm: 2.556
{'non_irony': 61, 'irony': 39}


In [59]:
# Top irony label counts on travel posts for the 100 highest and 100 lowest `riv` rows.
print_column_narc_count(data,'riv','tr_irony', n = 100)

Emotion count for 100 highest riv tr_irony posts
Min riv: 2.333
Max riv: 5.111
{'non_irony': 86, 'irony': 14}


Emotion count for 100 lowest riv tr_irony posts
Min riv: 1.0
Max riv: 1.556
{'non_irony': 93, 'irony': 7}


In [60]:
# Same `riv` split, for the abortion posts and then the AI posts; the print
# in between only separates the two reports.
print_column_narc_count(data,'riv','ab_irony', n = 100)
print("\n")
print_column_narc_count(data,'riv','ai_irony', n = 100)

Emotion count for 100 highest riv ab_irony posts
Min riv: 2.333
Max riv: 5.111
{'non_irony': 70, 'irony': 30}


Emotion count for 100 lowest riv ab_irony posts
Min riv: 1.0
Max riv: 1.556
{'non_irony': 76, 'irony': 24}


Emotion count for 100 highest riv ai_irony posts
Min riv: 2.333
Max riv: 5.111
{'non_irony': 61, 'irony': 39}


Emotion count for 100 lowest riv ai_irony posts
Min riv: 1.0
Max riv: 1.556
{'non_irony': 59, 'irony': 41}


## Hate

In [62]:
# Switch the notebook globals to the hate task. Note the model id carries a
# "-latest" suffix here, while the mapping URL in the next cell uses `task` only.
task='hate'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}-latest"

# Replaces the previous task's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [63]:
# Download the class-id -> label-name mapping for the current `task`.
# Resetting `labels` first means a failed download leaves an empty list rather
# than the previous task's labels.
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    # Read everything into a list of lines so the reader below can be consumed
    # after the connection is closed.
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
# Second tab-separated field of every row that has one.
labels = [row[1] for row in csvreader if len(row) > 1]

In [64]:
# Load the fine-tuned sequence-classification model for the current task;
# this rebinds `model`, so predict() calls from here on use the new model.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained(MODEL) writes into a local directory whose relative path is
# identical to the hub id. Save the tokenizer next to the weights as well:
# otherwise a later AutoTokenizer.from_pretrained(MODEL) executed from the same
# working directory resolves MODEL to that directory and fails because it
# contains no tokenizer files.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

# Smoke test on a single sentence; the trailing expression displays the
# class probabilities.
text = "Great, it broke the first day..."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
scores

In [69]:
# Add one prediction column per post type using the currently loaded model and
# `labels`. Each cell holds predict()'s ranked list of {'label', 'score'} dicts.
data["tr_hate"] = data["post_travel"].apply(lambda x: predict(model, tokenizer, x))
data["ab_hate"] = data["post_abortion"].apply(lambda x: predict(model, tokenizer, x))
data["ai_hate"] = data["post_ai"].apply(lambda x: predict(model, tokenizer, x))

In [70]:
# Save the table, now including the *_hate columns, into the reporting folder
# (row index omitted; list-valued columns are written as text).
data.to_csv("../data/reporting/em_ir_ht_data.csv", index=False)

In [73]:
# Top-label counts of the *_hate columns for the 100 highest and 100 lowest
# `adm` rows: travel, abortion and AI posts in turn. The prints in between
# only separate the reports.
print_column_narc_count(data,'adm','tr_hate', n = 100)
print("\n")
print_column_narc_count(data,'adm','ab_hate', n = 100)
print("\n")
print_column_narc_count(data,'adm','ai_hate', n = 100)

Emotion count for 100 highest adm tr_hate posts
Min adm: 4.0
Max adm: 5.889
{'not-hate': 100}


Emotion count for 100 lowest adm tr_hate posts
Min adm: 1.111
Max adm: 2.556
{'not-hate': 100}


Emotion count for 100 highest adm ab_hate posts
Min adm: 4.0
Max adm: 5.889
{'not-hate': 99, 'hate': 1}


Emotion count for 100 lowest adm ab_hate posts
Min adm: 1.111
Max adm: 2.556
{'not-hate': 98, 'hate': 2}


Emotion count for 100 highest adm ai_hate posts
Min adm: 4.0
Max adm: 5.889
{'not-hate': 100}


Emotion count for 100 lowest adm ai_hate posts
Min adm: 1.111
Max adm: 2.556
{'not-hate': 100}


In [74]:
# Top-label counts of the *_hate columns for the 100 highest and 100 lowest
# `riv` rows: travel, abortion and AI posts in turn.
print_column_narc_count(data,'riv','tr_hate', n = 100)
print("\n")
print_column_narc_count(data,'riv','ab_hate', n = 100)
print("\n")
print_column_narc_count(data,'riv','ai_hate', n = 100)

Emotion count for 100 highest riv tr_hate posts
Min riv: 2.333
Max riv: 5.111
{'not-hate': 100}


Emotion count for 100 lowest riv tr_hate posts
Min riv: 1.0
Max riv: 1.556
{'not-hate': 100}


Emotion count for 100 highest riv ab_hate posts
Min riv: 2.333
Max riv: 5.111
{'not-hate': 97, 'hate': 3}


Emotion count for 100 lowest riv ab_hate posts
Min riv: 1.0
Max riv: 1.556
{'not-hate': 99, 'hate': 1}


Emotion count for 100 highest riv ai_hate posts
Min riv: 2.333
Max riv: 5.111
{'not-hate': 100}


Emotion count for 100 lowest riv ai_hate posts
Min riv: 1.0
Max riv: 1.556
{'not-hate': 100}


## Offensive

In [76]:
# Switch the notebook globals to the offensive-language task.
task='offensive'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# Replaces the previous task's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [77]:
# Download the class-id -> label-name mapping for the current `task`.
# Resetting `labels` first means a failed download leaves an empty list rather
# than the previous task's labels.
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    # Read everything into a list of lines so the reader below can be consumed
    # after the connection is closed.
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
# Second tab-separated field of every row that has one.
labels = [row[1] for row in csvreader if len(row) > 1]

In [78]:
# Load the fine-tuned sequence-classification model for the current task;
# this rebinds `model`, so predict() calls from here on use the new model.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained(MODEL) writes into a local directory whose relative path is
# identical to the hub id. Save the tokenizer next to the weights as well:
# otherwise a later AutoTokenizer.from_pretrained(MODEL) executed from the same
# working directory resolves MODEL to that directory and fails because it
# contains no tokenizer files.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

# Smoke test on a single sentence; the trailing expression displays the
# class probabilities.
text = "Good night"
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
scores

array([0.82735324, 0.17264678], dtype=float32)

In [79]:
# Sanity check of predict() on the first travel post; the cell output shows the
# ranked label/score pairs produced with the currently loaded model and labels.
predict(model, tokenizer, data["post_travel"][0])

[{'label': 'not-offensive', 'score': 0.6869133},
 {'label': 'offensive', 'score': 0.31308672}]

In [80]:
# NOTE(review): when the notebook is run in order, `model` and `labels` are the
# offensive-task ones here, yet the results are stored under the `*_hate`
# names, replacing the hate-model predictions computed in the Hate section.
# Dedicated `*_offensive` columns would keep both; the summary cells further
# down read these same names, so any rename has to be done together with them.
data["tr_hate"] = data["post_travel"].apply(lambda x: predict(model, tokenizer, x))
data["ab_hate"] = data["post_abortion"].apply(lambda x: predict(model, tokenizer, x))
data["ai_hate"] = data["post_ai"].apply(lambda x: predict(model, tokenizer, x))

In [81]:
# Save the table into the reporting folder (row index omitted).
# NOTE(review): the `*_hate` columns were reassigned in the previous cell with
# the offensive model's output, so this file holds offensive labels under the
# `*_hate` names; the hate-model predictions only survive in em_ir_ht_data.csv.
data.to_csv("../data/reporting/four_classes_data.csv", index=False)

In [82]:
# Top-label counts for the 100 highest and 100 lowest `adm` rows.
# NOTE(review): the `*_hate` columns read here were overwritten with the
# offensive model's predictions two cells above, so the counts are
# offensive / not-offensive labels despite the column names.
print_column_narc_count(data,'adm','tr_hate', n = 100)
print("\n")
print_column_narc_count(data,'adm','ab_hate', n = 100)
print("\n")
print_column_narc_count(data,'adm','ai_hate', n = 100)

Emotion count for 100 highest adm tr_hate posts
Min adm: 4.0
Max adm: 5.889
{'not-offensive': 100}


Emotion count for 100 lowest adm tr_hate posts
Min adm: 1.111
Max adm: 2.556
{'not-offensive': 99, 'offensive': 1}


Emotion count for 100 highest adm ab_hate posts
Min adm: 4.0
Max adm: 5.889
{'not-offensive': 84, 'offensive': 16}


Emotion count for 100 lowest adm ab_hate posts
Min adm: 1.111
Max adm: 2.556
{'not-offensive': 64, 'offensive': 36}


Emotion count for 100 highest adm ai_hate posts
Min adm: 4.0
Max adm: 5.889
{'not-offensive': 98, 'offensive': 2}


Emotion count for 100 lowest adm ai_hate posts
Min adm: 1.111
Max adm: 2.556
{'not-offensive': 98, 'offensive': 2}


In [83]:
# Top-label counts for the 100 highest and 100 lowest `riv` rows.
# NOTE(review): the `*_hate` columns read here hold the offensive model's
# predictions at this point in the notebook (they were reassigned in the
# Offensive section), so the counts are offensive / not-offensive labels.
print_column_narc_count(data,'riv','tr_hate', n = 100)
print("\n")
print_column_narc_count(data,'riv','ab_hate', n = 100)
print("\n")
print_column_narc_count(data,'riv','ai_hate', n = 100)

Emotion count for 100 highest riv tr_hate posts
Min riv: 2.333
Max riv: 5.111
{'not-offensive': 100}


Emotion count for 100 lowest riv tr_hate posts
Min riv: 1.0
Max riv: 1.556
{'not-offensive': 100}


Emotion count for 100 highest riv ab_hate posts
Min riv: 2.333
Max riv: 5.111
{'not-offensive': 78, 'offensive': 22}


Emotion count for 100 lowest riv ab_hate posts
Min riv: 1.0
Max riv: 1.556
{'not-offensive': 78, 'offensive': 22}


Emotion count for 100 highest riv ai_hate posts
Min riv: 2.333
Max riv: 5.111
{'not-offensive': 96, 'offensive': 4}


Emotion count for 100 lowest riv ai_hate posts
Min riv: 1.0
Max riv: 1.556
{'not-offensive': 96, 'offensive': 4}
