In [22]:
import csv
import urllib.request

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification

# Task name; it is interpolated into the label-mapping URL further down.
task='sentiment'
# Checkpoint identifier passed to from_pretrained() for the tokenizer and, later, the model.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

# Tokenizer belonging to the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Load data

In [3]:
# Read the per-user tweet bundles and the per-user ground-truth classes
# (both files are JSON Lines: one JSON object per line).
text_df = pd.read_json("raw_data/train_text.json",lines=True)
label_df = pd.read_json("raw_data/train_truth.json", lines=True)

# The column-wise concat below pairs rows purely by position, so verify that
# both files list the same users in the same order before trusting the labels.
assert (
    text_df["twitter user id"].astype(str).tolist()
    == label_df["twitter user id"].astype(str).tolist()
), "train_text.json and train_truth.json are not row-aligned on 'twitter user id'"

# Side-by-side frame: text columns first, then the label columns.
df = pd.concat([text_df,label_df],axis=1)

In [4]:
# Inspect column names, non-null counts and dtypes of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   twitter user id  160 non-null    object
 1   texts            160 non-null    object
 2   tweet ids        160 non-null    object
 3   twitter user id  160 non-null    object
 4   class            160 non-null    object
dtypes: object(5)
memory usage: 6.4+ KB


In [5]:
# Keep only the tweet bundles and the class label. Selecting by name (rather
# than dropping positions 0, 2 and 3) yields the same two columns and makes the
# cell safe to re-run.
df = df[["texts", "class"]]

In [7]:
# Flatten the per-user bundles: one entry per individual tweet, each carrying
# the class of the user it came from.
tweet, label = [], []
for user_texts, user_class in zip(df['texts'], df['class']):
    for entry in user_texts:
        tweet.append(entry['text'])
        label.append(user_class)

In [8]:
# Wrap the flat lists in single-column DataFrames.
# NOTE: this rebinds `tweet` and `label` from lists to DataFrames.
tweet = pd.DataFrame(tweet)
label = pd.DataFrame(label)

In [9]:
# Place the tweet column and the label column side by side (paired by row position).
sent_df = pd.concat([tweet,label],axis=1)

In [17]:
# Give the two columns descriptive names.
sent_df.columns = ["tweet", "label"]

In [18]:
# Display the per-tweet table (tweet text and user class).
sent_df

Unnamed: 0,tweet,label
0,RT @AroundMyCitys: #Ape $Ape\n\nGive some of t...,nano
1,@shawndaddio I can’t see a single valid reason...,nano
2,RT @quasimondo: Now we just need to knock our ...,nano
3,☝️would love some input from #tezos #cleannft ...,nano
4,RT @momomeatmaker: Fresh drop 💜\n- 3 Men Pleas...,nano
...,...,...
924,RT @CryptoAnglio: Ok giving away 0.5 $SOL and ...,nano
925,RT @iamdagtw: Is not $SOL vs $ETH \nIs $SOL 🤝 ...,nano
926,RT @YakuCorp: 🎁 Vanguards x Yaku Corp 🎁 \n\nPr...,nano
927,RT @MagicDegenSOL: Giving away a y00ts t00b NF...,nano


In [11]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links with fixed placeholder tokens.

    The text is split on single spaces. A piece that starts with ``@`` and is
    longer than one character becomes ``@user``; a piece that starts with
    ``http`` becomes ``http``. All other pieces are kept as they are, and the
    pieces are re-joined with single spaces.
    """
    def _placeholder(token):
        # A token cannot start with both '@' and 'http', so check order is irrelevant.
        if token.startswith('http'):
            return 'http'
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        return token

    return " ".join(map(_placeholder, text.split(" ")))

In [12]:
# Download the label mapping for the selected task: a tab-separated text file
# whose second field on each line is the label name.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as response:
    mapping_lines = response.read().decode('utf-8').split("\n")

# Collect the second field of every line that has at least two fields
# (this skips empty lines, e.g. the one after a trailing newline).
labels = []
for fields in csv.reader(mapping_lines, delimiter='\t'):
    if len(fields) > 1:
        labels.append(fields[1])

In [13]:
# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

# PT (PyTorch): load the sequence-classification model for the checkpoint named by MODEL.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [26]:
def sentiment_analyze(text):
    """Return the index of the top-scoring class for a single tweet.

    The text is run through ``preprocess`` and then through the module-level
    ``tokenizer`` and ``model``. The result is the ``np.argmax`` over the
    softmax-normalised scores for that one input.

    Args:
        text: Raw tweet text.

    Returns:
        The class index (a NumPy integer) with the highest score.
    """
    text = preprocess(text)
    # Truncate explicitly so an unusually long input cannot exceed the model's
    # position limit and raise. 512 is the usual limit for RoBERTa-base
    # checkpoints -- TODO confirm against model.config.max_position_embeddings.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    # Inference only: skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
        output = model(**encoded_input)
    # First element of the output, first (only) item of the batch.
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    label = np.argmax(scores)
    return label

# Classify every tweet. A comprehension keeps its loop variable local, so the
# module-level `tweet` object created in an earlier cell is not overwritten
# by the last tweet string.
predicted_sentiment = [sentiment_analyze(tweet_text) for tweet_text in sent_df['tweet']]

In [32]:
# Attach the predicted class index to each tweet (modifies sent_df in place).
sent_df['sentiment'] = predicted_sentiment

In [33]:
# Display the table again, now including the predicted sentiment column.
sent_df

Unnamed: 0,tweet,label,sentiment
0,RT @AroundMyCitys: #Ape $Ape\n\nGive some of t...,nano,2
1,@shawndaddio I can’t see a single valid reason...,nano,0
2,RT @quasimondo: Now we just need to knock our ...,nano,2
3,☝️would love some input from #tezos #cleannft ...,nano,2
4,RT @momomeatmaker: Fresh drop 💜\n- 3 Men Pleas...,nano,2
...,...,...,...
924,RT @CryptoAnglio: Ok giving away 0.5 $SOL and ...,nano,2
925,RT @iamdagtw: Is not $SOL vs $ETH \nIs $SOL 🤝 ...,nano,1
926,RT @YakuCorp: 🎁 Vanguards x Yaku Corp 🎁 \n\nPr...,nano,1
927,RT @MagicDegenSOL: Giving away a y00ts t00b NF...,nano,1


In [60]:
# Count tweets for every (user class, predicted sentiment) combination.
contigency_table = pd.crosstab(index=sent_df['label'], columns=sent_df['sentiment'])

In [61]:
# Show the observed counts.
contigency_table

sentiment,0,1,2
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
macro,24,115,90
mega,26,110,122
micro,10,106,71
nano,7,96,58
no influencer,10,45,39


In [56]:
from scipy.stats import chi2_contingency

In [57]:
# Chi-square test on the observed label x sentiment counts.
observed_counts = contigency_table.to_numpy()
chi2, p, dof, expected = chi2_contingency(observed_counts)

# One print call, one item per line.
print(
    f"chi2 statistic:     {chi2:.5g}",
    f"p-value:            {p:.5g}",
    f"degrees of freedom: {dof}",
    "expected frequencies:",
    expected,
    sep="\n",
)

chi2 statistic:     19.272
p-value:            0.013469
degrees of freedom: 8
expected frequencies:
[[ 18.98062433 116.34876211  93.67061356]
 [ 21.38428418 131.08288482 105.532831  ]
 [ 15.49946179  95.00968784  76.49085038]
 [ 13.3444564   81.79978471  65.85575888]
 [  7.7911733   47.75888052  38.44994618]]


In [58]:
# Significance level used to decide whether to reject H0 (independence).
alpha = 0.05
print("p value is " + str(p))
if p <= alpha:
    print('Dependent (reject H0)')
else:
    # A p-value above alpha means the data give no evidence against
    # independence; it does not prove that H0 is true.
    print('Independent (fail to reject H0)')

p value is 0.013469076520690551
Dependent (reject H0)
