In [None]:
from google.colab import drive, userdata
from huggingface_hub import login

# Expose Google Drive inside the Colab VM; the dataset path used later lives under this mount point.
drive.mount('/content/drive')

# Authenticate against the Hugging Face Hub with the token kept in Colab's secret store,
# so no credential is ever written into the notebook itself.
login(token=userdata.get('HF_TOKEN'))

Mounted at /content/drive


In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax
import pandas as pd
import numpy as np

In [None]:
# Model card: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest
roberta = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Tokenizer, config and classifier are all loaded from the same checkpoint id.
# `config` is kept as a separate global because compute_sentiment reads
# `config.id2label` to turn a predicted class index into a label.
tokenizer = AutoTokenizer.from_pretrained(roberta)
config = AutoConfig.from_pretrained(roberta)
r_model = AutoModelForSequenceClassification.from_pretrained(roberta)

# Optional: persist the downloaded weights to Drive to avoid re-downloading.
# r_model.save_pretrained(f"/content/drive/MyDrive/Uni stuff/Evidence-based/{roberta}")

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def compute_sentiment(model, encoded_input):
    """Classify one already-tokenized input and return its top label and probability.

    Args:
        model: Callable classifier. It is invoked as ``model(**encoded_input)``
            and the first element of its output is read as a batch of logits.
        encoded_input: Mapping of keyword arguments for ``model`` (for example
            a tokenizer result containing ``input_ids``). Only the first row of
            the batch is scored.

    Returns:
        ``(sentiment, top_score)`` where ``sentiment`` is the ``id2label`` entry
        of the highest-scoring class and ``top_score`` is its softmax
        probability. Returns ``(None, None)`` when anything goes wrong; the
        error is printed rather than raised so that a batch run can continue.
    """
    import torch  # local import: the notebook's import cell does not bring torch into scope

    try:
        # Pure inference: without no_grad() every forward pass records an
        # autograd graph, which wastes memory and time.
        with torch.no_grad():
            output = model(**encoded_input)

        # .cpu() is a no-op for CPU tensors and makes .numpy() legal for GPU ones.
        scores = output[0][0].detach().cpu().numpy()
        scores = softmax(scores)
        best_index = int(np.argmax(scores))

        # Prefer the label map that belongs to the model actually passed in;
        # fall back to the notebook-level `config` for objects without one.
        label_source = getattr(model, "config", None)
        if label_source is None:
            label_source = config

        sentiment = label_source.id2label[best_index]
        return sentiment, scores[best_index]
    except Exception as e:
        print(f"Error {e} processing encoded input: {encoded_input}")
        return None, None

def handle_long_text(model, text):
    """Score ``text``, splitting it into several model-sized windows when it is too long.

    Texts that tokenize to at most 512 ids are passed to ``compute_sentiment``
    unchanged. Longer texts are cut into consecutive windows, each window is
    scored on its own, and the results are combined by majority vote.

    Args:
        model: Classifier forwarded to ``compute_sentiment``.
        text: Raw string to classify.

    Returns:
        ``(sentiment, score)``. For long texts ``sentiment`` is the label that
        won the most windows (ties go to the label with the higher score) and
        ``score`` is the best score among the windows that predicted that same
        label. ``(None, None)`` is returned when no window could be scored.
    """
    import torch  # local import: the notebook's import cell does not bring torch into scope

    max_length = 512
    encoded_input = tokenizer(text, return_tensors='pt')
    input_ids = encoded_input['input_ids'][0]

    if len(input_ids) <= max_length:
        return compute_sentiment(model, encoded_input)

    # Assumes the tokenizer wraps a single sequence in exactly one leading and
    # one trailing special token (the "<s> ... </s>" layout of the RoBERTa
    # tokenizer loaded in this notebook). Every window gets its own pair so the
    # classifier always sees the start token at position 0, not only in the
    # first window.
    leading, trailing = input_ids[:1], input_ids[-1:]
    body = input_ids[1:-1]
    window = max_length - 2  # leave room for the two special tokens

    votes = {}
    best_score = {}
    for start in range(0, len(body), window):
        part = torch.cat([leading, body[start:start + window], trailing])
        part_encoded_input = {'input_ids': part.unsqueeze(0)}
        sentiment, score = compute_sentiment(model, part_encoded_input)
        if sentiment is None:
            # compute_sentiment already reported the failure; a failed window
            # must not vote and must not poison the score comparison below.
            continue
        votes[sentiment] = votes.get(sentiment, 0) + 1
        if sentiment not in best_score or score > best_score[sentiment]:
            best_score[sentiment] = score

    if not votes:
        return None, None

    # Deterministic majority vote; the reported score always belongs to the
    # winning label rather than to whichever window happened to score highest.
    sentiment = max(votes, key=lambda label: (votes[label], best_score[label]))
    return sentiment, best_score[sentiment]

In [None]:
# Path of the issue dataset on the mounted Drive; later cells reuse `csv_file`.
csv_file = "/content/drive/MyDrive/Uni stuff/Evidence-based/Dataset/satd-different-sources-data/satd-dataset-issues.csv"

# Load the full table and print its schema (column names, non-null counts, dtypes).
data = pd.read_csv(filepath_or_buffer=csv_file)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23180 entries, 0 to 23179
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   project         23180 non-null  object
 1   issue_number    23180 non-null  int64 
 2   issue_type      23180 non-null  object
 3   text            23036 non-null  object
 4   classification  23180 non-null  object
 5   indicator       23180 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.1+ MB


In [None]:
# Preview the first five rows to eyeball the column contents.
data.head(n=5)

Unnamed: 0,project,issue_number,issue_type,text,classification,indicator
0,camel,10009,comment_0,If you use then it works,non_debt,-
1,camel,10009,description,If you use spring and then refer to an endpoin...,non_debt,-
2,camel,10009,summary,Using <to> with id and ref fails,non_debt,-
3,camel,10022,comment_0,There is a basic health indicator now.,non_debt,-
4,camel,10022,comment_1,We need the health check API in camel-core for...,non_debt,-


In [None]:
# Rows with a missing `text` value cannot be scored, so keep only rows where it is present.
data = data[data['text'].notna()]

# Fixed-seed sample of 30 rows used as a quick smoke test before the full run.
test_df = data.sample(n=30, random_state=1)

In [None]:
# Score every sampled text; each (label, score) tuple is expanded into a two-column
# row so the result can be assigned to the two new columns in one step.
sample_predictions = test_df['text'].apply(
    lambda text: pd.Series(handle_long_text(r_model, text))
)
test_df[['sentiment', 'score']] = sample_predictions
test_df.head()

Unnamed: 0,project,issue_number,issue_type,text,classification,indicator,sentiment,score
18508,impala,1027,comment_0,"this is only for CTAS, right?",non_debt,-,neutral,0.940757
8003,gerrit,6215,description,1. View a file that has a collapsed section an...,non_debt,-,negative,0.609213
12604,hadoop,7684,comment_7,rpm package doesnt seem to include the history...,code_debt,low_quality_code,neutral,0.509126
14430,hbase,12833,comment_16,"I guess it could be, but won't connections sti...",test_debt,low_coverage,negative,0.550618
17600,hbase,6184,comment_8,This can be happened when region split. 0.94.x...,non_debt,-,neutral,0.755957


In [None]:
# Score the whole dataset in fixed-size chunks and flush every finished chunk to
# disk, so progress is persisted while the long-running loop is still going.
# NOTE(review): results are written back to `csv_file`, i.e. over the input
# dataset itself; an interrupted run leaves that file truncated. Keep a backup
# of the original CSV before running this cell.
chunk_size = 50
num_chunks = (len(data) + chunk_size - 1) // chunk_size  # ceiling division

for i in range(num_chunks):
    chunk = data.iloc[i * chunk_size:(i + 1) * chunk_size].copy()
    # handle_long_text is defined as (model, text); the model must be passed
    # explicitly, otherwise every call fails with a TypeError.
    chunk[['sentiment', 'score']] = chunk['text'].apply(
        lambda x: pd.Series(handle_long_text(r_model, x))
    )
    # The first chunk creates the file and writes the header; later chunks append.
    is_first_chunk = i == 0
    chunk.to_csv(
        csv_file,
        index=False,
        mode='w' if is_first_chunk else 'a',
        header=is_first_chunk,
    )
    print(f'Processed chunk {i + 1}/{num_chunks}')

Processed chunk 1/461
Processed chunk 1/461
Processed chunk 2/461
Processed chunk 2/461
Processed chunk 3/461
Processed chunk 3/461
Processed chunk 4/461
Processed chunk 4/461
Processed chunk 5/461
Processed chunk 5/461
Processed chunk 6/461
Processed chunk 6/461
Processed chunk 7/461
Processed chunk 7/461
Processed chunk 8/461
Processed chunk 8/461
Processed chunk 9/461
Processed chunk 9/461
Processed chunk 10/461
Processed chunk 10/461
Processed chunk 11/461
Processed chunk 11/461
Processed chunk 12/461
Processed chunk 12/461
Processed chunk 13/461
Processed chunk 13/461
Processed chunk 14/461
Processed chunk 14/461
Processed chunk 15/461
Processed chunk 15/461
Processed chunk 16/461
Processed chunk 16/461
Processed chunk 17/461
Processed chunk 17/461
Processed chunk 18/461
Processed chunk 18/461
Processed chunk 19/461
Processed chunk 19/461
Processed chunk 20/461
Processed chunk 20/461
Processed chunk 21/461
Processed chunk 21/461
Processed chunk 22/461
Processed chunk 22/461
Proces

In [None]:
# Read the CSV back from disk and print its schema, to check which columns it now holds.
data = pd.read_csv(filepath_or_buffer=csv_file)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23036 entries, 0 to 23035
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   project         23036 non-null  object 
 1   issue_number    23036 non-null  int64  
 2   issue_type      23036 non-null  object 
 3   text            23036 non-null  object 
 4   classification  23036 non-null  object 
 5   indicator       23036 non-null  object 
 6   sentiment       23036 non-null  object 
 7   score           23036 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 1.4+ MB
