In [None]:
from google.colab import drive, userdata
from huggingface_hub import login

# Expose Google Drive under /content/drive, then authenticate with the
# Hugging Face Hub using the token stored in the Colab secret 'HF_TOKEN'.
drive.mount("/content/drive")
login(token=userdata.get("HF_TOKEN"))

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from openpyxl import load_workbook
from sklearn.metrics import confusion_matrix
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax

In [None]:
# Sentiment checkpoint used in this section:
# https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest
# `tokenizer`, `config` and `r_model` are module-level names read by
# compute_sentiment() and handle_long_text() below.
roberta = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(roberta)
config = AutoConfig.from_pretrained(roberta)
r_model = AutoModelForSequenceClassification.from_pretrained(roberta)

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def compute_sentiment(encoded_input):
    """Classify one already-tokenized input and return its sentiment label.

    Args:
        encoded_input: Mapping of keyword arguments forwarded to ``r_model``
            (at least ``input_ids`` with a leading batch dimension).

    Returns:
        The ``config.id2label`` entry for the highest-scoring class of the
        first sequence in the batch, or ``None`` when the forward pass raises
        (the error is printed rather than propagated, so a single bad row does
        not abort a whole ``DataFrame.apply``).
    """
    # Local import: torch is not imported at the top of this notebook.
    import torch

    try:
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            logits = r_model(**encoded_input)[0][0]
        scores = softmax(logits.cpu().numpy())
        # Plain int so the lookup does not rely on NumPy scalar hashing.
        return config.id2label[int(np.argmax(scores))]
    except Exception as e:
        print(f"Error {e} processing encoded input: {encoded_input}")
        return None

def _max_input_tokens(default=512):
    """Return the longest sequence (special tokens included) the loaded model should be fed.

    The result is the smallest of ``default``, the tokenizer's
    ``model_max_length`` and the usable position embeddings from ``config``.
    """
    limits = [default]

    tokenizer_limit = getattr(tokenizer, 'model_max_length', None)
    if tokenizer_limit:
        limits.append(int(tokenizer_limit))

    positions = getattr(config, 'max_position_embeddings', None)
    if positions:
        # NOTE(review): RoBERTa-family models are understood to start position
        # ids at pad_token_id + 1, leaving that many slots unusable - confirm
        # against the model documentation.
        if getattr(config, 'model_type', '') in ('roberta', 'xlm-roberta', 'camembert'):
            positions -= (getattr(config, 'pad_token_id', None) or 0) + 1
        limits.append(int(positions))

    return max(min(limits), 1)


def handle_long_text(text):
    """Return the sentiment label for ``text``, splitting it when it exceeds the model window.

    The text is tokenized once without special tokens, cut into consecutive
    chunks that fit the model's window, and each chunk is wrapped in the
    tokenizer's own special tokens before being scored with
    ``compute_sentiment``. The final label is the most frequent chunk label;
    ties go to the label that appears first in the text.

    Args:
        text: Raw text to classify.

    Returns:
        A sentiment label, or ``None`` when ``text`` is not a string or no
        chunk could be scored.
    """
    # Local imports: neither name is imported at the top of this notebook.
    from collections import Counter
    import torch

    # Missing values (NaN/None) cannot be tokenized; report "no label" instead of raising.
    if not isinstance(text, str):
        return None

    max_length = _max_input_tokens()
    token_ids = tokenizer(text, add_special_tokens=False)['input_ids']

    # Reserve room for the special tokens added around every chunk.
    budget = max(max_length - tokenizer.num_special_tokens_to_add(pair=False), 1)
    # `or [[]]` keeps one (empty) chunk so empty text is still scored once.
    chunks = [token_ids[i:i + budget] for i in range(0, len(token_ids), budget)] or [[]]

    sentiments = []
    for chunk in chunks:
        ids = tokenizer.build_inputs_with_special_tokens(chunk)
        input_ids = torch.tensor([ids], dtype=torch.long)
        part_encoded_input = {
            'input_ids': input_ids,
            'attention_mask': torch.ones_like(input_ids),
        }
        sentiment = compute_sentiment(part_encoded_input)
        # Failed chunks must not take part in the vote.
        if sentiment is not None:
            sentiments.append(sentiment)

    if not sentiments:
        return None
    # most_common() orders equal counts by first occurrence, so the vote is deterministic.
    return Counter(sentiments).most_common(1)[0][0]

In [None]:
# Download the Google Sheet as an .xlsx export and load one worksheet.
GSHEET_KEY = "1mp8m0ge_ETmeZCWz4IRQk8l4ZAZqKEgHDC_0_eYa-Ys"
sheet_name = "Sheet1"
url = "https://docs.google.com/spreadsheet/ccc?key=" + GSHEET_KEY + "&output=xlsx"
df = pd.read_excel(url, sheet_name=sheet_name)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23180 entries, 0 to 23179
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   project         23180 non-null  object 
 1   issue_number    23180 non-null  int64  
 2   issue_type      23180 non-null  object 
 3   text            23036 non-null  object 
 4   classification  23180 non-null  object 
 5   indicator       23180 non-null  object 
 6   zz_created      16933 non-null  object 
 7   zz_updated      16933 non-null  object 
 8   zz_resolved     16723 non-null  object 
 9   zz_duration     16723 non-null  float64
 10  zz_text         16787 non-null  object 
 11  zz_wink_a       23180 non-null  float64
 12  zz_wink_b       16788 non-null  float64
dtypes: float64(3), int64(1), object(9)
memory usage: 2.3+ MB


In [None]:
# Discard rows that have no value in 'zz_resolved'.
df = df.dropna(axis=0, subset=['zz_resolved'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16723 entries, 0 to 23179
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   project         16723 non-null  object 
 1   issue_number    16723 non-null  int64  
 2   issue_type      16723 non-null  object 
 3   text            16579 non-null  object 
 4   classification  16723 non-null  object 
 5   indicator       16723 non-null  object 
 6   zz_created      16723 non-null  object 
 7   zz_updated      16723 non-null  object 
 8   zz_resolved     16723 non-null  object 
 9   zz_duration     16723 non-null  float64
 10  zz_text         16577 non-null  object 
 11  zz_wink_a       16723 non-null  float64
 12  zz_wink_b       16578 non-null  float64
dtypes: float64(3), int64(1), object(9)
memory usage: 1.8+ MB


In [None]:
# Keep every row whose classification is anything other than 'non_debt',
# then renumber the index from 0.
df = (
    df.loc[df['classification'].ne('non_debt')]
      .reset_index(drop=True)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2773 entries, 0 to 2772
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   project         2773 non-null   object 
 1   issue_number    2773 non-null   int64  
 2   issue_type      2773 non-null   object 
 3   text            2773 non-null   object 
 4   classification  2773 non-null   object 
 5   indicator       2773 non-null   object 
 6   zz_created      2773 non-null   object 
 7   zz_updated      2773 non-null   object 
 8   zz_resolved     2773 non-null   object 
 9   zz_duration     2773 non-null   float64
 10  zz_text         2773 non-null   object 
 11  zz_wink_a       2773 non-null   float64
 12  zz_wink_b       2773 non-null   float64
dtypes: float64(3), int64(1), object(9)
memory usage: 281.8+ KB


In [None]:
# Score every text with the loaded model. handle_long_text returns a single
# label per row, so it is applied directly; wrapping each result in a
# pd.Series would make apply() build a one-column DataFrame for no benefit.
df['roberta'] = df['text'].apply(handle_long_text)

In [None]:
# Spot-check the first rows, including the new 'roberta' column.
df.head(n=5)

Unnamed: 0,project,issue_number,issue_type,text,classification,indicator,zz_created,zz_updated,zz_resolved,zz_duration,zz_text,zz_wink_a,zz_wink_b,roberta
0,camel,10153,description,Camel 2.17.x upgraded spring version to 4.x in...,architecture_debt,using_obsolete_technology,"Mon, 18 Jul 2016 06:14:14 +0000","Sat, 20 Aug 2016 13:44:18 +0000","Sat, 20 Aug 2016 13:44:17 +0000",2878203.0,Camel 2.17.x upgraded spring version to 4.x in...,-0.013222,-0.013222,neutral
1,camel,11734,description,It'd be nice if we could upgrade camel-grpc to...,architecture_debt,using_obsolete_technology,"Fri, 1 Sep 2017 06:17:00 +0000","Fri, 8 Sep 2017 08:49:52 +0000","Fri, 8 Sep 2017 08:49:52 +0000",613972.0,It'd be nice if we could upgrade camel-grpc to...,0.675,0.675,positive
2,camel,11868,description,The current java transport client is due EOL i...,architecture_debt,using_obsolete_technology,"Sat, 30 Sep 2017 05:29:19 +0000","Wed, 18 Oct 2017 06:54:13 +0000","Wed, 18 Oct 2017 06:54:13 +0000",1560294.0,The current java transport client is due EOL i...,0.30025,0.2402,neutral
3,camel,2535,description,As we don't use the CxfSoap component any more...,architecture_debt,using_obsolete_technology,"Wed, 10 Mar 2010 10:06:54 +0000","Sun, 24 Apr 2011 10:01:27 +0000","Thu, 11 Mar 2010 09:13:31 +0000",83197.0,As we don't use the CxfSoap component any more...,0.4,0.4,neutral
4,camel,2670,comment_1,"@ Charles I just checked out the example, I th...",architecture_debt,violation_of_modularity,"Fri, 23 Apr 2010 08:17:35 +0000","Mon, 26 Apr 2010 07:54:46 +0000","Fri, 23 Apr 2010 08:20:09 +0000",154.0,"@ Charles I just checked out the example, I th...",0.0,0.0,neutral


In [None]:
# Persist the scored frame to Google Drive (header row, no index column).
csv_file = "/content/drive/MyDrive/Uni stuff/Evidence-based/gerard/issues-v1/issue-v1-scored.csv"
df.to_csv(path_or_buf=csv_file, header=True, index=False)

In [None]:
# Read the file back to confirm what was written.
data = pd.read_csv(filepath_or_buffer=csv_file)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2773 entries, 0 to 2772
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   project         2773 non-null   object 
 1   issue_number    2773 non-null   int64  
 2   issue_type      2773 non-null   object 
 3   text            2773 non-null   object 
 4   classification  2773 non-null   object 
 5   indicator       2773 non-null   object 
 6   zz_created      2773 non-null   object 
 7   zz_updated      2773 non-null   object 
 8   zz_resolved     2773 non-null   object 
 9   zz_duration     2773 non-null   float64
 10  zz_text         2773 non-null   object 
 11  zz_wink_a       2773 non-null   float64
 12  zz_wink_b       2773 non-null   float64
 13  roberta         2773 non-null   object 
dtypes: float64(3), int64(1), object(10)
memory usage: 303.4+ KB


# BERTweet

In [None]:
from google.colab import drive, userdata
from huggingface_hub import login

# Expose Google Drive under /content/drive.
drive.mount("/content/drive")
# Hub login is left disabled in this section:
# login(userdata.get('HF_TOKEN'))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from openpyxl import load_workbook
from sklearn.metrics import confusion_matrix
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax

In [None]:
# Sentiment checkpoint used in this section:
# https://huggingface.co/finiteautomata/bertweet-base-sentiment-analysis
# NOTE: this rebinds `tokenizer`, `config` and `r_model`, which
# compute_sentiment() and handle_long_text() read as module-level names.
# Loading the tokenizer reports that the optional `emoji` package is missing;
# without it emoticons/emojis are not converted to text.
model = "finiteautomata/bertweet-base-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model)
config = AutoConfig.from_pretrained(model)
r_model = AutoModelForSequenceClassification.from_pretrained(model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/338 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


config.json:   0%|          | 0.00/949 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

In [None]:
def compute_sentiment(encoded_input):
    """Classify one already-tokenized input and return its sentiment label.

    Args:
        encoded_input: Mapping of keyword arguments forwarded to ``r_model``
            (at least ``input_ids`` with a leading batch dimension).

    Returns:
        The ``config.id2label`` entry for the highest-scoring class of the
        first sequence in the batch, or ``None`` when the forward pass raises
        (the error is printed rather than propagated, so a single bad row does
        not abort a whole ``DataFrame.apply``).
    """
    # Local import: torch is not imported at the top of this notebook.
    import torch

    try:
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            logits = r_model(**encoded_input)[0][0]
        scores = softmax(logits.cpu().numpy())
        # Plain int so the lookup does not rely on NumPy scalar hashing.
        return config.id2label[int(np.argmax(scores))]
    except Exception as e:
        print(f"Error {e} processing encoded input: {encoded_input}")
        return None

def _max_input_tokens(default=512):
    """Return the longest sequence (special tokens included) the loaded model should be fed.

    The result is the smallest of ``default``, the tokenizer's
    ``model_max_length`` and the usable position embeddings from ``config``.
    """
    limits = [default]

    tokenizer_limit = getattr(tokenizer, 'model_max_length', None)
    if tokenizer_limit:
        limits.append(int(tokenizer_limit))

    positions = getattr(config, 'max_position_embeddings', None)
    if positions:
        # NOTE(review): RoBERTa-family models are understood to start position
        # ids at pad_token_id + 1, leaving that many slots unusable - confirm
        # against the model documentation.
        if getattr(config, 'model_type', '') in ('roberta', 'xlm-roberta', 'camembert'):
            positions -= (getattr(config, 'pad_token_id', None) or 0) + 1
        limits.append(int(positions))

    return max(min(limits), 1)


def handle_long_text(text):
    """Return the sentiment label for ``text``, splitting it when it exceeds the model window.

    The text is tokenized once without special tokens, cut into consecutive
    chunks that fit the model's window, and each chunk is wrapped in the
    tokenizer's own special tokens before being scored with
    ``compute_sentiment``. The final label is the most frequent chunk label;
    ties go to the label that appears first in the text.

    The window is derived from the loaded tokenizer/config instead of a fixed
    512 tokens. NOTE(review): BERTweet checkpoints are understood to accept
    fewer tokens than that - confirm against the model card.

    Args:
        text: Raw text to classify.

    Returns:
        A sentiment label, or ``None`` when ``text`` is not a string or no
        chunk could be scored.
    """
    # Local imports: neither name is imported at the top of this notebook.
    from collections import Counter
    import torch

    # Missing values (NaN/None) cannot be tokenized; report "no label" instead of raising.
    if not isinstance(text, str):
        return None

    max_length = _max_input_tokens()
    token_ids = tokenizer(text, add_special_tokens=False)['input_ids']

    # Reserve room for the special tokens added around every chunk.
    budget = max(max_length - tokenizer.num_special_tokens_to_add(pair=False), 1)
    # `or [[]]` keeps one (empty) chunk so empty text is still scored once.
    chunks = [token_ids[i:i + budget] for i in range(0, len(token_ids), budget)] or [[]]

    sentiments = []
    for chunk in chunks:
        ids = tokenizer.build_inputs_with_special_tokens(chunk)
        input_ids = torch.tensor([ids], dtype=torch.long)
        part_encoded_input = {
            'input_ids': input_ids,
            'attention_mask': torch.ones_like(input_ids),
        }
        sentiment = compute_sentiment(part_encoded_input)
        # Failed chunks must not take part in the vote.
        if sentiment is not None:
            sentiments.append(sentiment)

    if not sentiments:
        return None
    # most_common() orders equal counts by first occurrence, so the vote is deterministic.
    return Counter(sentiments).most_common(1)[0][0]

In [None]:
# Download the Google Sheet as an .xlsx export and load one worksheet.
GSHEET_KEY = "1l7ydiWdw5AdyUVO8GLHLt0OIVjzspGeD8PZV8_gAqdQ"
sheet_name = "issue-v1-scored"  # worksheet to read; change to load a different tab
url = "https://docs.google.com/spreadsheets/d/" + GSHEET_KEY + "/export?format=xlsx"
df = pd.read_excel(url, sheet_name=sheet_name)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2773 entries, 0 to 2772
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   project         2773 non-null   object 
 1   issue_number    2773 non-null   int64  
 2   issue_type      2773 non-null   object 
 3   text            2773 non-null   object 
 4   classification  2773 non-null   object 
 5   indicator       2773 non-null   object 
 6   zz_created      2773 non-null   object 
 7   zz_updated      2773 non-null   object 
 8   zz_resolved     2773 non-null   object 
 9   zz_duration     2773 non-null   int64  
 10  zz_text         2773 non-null   object 
 11  zz_wink_a       2773 non-null   float64
 12  zz_wink_b       2773 non-null   float64
 13  roberta         2773 non-null   object 
dtypes: float64(2), int64(2), object(10)
memory usage: 303.4+ KB


In [None]:
# Discard rows that have no value in 'zz_resolved'.
df = df.dropna(axis=0, subset=['zz_resolved'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2773 entries, 0 to 2772
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   project         2773 non-null   object 
 1   issue_number    2773 non-null   int64  
 2   issue_type      2773 non-null   object 
 3   text            2773 non-null   object 
 4   classification  2773 non-null   object 
 5   indicator       2773 non-null   object 
 6   zz_created      2773 non-null   object 
 7   zz_updated      2773 non-null   object 
 8   zz_resolved     2773 non-null   object 
 9   zz_duration     2773 non-null   int64  
 10  zz_text         2773 non-null   object 
 11  zz_wink_a       2773 non-null   float64
 12  zz_wink_b       2773 non-null   float64
 13  roberta         2773 non-null   object 
dtypes: float64(2), int64(2), object(10)
memory usage: 303.4+ KB


In [None]:
# Keep every row whose classification is anything other than 'non_debt',
# then renumber the index from 0.
df = (
    df.loc[df['classification'].ne('non_debt')]
      .reset_index(drop=True)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2773 entries, 0 to 2772
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   project         2773 non-null   object 
 1   issue_number    2773 non-null   int64  
 2   issue_type      2773 non-null   object 
 3   text            2773 non-null   object 
 4   classification  2773 non-null   object 
 5   indicator       2773 non-null   object 
 6   zz_created      2773 non-null   object 
 7   zz_updated      2773 non-null   object 
 8   zz_resolved     2773 non-null   object 
 9   zz_duration     2773 non-null   int64  
 10  zz_text         2773 non-null   object 
 11  zz_wink_a       2773 non-null   float64
 12  zz_wink_b       2773 non-null   float64
 13  roberta         2773 non-null   object 
dtypes: float64(2), int64(2), object(10)
memory usage: 303.4+ KB


In [None]:
# Score every text with the loaded model. handle_long_text returns a single
# label per row, so it is applied directly; wrapping each result in a
# pd.Series would make apply() build a one-column DataFrame for no benefit.
df['bert'] = df['text'].apply(handle_long_text)

In [None]:
# Inspect the first 1000 rows, including the new 'bert' column.
df.head(n=1000)

Unnamed: 0,project,issue_number,issue_type,text,classification,indicator,zz_created,zz_updated,zz_resolved,zz_duration,zz_text,zz_wink_a,zz_wink_b,roberta,bert
0,thrift,18,description,"gcc 4.2 shows a huge amount of warnings ""warni...",code_debt,low_quality_code,"Wed, 21 May 2008 14:50:42 +0000","Tue, 1 Nov 2011 02:54:27 +0000","Tue, 27 May 2008 02:07:45 +0000",472623,"gcc 4.2 shows a huge amount of warnings ""warni...",-0.233333,-0.233333,negative,NEG
1,thrift,18,summary,warning: deprecated conversion from string con...,code_debt,low_quality_code,"Wed, 21 May 2008 14:50:42 +0000","Tue, 1 Nov 2011 02:54:27 +0000","Tue, 27 May 2008 02:07:45 +0000",472623,warning: deprecated conversion from string con...,-0.600000,-0.600000,negative,NEU
2,hbase,21,summary,hbase jar has hbase-default.xml at top-level r...,architecture_debt,violation_of_modularity,"Fri, 1 Feb 2008 04:44:28 +0000","Fri, 22 Aug 2008 21:13:04 +0000","Thu, 15 May 2008 19:29:22 +0000",9038694,hbase jar has hbase-default.xml at top-level r...,-0.050000,-0.050000,neutral,NEU
3,thrift,21,comment_2,I'm going to go ahead and say this is a non-is...,defect_debt,uncorrected_known_defects,"Mon, 26 May 2008 23:45:00 +0000","Thu, 26 Jun 2008 17:33:40 +0000","Thu, 26 Jun 2008 17:33:40 +0000",2656120,I'm going to go ahead and say this is a non-is...,0.243750,0.243750,neutral,NEU
4,thrift,21,comment_0,I Googled around for this a bit and found this...,requirement_debt,non-functional_requirements_not_fully_satisfied,"Mon, 26 May 2008 23:45:00 +0000","Thu, 26 Jun 2008 17:33:40 +0000","Thu, 26 Jun 2008 17:33:40 +0000",2656120,I Googled around for this a bit and found this...,0.200000,0.200000,negative,NEU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,thrift,3619,comment_3,This workaround will break gtest on OSX with c...,code_debt,low_quality_code,"Fri, 12 Feb 2016 18:54:01 +0000","Mon, 22 Feb 2016 13:17:21 +0000","Thu, 18 Feb 2016 22:04:34 +0000",529833,This workaround will break gtest on OSX with c...,0.375000,0.300000,negative,
996,thrift,3619,comment_4,"â€” yes, see what I had to do for Parquet: It ...",design_debt,non-optimal_design,"Fri, 12 Feb 2016 18:54:01 +0000","Mon, 22 Feb 2016 13:17:21 +0000","Thu, 18 Feb 2016 22:04:34 +0000",529833,"richardtsai — yes, see what I had to do for Pa...",0.085000,0.085000,neutral,NEG
997,thrift,3619,comment_1,"Ah, I misread Mark's comment, apologies. Let m...",documentation_debt,low_quality_documentation,"Fri, 12 Feb 2016 18:54:01 +0000","Mon, 22 Feb 2016 13:17:21 +0000","Thu, 18 Feb 2016 22:04:34 +0000",529833,"Ah, I misread Mark's comment, apologies. Let m...",-0.100000,-0.100000,neutral,NEU
998,hbase,3625,description,Currently the surefire plugin configuration de...,design_debt,non-optimal_design,"Fri, 11 Mar 2011 07:22:17 +0000","Fri, 20 Nov 2015 12:41:35 +0000","Tue, 15 Mar 2011 01:11:43 +0000",323366,Currently the surefire plugin configuration de...,-0.154167,-0.052778,negative,NEU


In [None]:
# Persist the scored frame to Google Drive (header row, no index column).
csv_file = "/content/drive/MyDrive/Evidence-based/gerard/issues-v1/issue-v2-scored.csv"
df.to_csv(path_or_buf=csv_file, header=True, index=False)

In [None]:
# Read the file back to confirm what was written.
data = pd.read_csv(filepath_or_buffer=csv_file)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2773 entries, 0 to 2772
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   project         2773 non-null   object 
 1   issue_number    2773 non-null   int64  
 2   issue_type      2773 non-null   object 
 3   text            2773 non-null   object 
 4   classification  2773 non-null   object 
 5   indicator       2773 non-null   object 
 6   zz_created      2773 non-null   object 
 7   zz_updated      2773 non-null   object 
 8   zz_resolved     2773 non-null   object 
 9   zz_duration     2773 non-null   int64  
 10  zz_text         2773 non-null   object 
 11  zz_wink_a       2773 non-null   float64
 12  zz_wink_b       2773 non-null   float64
 13  roberta         2773 non-null   object 
 14  bert            2342 non-null   object 
dtypes: float64(2), int64(2), object(11)
memory usage: 325.1+ KB
