In [1]:
# Shell escape: print the GPU, driver and CUDA versions visible to this kernel.
!nvidia-smi

Wed Feb 26 00:05:56 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.46                 Driver Version: 546.80       CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...    On  | 00000000:01:00.0 Off |                  N/A |
| N/A   58C    P8               3W /  35W |    424MiB /  4096MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import torch
from collections import Counter
from ast import literal_eval

In [3]:
# Logging: INFO for the root logger, while the "transformers" logger is
# raised to WARNING so only warnings and errors from it are shown.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Record whether torch can see a CUDA device and report it.
cuda_available = torch.cuda.is_available()
print('Cuda available? ', cuda_available)

Cuda available?  True


In [4]:
# Download the dataset helper module into the working directory so that it
# can be imported further down.
# NOTE(security): this pulls code from the `master` branch and the notebook
# then imports (executes) it. Consider pinning the URL to a commit hash and
# reviewing the file before running.
module_url = "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
# Copy the raw bytes instead of decoding to str and re-encoding with the
# platform default encoding: the file on disk stays byte-identical to the
# download regardless of OS/locale. The timeout keeps a stalled connection
# from hanging the kernel indefinitely.
with request.urlopen(module_url, timeout=30) as response, open(module_name, 'wb') as outf:
  outf.write(response.read())

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [5]:
# Import the helper class from dont_patronize_me.py, the file written to the
# working directory by the download cell (so that cell must have run first).
from dont_patronize_me import DontPatronizeMe
# Both constructor arguments are '.'; presumably the directories that hold
# the dataset files — TODO confirm against the module.
dpm = DontPatronizeMe('.', '.')
# Load the task-1 data. `dpm.train_task1_df` is read by a later cell, so it
# is presumably populated by this call — verify in the module.
dpm.load_task1()

In [6]:
# Paragraph-id lists for the train and dev splits. `par_id` is cast to str
# because it is compared against `data.par_id` later on (presumably that
# column holds strings — verify).
trids = pd.read_csv('train_semeval_parids-labels.csv').astype({'par_id': str})
teids = pd.read_csv('dev_semeval_parids-labels.csv').astype({'par_id': str})

# Task-1 frame exposed by the loader; both splits are rebuilt from it.
data = dpm.train_task1_df

data

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" She has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" Anja Ringgren Loven I ca n't find a word t...",1,4


# Rebuild training set

In [7]:
# Rebuild the training split: for every par_id listed in `trids`, fetch the
# matching paragraph (keyword, text, binary label, original label) from `data`.
#
# One indexed lookup replaces the previous per-id boolean scans of `data`
# (four full scans for each id), with the same semantics:
#   * rows come out in the order of `trids.par_id`;
#   * if a par_id occurs more than once in `data`, the first occurrence wins;
#   * a par_id that is missing from `data` raises (KeyError) rather than
#     being silently dropped.
paragraphs_by_id = data.drop_duplicates(subset='par_id').set_index('par_id')

trdf = (
    paragraphs_by_id
    .loc[trids.par_id.tolist(), ['keyword', 'text', 'label', 'orig_label']]
    .rename(columns={'keyword': 'community'})  # the keyword names the targeted community
    .rename_axis('par_id')
    .reset_index()
)

# Sanity check: exactly one output row per requested id.
assert len(trdf) == len(trids)

# Save the dataframe
trdf.to_csv('train_set.csv', index=False)

trdf

Unnamed: 0,par_id,community,text,label,orig_label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1,4
1,4136,homeless,Durban 's homeless communities reconciliation ...,1,2
2,10352,poor-families,The next immediate problem that cropped up was...,1,4
3,8279,vulnerable,Far more important than the implications for t...,1,2
4,1164,poor-families,To strengthen child-sensitive social protectio...,1,4
...,...,...,...,...,...
8370,8380,refugee,Rescue teams search for survivors on the rubbl...,0,0
8371,8381,hopeless,The launch of ' Happy Birthday ' took place la...,0,0
8372,8382,homeless,"The unrest has left at least 20,000 people dea...",0,0
8373,8383,hopeless,You have to see it from my perspective . I may...,0,0


# Rebuild test set (official dev split)

In [9]:
import random

# Fixed seed so the shuffled order is identical on every Restart & Run All.
SHUFFLE_SEED = 42

# Rebuild the held-out split: for every par_id listed in `teids`, fetch the
# matching paragraph (keyword, text, binary label, original label) from `data`.
#
# One indexed lookup replaces the previous per-id boolean scans of `data`
# (four full scans for each id), with the same semantics:
#   * rows come out in the order of `teids.par_id`;
#   * if a par_id occurs more than once in `data`, the first occurrence wins;
#   * a par_id that is missing from `data` raises (KeyError) rather than
#     being silently dropped.
paragraphs_by_id = data.drop_duplicates(subset='par_id').set_index('par_id')

tedf = (
    paragraphs_by_id
    .loc[teids.par_id.tolist(), ['keyword', 'text', 'label', 'orig_label']]
    .rename(columns={'keyword': 'community'})  # the keyword names the targeted community
    .rename_axis('par_id')
    .reset_index()
)

# Sanity check: exactly one output row per requested id.
assert len(tedf) == len(teids)

# Shuffle test set
# A dedicated seeded generator gives a reproducible permutation and does not
# consume or alter the global `random` state.
indices = tedf.index.tolist()
random.Random(SHUFFLE_SEED).shuffle(indices)
shuffled_tedf = tedf.loc[indices].reset_index(drop=True)

# NOTE(review): the CSV written below and the displayed frame are the
# *unshuffled* `tedf`; `shuffled_tedf` is not used in this cell. Confirm
# which of the two was meant to be saved.
# Save the dataframe
tedf.to_csv('dev_set.csv', index=False)

tedf

Unnamed: 0,par_id,community,text,label,orig_label
0,4046,hopeless,We also know that they can benefit by receivin...,1,3
1,1279,refugee,Pope Francis washed and kissed the feet of Mus...,1,4
2,8330,refugee,Many refugees do n't want to be resettled anyw...,1,2
3,4063,in-need,"""Budding chefs , like """" Fred """" , """" Winston ...",1,4
4,4089,homeless,"""In a 90-degree view of his constituency , one...",1,3
...,...,...,...,...,...
2089,10462,homeless,"The sad spectacle , which occurred on Saturday...",0,0
2090,10463,refugee,""""""" The Pakistani police came to our house and...",0,0
2091,10464,disabled,"""When Marie O'Donoghue went looking for a spec...",0,0
2092,10465,women,"""Sri Lankan norms and culture inhibit women fr...",0,1
