# Train

In [38]:
import random
import torch
import numpy as np
import os
def set_seed(seed=1234):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = True
set_seed(2022)

In [1]:
import pandas as pd
df = pd.read_csv('../input/dataton-participacion-label/dataton2022-Latino_Asian_Brotherhood/code/matched_news_group.csv')
df.rename(columns={'group':'label_text'}, inplace=True)
df['text'] = df['news_title'] + '. ' + df['news_text_content']
labels = df.label_text.unique()

labels_map = {}
for i, l in enumerate(labels):
    labels_map[l]=i

df['label'] = df.label_text.replace(labels_map)

df = df.groupby('label_text').apply(lambda x: x.sample(16,replace=False) if x.news_id.count()>=16 else x).reset_index(drop=True)
df.to_csv('AnnotatedFewShotDataset.csv', index=False)

In [2]:
from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer
dataset = load_dataset("csv", data_files="AnnotatedFewShotDataset.csv")
train_ds = dataset["train"]

# Load SetFit model from Hub
model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

# Create trainer
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_ds,
#     eval_dataset=test_ds,
    loss_class=CosineSimilarityLoss,
    batch_size=8,
    num_iterations=20, # Number of text pairs to generate for contrastive learning
    num_epochs=1 # Number of epochs to use for contrastive learning
)
# Train and evaluate!
trainer.train()
# metrics = trainer.evaluate()

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-6474caa8cee2db40/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-6474caa8cee2db40/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/402 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 18560
  Num epochs = 1
  Total optimization steps = 2320
  Total train batch size = 8


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2320 [00:00<?, ?it/s]

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

## Evaluate the models

In [6]:
# On testing data
test_df = pd.read_csv('../input/dataton-participacion-label/dataton2022-Latino_Asian_Brotherhood/code/matched_news_group.csv')
test_df.rename(columns={'group':'label_text'}, inplace=True)
test_df['text'] = test_df['news_title'] + '. ' + test_df['news_text_content']
labels = test_df.label_text.unique()

labels_map = {}
for i, l in enumerate(labels):
    labels_map[l]=i

test_df['label'] = test_df.label_text.replace(labels_map)
test_df = test_df[~test_df.news_id.isin(df.news_id)]

preds = model(test_df.text.values)
test_df['preds'] = preds
preds_proba = model.predict_proba(test_df.text.values)
for i, l in enumerate(labels):
    test_df[l] = preds_proba[:,i]
test_df

Batches:   0%|          | 0/78 [00:00<?, ?it/s]

Batches:   0%|          | 0/78 [00:00<?, ?it/s]

Unnamed: 0,nit,news_url_absolute_x,news_id,news_url_absolute_y,news_init_date,news_final_date,news_title,news_text_content,nombre,desc_ciiu_division,...,desc_ciiu_division_eng,trimmed_name,appearance_in_title,appearance_in_body,name_in_title,name_in_body,count,text,label,preds
0,860034313,https://www.semana.com/mejor-colombia/articulo...,news10333,https://www.semana.com/mejor-colombia/articulo...,2022-07-30,2022-08-14,Un seguro contra el cambio climatico. Asi se b...,Uno de los grandes desafios del sector agricol...,BANCO DAVIVIENDA SA,"ACTIVIDADES DE SERVICIOS FINANCIEROS, EXCEPTO ...",...,"FINANCIAL SERVICES ACTIVITIES, EXCEPT INSURANC...",BANCO DAVIVIENDA,[],"['ilidad a los colombianos "", afirma Alvaro Ca...",False,True,1,Un seguro contra el cambio climatico. Asi se b...,0,15
1,860034313,https://www.semana.com/economia/capsulas/artic...,news10341,https://www.semana.com/economia/capsulas/artic...,2022-07-30,2022-08-14,Banco Davivienda aumento su inversion indirect...,Copyright (c) 2022 Publicaciones Semana S.A NI...,BANCO DAVIVIENDA SA,"ACTIVIDADES DE SERVICIOS FINANCIEROS, EXCEPTO ...",...,"FINANCIAL SERVICES ACTIVITIES, EXCEPT INSURANC...",BANCO DAVIVIENDA,[' Banco Davivienda aumento su inversion in...,[],True,False,1,Banco Davivienda aumento su inversion indirect...,0,22
2,860034313,https://www.iproup.com/finanzas/32808-rappi-pe...,news10348,https://www.iproup.com/finanzas/32808-rappi-pe...,2022-07-15,2022-07-30,"Rappi, PedidosYa: que servicios financieros of...","La competencia por convertirse en la nueva ""su...",BANCO DAVIVIENDA SA,"ACTIVIDADES DE SERVICIOS FINANCIEROS, EXCEPTO ...",...,"FINANCIAL SERVICES ACTIVITIES, EXCEPT INSURANC...",BANCO DAVIVIENDA,[],['esta disponible en los siguientes mercados :...,False,True,1,"Rappi, PedidosYa: que servicios financieros of...",0,3
3,860034313,https://www.laopinion.com.co/economia/los-banc...,news10351,https://www.laopinion.com.co/economia/los-banc...,2022-07-15,2022-07-30,Los bancos que le permiten abrir cuentas en do...,En medio de la coyuntura por el alza del dolar...,BANCO DAVIVIENDA SA,"ACTIVIDADES DE SERVICIOS FINANCIEROS, EXCEPTO ...",...,"FINANCIAL SERVICES ACTIVITIES, EXCEPT INSURANC...",BANCO DAVIVIENDA,[],['l exterior para hacerlo. Algunas de las enti...,False,True,1,Los bancos que le permiten abrir cuentas en do...,0,0
4,860034313,https://www.elheraldo.co/judicial/policia-frus...,news10354,https://www.elheraldo.co/judicial/policia-frus...,2022-07-15,2022-07-30,Policia frustra 'taquillazo' en entidad bancar...,Dos delincuentes fueron capturados por la Poli...,BANCO DAVIVIENDA SA,"ACTIVIDADES DE SERVICIOS FINANCIEROS, EXCEPTO ...",...,"FINANCIAL SERVICES ACTIVITIES, EXCEPT INSURANC...",BANCO DAVIVIENDA,[],"[""turados por la Policia, en una rapida reacci...",False,True,1,Policia frustra 'taquillazo' en entidad bancar...,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,813011577,https://www.alertatolima.com/noticias/huila/jo...,news37126,https://www.alertatolima.com/noticias/huila/jo...,2022-07-15,2022-07-30,Joven murio al tratar de esquivar un hueco en ...,El siniestro ocurrio en el sector de la carrer...,CLINICA UROS S.A.S,ACTIVIDADES DE ATENCION DE LA SALUD HUMANA,...,HUMAN HEALTH CARE ACTIVITIES,CLINICA UROS,[],['aramedicos que le prestaron los primeros aux...,False,True,1,Joven murio al tratar de esquivar un hueco en ...,20,18
2934,890212568,https://qhubobucaramanga.com/deportes/81537-de...,news25578,https://qhubobucaramanga.com/deportes/81537-de...,2022-07-30,2022-08-14,Desde este sabado comienza la preparacion para...,Desde este sabado se podra realizar la Carrera...,FUNDACION CARDIOVASCULAR DE COLOMBIA FCV,ACTIVIDADES DE ATENCION DE LA SALUD HUMANA,...,HUMAN HEALTH CARE ACTIVITIES,FUNDACION CARDIOVASCULAR COLOMBIA FCV,[],['Desde este sabado se podra realizar la Carre...,False,True,1,Desde este sabado comienza la preparacion para...,20,13
2935,890212568,https://abyayala.tv.bo/una-doctora-colombiana-...,news25593,https://abyayala.tv.bo/una-doctora-colombiana-...,2022-07-30,2022-08-14,Una doctora colombiana es la primera mujer en ...,Una doctora colombiana se convirtio en la prim...,FUNDACION CARDIOVASCULAR DE COLOMBIA FCV,ACTIVIDADES DE ATENCION DE LA SALUD HUMANA,...,HUMAN HEALTH CARE ACTIVITIES,FUNDACION CARDIOVASCULAR COLOMBIA FCV,[],"['Mendoza Crespo, quien se desempena como jefa...",False,True,1,Una doctora colombiana es la primera mujer en ...,20,13
2937,890212568,https://consultorsalud.com/fcv-certificacion-e...,news25630,https://consultorsalud.com/fcv-certificacion-e...,2022-07-15,2022-07-30,FCV recibe certificacion EMRAM 7 en salud digital,"En dias anteriores, durante el Himss Executive...",FUNDACION CARDIOVASCULAR DE COLOMBIA FCV,ACTIVIDADES DE ATENCION DE LA SALUD HUMANA,...,HUMAN HEALTH CARE ACTIVITIES,FUNDACION CARDIOVASCULAR COLOMBIA FCV,[],"['En dias anteriores, durante el Himss Executi...",False,True,1,FCV recibe certificacion EMRAM 7 en salud digi...,20,13


array([ 0,  0,  0, ...,  0, 20, 11])

In [25]:
np.mean(preds[np.max(preds_proba, 1) > 0.7] == test_df.label.values[np.max(preds_proba, 1) > 0.7])

0.6630541871921182

In [16]:
#Accuracy test
import numpy as np
np.mean(test_df.preds == test_df.label)

0.4124293785310734

In [32]:
#Accuracy with threshold
thr = 0.7
np.mean(preds[np.max(preds_proba, 1) > thr] == test_df.label.values[np.max(preds_proba, 1) > thr])

0.6630541871921182

In [36]:
np.sum(np.max(preds_proba, 1) > 0.7)

1015

# Infer

In [49]:
import pandas as pd
client_news_df = pd.read_csv('/kaggle/input/dataton-2022/clientes_noticias.csv')[['nit', 'news_url_absolute','news_id']]
news_df = pd.read_csv('/kaggle/input/dataton-2022/noticias.csv')


client_news_df = client_news_df.merge(news_df, on='news_id')
client_news_df['text'] = client_news_df.news_title + '. ' + client_news_df.news_text_content

In [51]:
client_news_df.shape

(74709, 9)

In [53]:
!mkdir -p /kaggle/working/trained_setfix

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [54]:
model.save_pretrained('/kaggle/working/trained_setfix')

In [44]:
# preds = model(client_news_df.text.values)
# client_news_df['preds'] = preds
preds_proba = model.predict_proba(client_news_df.text.values)
for i, l in enumerate(labels):
    client_news_df[l] = preds_proba[:,i]
client_news_df

Batches:   0%|          | 0/2335 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
labels_map_inverse = {}
for i, l in enumerate(labels):
    labels_map_inverse[i]=l
news_df['pred_group'] = news_df['preds'].replace(labels_map_inverse)

news_df.head()

In [None]:
news_df.to_csv('pred_des_ciiu_division.csv', index=False)