# Setup: seeding and loading the trained SetFit model

In [1]:
import random
import torch
import numpy as np
import os
def set_seed(seed=1234, deterministic=True):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all visible CUDA devices), and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed shared by all generators.
        deterministic: When True (default), request deterministic cuDNN
            kernels and disable the cuDNN auto-tuner so repeated runs select
            the same algorithms. Pass False to trade reproducibility for
            speed (the auto-tuner is then enabled).
    """
    random.seed(seed)
    # Only interpreters started after this point (e.g. worker subprocesses)
    # pick this up; the running process fixed its hash seed at startup.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not just the current one; this is a no-op when
    # CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Benchmark mode may pick different kernels from run to run, so it must be
    # off whenever determinism is requested.
    torch.backends.cudnn.deterministic = deterministic
    torch.backends.cudnn.benchmark = not deterministic


set_seed(2022)

In [2]:
from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer

# Restore the SetFit classifier saved under the local `../input` directory.
model = SetFitModel.from_pretrained(
    '../input/trained-setfix-sector/trained_setfix',
)

# Infer

In [17]:
import pandas as pd
import numpy as np
# Longest text (in characters) handed to the classifier; longer articles are truncated.
MAX_TEXT_CHARS = 10000

# Client <-> news links, restricted to the three columns used below.
client_news_df = pd.read_csv('/kaggle/input/dataton-2022/clientes_noticias.csv')[
    ['nit', 'news_url_absolute', 'news_id']
]
news_df = pd.read_csv('/kaggle/input/dataton-2022/noticias.csv')

# Attach the article fields to every client/news pair. Both frames carry
# `news_url_absolute`, so the merged frame holds `_x` / `_y` suffixed copies.
client_news_df = client_news_df.merge(news_df, on='news_id')

# Classifier input is "<title>. <body>". Missing titles/bodies become empty
# strings so a NaN can no longer crash the truncation step (slicing a float NaN
# inside a Python lambda raises TypeError); `.str.slice` is the vectorised
# equivalent of `x[:MAX_TEXT_CHARS]`.
client_news_df['text'] = (
    client_news_df['news_title'].fillna('')
    + '. '
    + client_news_df['news_text_content'].fillna('')
).str.slice(0, MAX_TEXT_CHARS)
client_news_df.shape

(74709, 9)

In [18]:
# Score every client/news text with the classifier. Hard class predictions are
# not requested here; they are derived from these scores further down.
news_texts = client_news_df['text'].values
preds_proba = model.predict_proba(news_texts)

Batches:   0%|          | 0/2335 [00:00<?, ?it/s]

NameError: name 'labels' is not defined

In [20]:
# Labelled news file, read to recover the group names.
# NOTE(review): `labels` follows the order of first appearance in this CSV and
# is later used positionally against the model output -- confirm this matches
# the label order used when the model was trained.
df = (
    pd.read_csv('../input/dataton-participacion-label/dataton2022-Latino_Asian_Brotherhood/code/matched_news_group.csv')
    .rename(columns={'group': 'label_text'})
)
df['text'] = df['news_title'] + '. ' + df['news_text_content']
labels = df['label_text'].unique()

In [21]:
# Fail fast when the label list does not line up with the model output: a
# shorter list would silently drop probability columns, a longer one would
# raise IndexError midway, after some columns had already been added.
n_classes = preds_proba.shape[1]
if len(labels) != n_classes:
    raise ValueError(
        f"labels has {len(labels)} entries but preds_proba has {n_classes} columns"
    )

# One probability column per group, named after the group label.
# NOTE(review): assumes `labels` is ordered like the model's classes -- confirm.
for class_idx, group_name in enumerate(labels):
    client_news_df[group_name] = preds_proba[:, class_idx]
client_news_df

Unnamed: 0,nit,news_url_absolute_x,news_id,news_url_absolute_y,news_init_date,news_final_date,news_title,news_text_content,text,preds,...,group_46,group_35,group_48,group_33,group_38,group_5,group_4,group_47,group_6,group_42
0,900378212,https://www.bluradio.com/economia/precio-dolar...,news10006,https://www.bluradio.com/economia/precio-dolar...,2022-07-30,2022-08-14,Precio dolar hoy: la cotizacion de la divisa a...,"Este martes, 2 de agosto, el dolar alcanzo un ...",Precio dolar hoy: la cotizacion de la divisa a...,27,...,0.005306,0.002634,0.003016,0.003350,0.003213,0.001485,0.004005,0.004109,0.005149,0.001707
1,900378212,https://www.semana.com/economia/macroeconomia/...,news10011,https://www.semana.com/economia/macroeconomia/...,2022-07-30,2022-08-14,Es cierto: El presidente Gustavo Petro quiere ...,No hay nada mas permanente que lo temporal y a...,Es cierto: El presidente Gustavo Petro quiere ...,5,...,0.008104,0.020259,0.007194,0.005707,0.004062,0.006475,0.014887,0.021432,0.010641,0.004992
2,860034313,https://www.semana.com/economia/macroeconomia/...,news10011,https://www.semana.com/economia/macroeconomia/...,2022-07-30,2022-08-14,Es cierto: El presidente Gustavo Petro quiere ...,No hay nada mas permanente que lo temporal y a...,Es cierto: El presidente Gustavo Petro quiere ...,5,...,0.008104,0.020259,0.007194,0.005707,0.004062,0.006475,0.014887,0.021432,0.010641,0.004992
3,900378212,https://elcomercio.pe/respuestas/que/gustavo-p...,news10015,https://elcomercio.pe/respuestas/que/gustavo-p...,2022-07-30,2022-08-14,Gustavo Petro: ?Que dice el informe final de s...,El equipo de empalme del gobierno del presiden...,Gustavo Petro: ?Que dice el informe final de s...,21,...,0.003066,0.001440,0.001580,0.002454,0.002170,0.000827,0.000921,0.001454,0.007176,0.000480
4,900166896,https://elcomercio.pe/respuestas/que/gustavo-p...,news10015,https://elcomercio.pe/respuestas/que/gustavo-p...,2022-07-30,2022-08-14,Gustavo Petro: ?Que dice el informe final de s...,El equipo de empalme del gobierno del presiden...,Gustavo Petro: ?Que dice el informe final de s...,21,...,0.003066,0.001440,0.001580,0.002454,0.002170,0.000827,0.000921,0.001454,0.007176,0.000480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74704,800230209,https://www.laopinion.com.co/economia/en-cucut...,news99997,https://www.laopinion.com.co/economia/en-cucut...,2022-07-15,2022-07-30,En Cucuta subio el desempleo en junio y fue de...,"Este viernes, el Departamento Administrativo N...",En Cucuta subio el desempleo en junio y fue de...,21,...,0.007174,0.012927,0.006034,0.008017,0.004098,0.006583,0.003958,0.010785,0.035789,0.003124
74705,890209174,https://www.laopinion.com.co/economia/en-cucut...,news99997,https://www.laopinion.com.co/economia/en-cucut...,2022-07-15,2022-07-30,En Cucuta subio el desempleo en junio y fue de...,"Este viernes, el Departamento Administrativo N...",En Cucuta subio el desempleo en junio y fue de...,21,...,0.007174,0.012927,0.006034,0.008017,0.004098,0.006583,0.003958,0.010785,0.035789,0.003124
74706,830069311,https://www.laopinion.com.co/economia/en-cucut...,news99997,https://www.laopinion.com.co/economia/en-cucut...,2022-07-15,2022-07-30,En Cucuta subio el desempleo en junio y fue de...,"Este viernes, el Departamento Administrativo N...",En Cucuta subio el desempleo en junio y fue de...,21,...,0.007174,0.012927,0.006034,0.008017,0.004098,0.006583,0.003958,0.010785,0.035789,0.003124
74707,830130106,https://www.laopinion.com.co/economia/en-cucut...,news99997,https://www.laopinion.com.co/economia/en-cucut...,2022-07-15,2022-07-30,En Cucuta subio el desempleo en junio y fue de...,"Este viernes, el Departamento Administrativo N...",En Cucuta subio el desempleo en junio y fue de...,21,...,0.007174,0.012927,0.006034,0.008017,0.004098,0.006583,0.003958,0.010785,0.035789,0.003124


In [22]:
# Position of the highest-scoring class for every row.
client_news_df['preds'] = np.argmax(preds_proba, axis=1)

# Class position -> group name lookup, following the order of `labels`.
labels_map_inverse = dict(enumerate(labels))
client_news_df['pred_group'] = client_news_df['preds'].replace(labels_map_inverse)

client_news_df.head()

Unnamed: 0,nit,news_url_absolute_x,news_id,news_url_absolute_y,news_init_date,news_final_date,news_title,news_text_content,text,preds,...,group_35,group_48,group_33,group_38,group_5,group_4,group_47,group_6,group_42,pred_group
0,900378212,https://www.bluradio.com/economia/precio-dolar...,news10006,https://www.bluradio.com/economia/precio-dolar...,2022-07-30,2022-08-14,Precio dolar hoy: la cotizacion de la divisa a...,"Este martes, 2 de agosto, el dolar alcanzo un ...",Precio dolar hoy: la cotizacion de la divisa a...,27,...,0.002634,0.003016,0.00335,0.003213,0.001485,0.004005,0.004109,0.005149,0.001707,group_21
1,900378212,https://www.semana.com/economia/macroeconomia/...,news10011,https://www.semana.com/economia/macroeconomia/...,2022-07-30,2022-08-14,Es cierto: El presidente Gustavo Petro quiere ...,No hay nada mas permanente que lo temporal y a...,Es cierto: El presidente Gustavo Petro quiere ...,5,...,0.020259,0.007194,0.005707,0.004062,0.006475,0.014887,0.021432,0.010641,0.004992,group_25
2,860034313,https://www.semana.com/economia/macroeconomia/...,news10011,https://www.semana.com/economia/macroeconomia/...,2022-07-30,2022-08-14,Es cierto: El presidente Gustavo Petro quiere ...,No hay nada mas permanente que lo temporal y a...,Es cierto: El presidente Gustavo Petro quiere ...,5,...,0.020259,0.007194,0.005707,0.004062,0.006475,0.014887,0.021432,0.010641,0.004992,group_25
3,900378212,https://elcomercio.pe/respuestas/que/gustavo-p...,news10015,https://elcomercio.pe/respuestas/que/gustavo-p...,2022-07-30,2022-08-14,Gustavo Petro: ?Que dice el informe final de s...,El equipo de empalme del gobierno del presiden...,Gustavo Petro: ?Que dice el informe final de s...,21,...,0.00144,0.00158,0.002454,0.00217,0.000827,0.000921,0.001454,0.007176,0.00048,group_36
4,900166896,https://elcomercio.pe/respuestas/que/gustavo-p...,news10015,https://elcomercio.pe/respuestas/que/gustavo-p...,2022-07-30,2022-08-14,Gustavo Petro: ?Que dice el informe final de s...,El equipo de empalme del gobierno del presiden...,Gustavo Petro: ?Que dice el informe final de s...,21,...,0.00144,0.00158,0.002454,0.00217,0.000827,0.000921,0.001454,0.007176,0.00048,group_36


In [23]:
# Write the scored frame (all columns, without the index) to the working directory.
client_news_df.to_csv(path_or_buf='pred_news_group.csv', index=False)