In [1]:
! pip install emoji deep-translator



In [32]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, AutoConfig
from Preprocessing import Preprocessing
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import torch

In [33]:
# Check if GPU is available
print("Is CUDA available?", torch.cuda.is_available())
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")

Is CUDA available? True
Device name: Tesla T4


In [34]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [35]:
# Ruta al dataset
path_df = './dataset_second_classifier.csv'

df = pd.read_csv(path_df, sep = ',', header = 0)
df.head(10)

Unnamed: 0,age,bio,job,school,anthem_title,anthem_artist,gender,descriptor_text,interest_text,total_photos,desition,longitud,latitud,hash_key,descriptor,interest,city_hashed
0,23.0,Do I want it or do I want techno?|The one with...,['NotSpecified'],,Mood Swings (feat. Lil Tjay),Pop Smoke; Lil Tjay,Woman,|Pisces|Better in person|Beer,,15,2,-3.684574,40.415211,-3.38878e+17,"['Pisces', 'Better in person', 'Beer']",[],500a2b51765bf57441a2a5ba95621dcccc92722c51d321...
1,27.0,with something to do tomorrow.|,['NotSpecified'],,,,Woman,,,14,2,-3.684574,40.415211,6.305302e+18,[],[],4f53cda18c2baa0c0354bb5f9a3ecbe5ed12ab4d8e11ba...
2,20.0,a few joints and a beer or are we going to a r...,['ketalal'],,Visiones de la Muerte,Fertil Miseria,Woman,|Scorpio|Pet-free|Smoker|Beer,|Coffee|Writer|Craft Beer|Wine|Extrovert,8,2,-3.684574,40.415211,-6.136008e+18,"['Scorpio', 'Pet-free', 'Smoker', 'Beer']","['Coffee', 'Writer', 'Craft Beer', 'Wine', 'Ex...",4f53cda18c2baa0c0354bb5f9a3ecbe5ed12ab4d8e11ba...
3,22.0,nice but clumsy.|funny but stubborn.|clown but...,['NotSpecified'],,,,Woman,|Taurus|Beer,|Music|Camping|Travel|Grab a drink,8,2,-3.684574,40.415211,5.360212e+18,"['Taurus', 'Beer']","['Music', 'Camping', 'Travel', 'Grab a drink']",500a2b51765bf57441a2a5ba95621dcccc92722c51d321...
4,26.0,something different!,['NotSpecified'],Uem,,,Woman,,|Fashion|Festivals|Travel|Esports,1,2,-3.684574,40.415211,-4.665239e+17,[],"['Fashion', 'Festivals', 'Travel', 'Esports']",4f53cda18c2baa0c0354bb5f9a3ecbe5ed12ab4d8e11ba...
5,21.0,||:woman_police_officer_light_skin_tone::blue_...,['NotSpecified'],,,,Woman,|Socially active|Virgo|Wine|Dog|Smoker,|Wine|Foodie|Dog lover,15,2,-3.684574,40.415211,5.196545e+18,"['Socially active', 'Virgo', 'Wine', 'Dog', 'S...","['Wine', 'Foodie', 'Dog lover']",4f53cda18c2baa0c0354bb5f9a3ecbe5ed12ab4d8e11ba...
6,18.0,"If you are a fascist, go back to where you cam...",['NotSpecified'],,My Own Summer (Shove It),Deftones,Woman,|Aquarius|B time texter|Occasionally|All the d...,|Festivals|Tattoos|Spirituality|Activism|Music,11,2,-3.684574,40.415211,3.261688e+18,"['Aquarius', 'B time texter', 'Occasionally', ...","['Festivals', 'Tattoos', 'Spirituality', 'Acti...",500a2b51765bf57441a2a5ba95621dcccc92722c51d321...
7,18.0,::anatomical_heart:|:Venezuela:,['NotSpecified'],,,,Woman,|Virgo|Gym rat,|Festivals|Instagram|Travel|World Traveler|Sho...,5,2,-3.684574,40.415211,4.220185e+18,"['Virgo', 'Gym rat']","['Festivals', 'Instagram', 'Travel', 'World Tr...",500a2b51765bf57441a2a5ba95621dcccc92722c51d321...
8,34.0,trans girl :transgender_flag: |if you like it;...,['NotSpecified'],,Luces de Neón,Varry Brava,Woman,|Capricorn|Better in person|Occasionally|Non-s...,|Reading|Travel|Movies|Disney|Netflix,8,2,-3.684574,40.415211,7.696506e+18,"['Capricorn', 'Better in person', 'Occasionall...","['Reading', 'Travel', 'Movies', 'Disney', 'Net...",4f53cda18c2baa0c0354bb5f9a3ecbe5ed12ab4d8e11ba...
9,23.0,"fun girl; sincere, passionate about music | co...",['cocinera at restaurante'],Universidad Autónoma de Madrid,The Nhts,Avicii,Woman,|Aries|B time texter|Occasionally|Non-smoker|S...,|Foodie|Photography|Musician|Tattoos|Cooking,11,2,-3.684574,40.415211,-7.564959e+18,"['Aries', 'B time texter', 'Occasionally', 'No...","['Foodie', 'Photography', 'Musician', 'Tattoos...",500a2b51765bf57441a2a5ba95621dcccc92722c51d321...


In [36]:
# Crear la columna 'gender-specified'
df['gender-specified'] = df['gender'].apply(lambda x: 1 if x == 'NotSpecified' else 0)

In [37]:
preprocessing = Preprocessing()
df['bio'] = df['bio'].fillna('').apply(preprocessing.remove_vertical_bars)
df['interest_text'] = df['interest_text'].fillna('').apply(preprocessing.remove_vertical_bars).apply(preprocessing.remove_numbers)

In [38]:
df.head(10)

Unnamed: 0,age,bio,job,school,anthem_title,anthem_artist,gender,descriptor_text,interest_text,total_photos,desition,longitud,latitud,hash_key,descriptor,interest,city_hashed,gender-specified
0,23.0,Do I want it or do I want techno? The one with...,['NotSpecified'],,Mood Swings (feat. Lil Tjay),Pop Smoke; Lil Tjay,Woman,|Pisces|Better in person|Beer,,15,2,-3.684574,40.415211,-3.38878e+17,"['Pisces', 'Better in person', 'Beer']",[],500a2b51765bf57441a2a5ba95621dcccc92722c51d321...,0
1,27.0,with something to do tomorrow.,['NotSpecified'],,,,Woman,,,14,2,-3.684574,40.415211,6.305302e+18,[],[],4f53cda18c2baa0c0354bb5f9a3ecbe5ed12ab4d8e11ba...,0
2,20.0,a few joints and a beer or are we going to a r...,['ketalal'],,Visiones de la Muerte,Fertil Miseria,Woman,|Scorpio|Pet-free|Smoker|Beer,Coffee Writer Craft Beer Wine Extrovert,8,2,-3.684574,40.415211,-6.136008e+18,"['Scorpio', 'Pet-free', 'Smoker', 'Beer']","['Coffee', 'Writer', 'Craft Beer', 'Wine', 'Ex...",4f53cda18c2baa0c0354bb5f9a3ecbe5ed12ab4d8e11ba...,0
3,22.0,nice but clumsy. funny but stubborn. clown but...,['NotSpecified'],,,,Woman,|Taurus|Beer,Music Camping Travel Grab a drink,8,2,-3.684574,40.415211,5.360212e+18,"['Taurus', 'Beer']","['Music', 'Camping', 'Travel', 'Grab a drink']",500a2b51765bf57441a2a5ba95621dcccc92722c51d321...,0
4,26.0,something different!,['NotSpecified'],Uem,,,Woman,,Fashion Festivals Travel Esports,1,2,-3.684574,40.415211,-4.665239e+17,[],"['Fashion', 'Festivals', 'Travel', 'Esports']",4f53cda18c2baa0c0354bb5f9a3ecbe5ed12ab4d8e11ba...,0
5,21.0,:woman_police_officer_light_skin_tone::blue_...,['NotSpecified'],,,,Woman,|Socially active|Virgo|Wine|Dog|Smoker,Wine Foodie Dog lover,15,2,-3.684574,40.415211,5.196545e+18,"['Socially active', 'Virgo', 'Wine', 'Dog', 'S...","['Wine', 'Foodie', 'Dog lover']",4f53cda18c2baa0c0354bb5f9a3ecbe5ed12ab4d8e11ba...,0
6,18.0,"If you are a fascist, go back to where you cam...",['NotSpecified'],,My Own Summer (Shove It),Deftones,Woman,|Aquarius|B time texter|Occasionally|All the d...,Festivals Tattoos Spirituality Activism Music,11,2,-3.684574,40.415211,3.261688e+18,"['Aquarius', 'B time texter', 'Occasionally', ...","['Festivals', 'Tattoos', 'Spirituality', 'Acti...",500a2b51765bf57441a2a5ba95621dcccc92722c51d321...,0
7,18.0,::anatomical_heart: :Venezuela:,['NotSpecified'],,,,Woman,|Virgo|Gym rat,Festivals Instagram Travel World Traveler Sho...,5,2,-3.684574,40.415211,4.220185e+18,"['Virgo', 'Gym rat']","['Festivals', 'Instagram', 'Travel', 'World Tr...",500a2b51765bf57441a2a5ba95621dcccc92722c51d321...,0
8,34.0,trans girl :transgender_flag: if you like it;...,['NotSpecified'],,Luces de Neón,Varry Brava,Woman,|Capricorn|Better in person|Occasionally|Non-s...,Reading Travel Movies Disney Netflix,8,2,-3.684574,40.415211,7.696506e+18,"['Capricorn', 'Better in person', 'Occasionall...","['Reading', 'Travel', 'Movies', 'Disney', 'Net...",4f53cda18c2baa0c0354bb5f9a3ecbe5ed12ab4d8e11ba...,0
9,23.0,"fun girl; sincere, passionate about music co...",['cocinera at restaurante'],Universidad Autónoma de Madrid,The Nhts,Avicii,Woman,|Aries|B time texter|Occasionally|Non-smoker|S...,Foodie Photography Musician Tattoos Cooking,11,2,-3.684574,40.415211,-7.564959e+18,"['Aries', 'B time texter', 'Occasionally', 'No...","['Foodie', 'Photography', 'Musician', 'Tattoos...",500a2b51765bf57441a2a5ba95621dcccc92722c51d321...,0


In [39]:
# Entrada y etiquetas
X = df['bio'] + " " + df['interest_text']
y = df['gender-specified']

In [40]:
# Dividir en conjuntos de entrenamiento y prueba
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

In [41]:


# Cargar el tokenizer y el modelo
configuration = AutoConfig.from_pretrained('distilbert-base-uncased')
configuration.hidden_dropout_prob = 0.5
configuration.attention_probs_dropout_prob = 0.5
configuration.num_labels = 2

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", config = configuration)

# Tokenización
def tokenize_function(examples):
    return tokenizer(examples, padding="max_length", truncation=True, max_length=512)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
# Check if GPU is available and move the model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move the model to the GPU

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [43]:
# Define a custom dataset class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Return the input ids, attention mask, and labels for a specific index
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [44]:
def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)  # Get the predicted class labels
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')  # Binary classification
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [45]:
# Tokenizar los conjuntos
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(list(X_val), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=512)

# Create datasets
train_dataset = CustomDataset(train_encodings, y_train.values)
val_dataset = CustomDataset(val_encodings, y_val.values)
test_dataset = CustomDataset(test_encodings, y_test.values)

In [46]:
import accelerate

# Configuración de entrenamiento
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,           # Lower learning rate for smoother convergence
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,          # Increased number of epochs
    weight_decay=0.01,
    logging_dir='./logs',         # To save logs for later analysis
    logging_steps=50,             # Log every 50 steps to track training loss more frequently
    save_steps=500,               # Save checkpoints every 500 steps
    load_best_model_at_end=True,  # Load the best model after training
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Entrenar el modelo
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4673,0.426704,0.848485,0.835,0.860825,0.847716
2,0.3956,0.395246,0.848485,0.831683,0.865979,0.848485
3,0.329,0.371059,0.863636,0.839806,0.891753,0.865
4,0.28,0.397977,0.866162,0.837321,0.902062,0.868486
5,0.2286,0.423424,0.838384,0.828283,0.845361,0.836735
6,0.2103,0.490208,0.818182,0.835165,0.783505,0.808511
7,0.1843,0.541459,0.825758,0.810945,0.840206,0.825316
8,0.1757,0.604143,0.810606,0.821622,0.783505,0.802111
9,0.1228,0.632281,0.818182,0.805,0.829897,0.817259
10,0.1291,0.64919,0.813131,0.8125,0.804124,0.80829


TrainOutput(global_step=1980, training_loss=0.25619987694904056, metrics={'train_runtime': 934.2366, 'train_samples_per_second': 33.867, 'train_steps_per_second': 2.119, 'total_flos': 2365774598856480.0, 'train_loss': 0.25619987694904056, 'epoch': 10.0})

In [47]:
# Evaluar el modelo
results = trainer.evaluate()
print(results)

{'eval_loss': 0.3710591495037079, 'eval_accuracy': 0.8636363636363636, 'eval_precision': 0.8398058252427184, 'eval_recall': 0.8917525773195877, 'eval_f1': 0.865, 'eval_runtime': 2.3325, 'eval_samples_per_second': 169.775, 'eval_steps_per_second': 10.718, 'epoch': 10.0}


In [48]:
# Predict on the validation set
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions.argmax(axis=-1)

# Print classification report
print(classification_report(y_test, predicted_labels))

              precision    recall  f1-score   support

           0       0.92      0.81      0.86       202
           1       0.82      0.93      0.87       194

    accuracy                           0.87       396
   macro avg       0.87      0.87      0.87       396
weighted avg       0.87      0.87      0.87       396

