In [1]:
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, BertConfig

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
custom_config = BertConfig(
    vocab_size=30522,               
    num_attention_heads=12,        
    num_hidden_layers=12,          
    attention_probs_dropout_prob=0.1,  
    hidden_size=768,               
    intermediate_size=3072,        
    hidden_dropout_prob=0.1,       
    hidden_act="relu",             
    max_position_embeddings=512    
)
bert_model = TFBertModel(custom_config)

data = pd.read_csv("C:\\Users\\lclai\\Desktop\\transcripts_cleaned.csv")
data = data[["label", "clean_transcripts"]]

data = data[(data['label'] == 0) | (data['label'] == 1)]

def preprocess_and_get_bert_embeddings(text, tokenizer, model, max_length=512):
    inputs = tokenizer(
        text,
        return_tensors="tf",
        padding='max_length',  
        truncation=True,
        max_length=max_length
    )
    
    bert_outputs = model(**inputs)
    return bert_outputs.last_hidden_state, bert_outputs.pooler_output 

input_ids = []
attention_masks = []
last_hidden_states = []
pooler_outputs = []

for text in data['clean_transcripts']:
    last_hidden_state, pooler_output = preprocess_and_get_bert_embeddings(text, tokenizer, bert_model)
    last_hidden_states.append(last_hidden_state.numpy())  
    pooler_outputs.append(pooler_output.numpy())

last_hidden_states_tensor = tf.convert_to_tensor(last_hidden_states, dtype=tf.float32)
pooler_outputs_tensor = tf.convert_to_tensor(pooler_outputs, dtype=tf.float32)

print(f"Last Hidden States Tensor Shape: {last_hidden_states_tensor.shape}")
print(f"Pooler Outputs Tensor Shape: {pooler_outputs_tensor.shape}")


  from .autonotebook import tqdm as notebook_tqdm


Last Hidden States Tensor Shape: (496, 1, 512, 768)
Pooler Outputs Tensor Shape: (496, 1, 768)


In [2]:
class TextCNN(tf.keras.Model):
    def __init__(self, dropout_prob=0.5):
        super(TextCNN, self).__init__()
        
        # convolution kernels
        self.conv1 = tf.keras.layers.Conv2D(130, (5, 768), activation='relu')
        self.conv2 = tf.keras.layers.Conv2D(130, (10, 768), activation='relu')
        self.conv3 = tf.keras.layers.Conv2D(130, (15, 768), activation='relu')
        self.conv4 = tf.keras.layers.Conv2D(130, (20, 768), activation='relu')

        # max pooling
        self.pool = tf.keras.layers.GlobalMaxPooling2D()

        # fusion layer
        self.fc = tf.keras.layers.Dense(260, activation='relu')
        self.dropout = tf.keras.layers.Dropout(dropout_prob)

    def call(self, inputs):
        # input the output of the bert model
        x = tf.expand_dims(inputs, -1) 
    
        x1 = self.conv1(x)
        x2 = self.conv2(x)
        x3 = self.conv3(x)
        x4 = self.conv4(x)
        
        # pool the outputs of the convolution layers 
        pooled_1 = self.pool(x1)
        pooled_2 = self.pool(x2)
        pooled_3 = self.pool(x3)
        pooled_4 = self.pool(x4)
        
        # fusion of all the features
        fused_features = tf.concat([pooled_1, pooled_2, pooled_3, pooled_4], axis=-1)

        feature_vector = self.fc(fused_features)
        feature_vector = self.dropout(feature_vector)
        
        return feature_vector

textcnn_model = TextCNN()

In [3]:
import numpy as np
from sklearn.model_selection import train_test_split

X = last_hidden_states_tensor[:, 0, :].numpy()
y = data['label'].values

def obtain_features(sample):
    sample_with_batch_dim = tf.expand_dims(sample, axis=0)  # (1, 512, 768)
    sample_cnn = textcnn_model(sample_with_batch_dim)  # (1, features_cnn)
    return sample_cnn.numpy().squeeze()  

In [4]:
import numpy as np

X_features = []

for i in range(X.shape[0]):  
    X_features.append(obtain_features(X[i])) 

X_def = np.array(X_features)

In [5]:
X_def.shape

(496, 260)

In [6]:
data.shape

(496, 2)

In [7]:
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("C:\\Users\\lclai\\Desktop\\transcripts_cleaned.csv")
df = df[(df['label'] == 0) | (df['label'] == 1)]
df = df[["sex","age","label"]]
df["sex"] = df["sex"].map({"M": 0, "F": 1})  
X_numeric = df[["age", "sex"]].values  

X = np.hstack((X_numeric, X_def))
y = LabelEncoder().fit_transform(df["label"])  

In [18]:
p = pd.DataFrame(X)

In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
import numpy as np

clf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [500, 1000],  
    'max_depth': [10, 30, 50],  
    'min_samples_split': [2, 5, 10],  
    'min_samples_leaf': [1, 5, 8],  
    'max_features': ['sqrt'],  
    'max_samples': [0.8, 0.9], 
   
}

grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(p, y)

best_clf = grid_search.best_estimator_
y_pred_proba = best_clf.predict_proba(X)[:, 1]
auc_score = roc_auc_score(y, y_pred_proba)

print(f"Best parameters: {grid_search.best_params_}")
print(f"cross-validation AUC: {grid_search.best_score_:.4f}")


Best parameters: {'max_depth': 30, 'max_features': 'sqrt', 'max_samples': 0.8, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}
Best cross-validation AUC: 0.7366


In [22]:
pip install xgboost

Collecting xgboost
  Using cached xgboost-2.1.3-py3-none-win_amd64.whl.metadata (2.1 kB)
Using cached xgboost-2.1.3-py3-none-win_amd64.whl (124.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.3
Note: you may need to restart the kernel to use updated packages.


In [23]:
# XGBOOST

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

# Definir el modelo XGBoost
clf = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)

# Definir el espacio de búsqueda de hiperparámetros
param_grid = {
    'n_estimators': [300, 500],  # Número de árboles
    'max_depth': [3, 5],  # Profundidad máxima del árbol
    'learning_rate': [0.01, 0.1, 0.3],  # Tasa de aprendizaje
    'subsample': [0.8, 1.0],  # Submuestreo de los datos
    'gamma': [0.1, 0.3]  # Poda para reducir sobreajuste
}

# Ajuste de hiperparámetros con validación cruzada
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X, y)

# Obtener el mejor modelo
best_clf = grid_search.best_estimator_

# Hacer predicciones con validación cruzada
y_pred = cross_val_predict(best_clf, X, y, cv=5)

# Calcular matriz de confusión
conf_matrix = confusion_matrix(y, y_pred)

print(conf_matrix)

# Reporte de clasificación
print(classification_report(y, y_pred))

# Mostrar los mejores hiperparámetros encontrados
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation AUC: {grid_search.best_score_:.4f}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[[169  74]
 [ 84 169]]
              precision    recall  f1-score   support

           0       0.67      0.70      0.68       243
           1       0.70      0.67      0.68       253

    accuracy                           0.68       496
   macro avg       0.68      0.68      0.68       496
weighted avg       0.68      0.68      0.68       496

Best parameters: {'gamma': 0.1, 'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 500, 'subsample': 0.8}
Best cross-validation AUC: 0.7210
