In [None]:
import random
import re
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
import torch
from catboost import CatBoostRegressor
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_predict, GridSearchCV

# Fetch the NLTK resources (tokenizer models, stop-word list, WordNet data)
# that the text-cleaning step relies on.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# One seed shared by Python's and NumPy's global RNGs; the CatBoost models
# and K-fold splitters defined further down reuse the same value.
seed = 42
random.seed(seed)
np.random.seed(seed)

In [None]:
# Column roles: model inputs and the regression targets predicted jointly.
FEATURES = ["transcription", "gender"]
TARGETS = ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'interview', 'openness']

# Raw table, read from a CSV expected in the notebook's working directory.
data = pd.read_csv("transcription_gender.csv")

In [None]:
# Number of missing values per column, inspected before any rows are dropped.
data.isna().sum()

In [None]:
# Keep only rows with no missing value in any column, then renumber the index
# from 0 so it stays contiguous for the feature frames built later.
data = (
    data
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)

In [None]:
# Summary statistics of the table after incomplete rows were dropped
# (pandas describes only the numeric columns by default when any exist).
data.describe()

In [None]:
@lru_cache(maxsize=1)
def _english_stop_words():
    """Load NLTK's English stop-word list once and keep it as a frozenset."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _shared_lemmatizer():
    """Build the WordNet lemmatizer once and reuse it for every document."""
    return WordNetLemmatizer()


def clean_text(text):
    """Normalise one transcription into a space-joined string of lemmas.

    Steps, in order:
      1. drop every character that is not an ASCII letter or whitespace
         (digits and punctuation, apostrophes included, are removed, so a
         contraction such as "don't" becomes "dont");
      2. lower-case and tokenise with ``word_tokenize``;
      3. discard English stop words;
      4. lemmatise each remaining token with ``WordNetLemmatizer``.

    The stop-word set and the lemmatizer are created on first use and cached,
    so applying this function to a whole column no longer reloads the
    stop-word corpus and rebuilds the lemmatizer for every row.

    Args:
        text: the raw transcription as a ``str``.

    Returns:
        The cleaned tokens joined by single spaces ("" when nothing is left).
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    stop_words = _english_stop_words()
    lemmatizer = _shared_lemmatizer()
    kept_tokens = (
        word for word in word_tokenize(letters_only.lower())
        if word not in stop_words
    )
    return ' '.join(lemmatizer.lemmatize(word) for word in kept_tokens)

# Replace each raw transcription with its cleaned, lemmatised form.
data['transcription'] = data['transcription'].map(clean_text)

In [None]:
# Preview only the first rows of the cleaned table instead of rendering the
# whole frame; this keeps the saved notebook output small.
data.head()

In [None]:
# TF-IDF features (vocabulary capped at 10,000 terms) with the gender column
# appended as the last feature column of a dense array.
# NOTE(review): the vectorizer is fitted on every row before cross-validation,
# so its vocabulary/IDF statistics also see the validation folds.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['transcription'])
X_tfidf = np.hstack([tfidf_matrix.toarray(), data[['gender']].values])

In [None]:
# Pick the device for the embedding model. The second GPU ('cuda:1') is kept
# as the preferred choice, but the notebook no longer crashes on machines that
# have a single GPU or none: it falls back to 'cuda:0', then to the CPU.
if torch.cuda.device_count() > 1:
    EMBEDDING_DEVICE = 'cuda:1'
elif torch.cuda.is_available():
    EMBEDDING_DEVICE = 'cuda:0'
else:
    EMBEDDING_DEVICE = 'cpu'

# Sentence-embedding model used to build the dense text features.
hf_embedder = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-large",
    model_kwargs={'device': EMBEDDING_DEVICE}
)

In [None]:
# The e5 model family is trained with an instruction prefix on every input;
# its model card recommends "query: " when the embeddings are used as features
# for a downstream model, and warns of degraded quality without it.
# NOTE(review): data['transcription'] has already been stop-word-filtered and
# lemmatised at this point; confirm that is the text you want to embed.
prefixed_texts = ["query: " + text for text in data['transcription']]

# Build the embedding frame on data's own index so the column-wise concat below
# lines rows up exactly, and give the columns string names so the final frame
# does not mix integer and string column labels.
embeddings = pd.DataFrame(hf_embedder.embed_documents(prefixed_texts), index=data.index)
embeddings.columns = [f"emb_{i}" for i in range(embeddings.shape[1])]
X_embeddings = pd.concat([embeddings, data[["gender"]]], axis=1)

In [None]:
# Target matrix: one column per entry of TARGETS, rows in the same order as data.
y = data[TARGETS].to_numpy()

In [None]:
# Hyper-parameter grid passed to tune_catboost by the tuning cells:
# 4 * 4 * 3 * 4 = 192 combinations, each one cross-validated.
param_grid = dict(
    depth=[4, 6, 8, 10],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    iterations=[500, 1000, 1500],
    l2_leaf_reg=[1, 3, 5, 10],
)

def validate_catboost(X, y, n_splits=3):
    """Cross-validate a default-parameter CatBoost MultiRMSE regressor.

    Out-of-fold predictions are produced over a shuffled K-fold split seeded
    with the module-level ``seed`` and scored against ``y``.

    Args:
        X: feature matrix, passed unchanged to ``cross_val_predict``.
        y: target values, passed unchanged to ``cross_val_predict``.
        n_splits: number of cross-validation folds.

    Returns:
        ``mean_squared_error(y, out_of_fold_predictions)``.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    catboost_options = {'loss_function': 'MultiRMSE', 'verbose': 0, 'random_seed': seed}
    regressor = CatBoostRegressor(**catboost_options)
    out_of_fold = cross_val_predict(regressor, X, y, cv=splitter)
    return mean_squared_error(y, out_of_fold)

def tune_catboost(X, y, param_grid, n_splits=3):
    """Grid-search CatBoost hyper-parameters and return the fitted best model.

    A ``GridSearchCV`` over ``param_grid`` (scored by negative MSE on a shuffled
    K-fold split seeded with the module-level ``seed``) selects the best
    parameter combination. The winning configuration is then scored again via
    out-of-fold predictions on the same folds.

    Args:
        X: feature matrix.
        y: target values.
        param_grid: mapping of CatBoost parameter name to candidate values.
        n_splits: number of cross-validation folds.

    Returns:
        A tuple ``(model, best_params, mse)`` where ``model`` is the best
        estimator refit by the grid search on all of ``X`` and ``y`` (ready for
        ``predict``), ``best_params`` is the selected parameter dict, and
        ``mse`` is ``mean_squared_error`` of the out-of-fold predictions.
        Note that ``mse`` is measured on the same folds that were used to pick
        the parameters, so it is an optimistic estimate.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    base_model = CatBoostRegressor(loss_function='MultiRMSE', verbose=0, random_seed=seed)
    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=kf,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    grid_search.fit(X, y)
    best_params = grid_search.best_params_

    # GridSearchCV refits the winning configuration on the full data (refit=True
    # is its default). Return that fitted estimator: cross_val_predict only fits
    # internal clones, so a freshly constructed model would come back unfitted.
    model_grid = grid_search.best_estimator_

    y_pred = cross_val_predict(model_grid, X, y, cv=kf)
    mse = mean_squared_error(y, y_pred)

    return model_grid, best_params, mse

## CatBoost + TF-IDF (10k features)

In [None]:
# Baseline: default CatBoost on the TF-IDF + gender feature matrix.
mse_catboost_tfidf = validate_catboost(X=X_tfidf, y=y)
print(mse_catboost_tfidf)

## CatBoost + embeddings

In [None]:
# Baseline: default CatBoost on the sentence-embedding + gender features.
mse_catboost_embeddings = validate_catboost(X=X_embeddings, y=y)
print(mse_catboost_embeddings)

## CatBoost (tuned) + TF-IDF (10k features)

In [None]:
# Grid-searched CatBoost on the TF-IDF + gender feature matrix.
model_grid_tfidf, best_params_grid_tfidf, mse_catboost_grid_tfidf = tune_catboost(
    X=X_tfidf,
    y=y,
    param_grid=param_grid,
)
print(mse_catboost_grid_tfidf)

## CatBoost (tuned) + embeddings

In [None]:
# Grid-searched CatBoost on the sentence-embedding + gender features.
model_grid_embeddings, best_params_grid_embeddings, mse_catboost_grid_embeddings = tune_catboost(
    X=X_embeddings,
    y=y,
    param_grid=param_grid,
)
print(mse_catboost_grid_embeddings)