In [1]:
import ast

import matplotlib.pyplot as plt
import matplotlib_inline
import numpy as np
import pandas as pd

In [2]:
# Read the training and test CSV files from the working directory.
train_data, test_data = (
    pd.read_csv(ruta) for ruta in ('./dataTraining.csv', './dataTesting.csv')
)

In [3]:
def _parse_genres(raw):
    """Convert one raw ``genres`` cell into a clean list of genre names.

    The cell is expected to hold the string form of a Python list, e.g.
    "['Drama', 'Sci-Fi']" (TODO confirm against the CSV). It is parsed with
    ``ast.literal_eval`` so genre names keep any inner spaces or apostrophes.

    Args:
        raw: Value taken from the ``genres`` column.

    Returns:
        A list of non-empty genre strings. Missing (non-string) cells and
        the empty list "[]" both yield ``[]`` rather than a bogus ``''`` label.
    """
    if not isinstance(raw, str):
        # Missing value (NaN/None): the movie carries no labels.
        return []
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        # Not a valid Python literal: fall back to plain comma splitting.
        parsed = raw.strip('[]').replace("'", "").split(',')
    if not isinstance(parsed, (list, tuple, set)):
        # A single scalar literal is treated as a one-genre list.
        parsed = [parsed]
    # Drop blank entries so they never become a class in the label encoder.
    return [str(genero).strip() for genero in parsed if str(genero).strip()]


# Feature frame: the CSV index column, the rating and the raw label column are
# removed; the labels come back below as a parsed list column.
train_data_f = train_data.drop(columns=['Unnamed: 0', 'rating', 'genres'])
# Plot length in characters, stored on the test frame.
test_data['longitud_trama'] = test_data['plot'].str.len()
# One list of genre names per movie.
train_data_f['generos'] = train_data['genres'].map(_parse_genres)

In [4]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
from nltk.corpus import stopwords

In [5]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Spanish and English stop-word lists, copied into plain Python lists.
stopwords_es, stopwords_en = (
    list(stopwords.words(idioma)) for idioma in ('spanish', 'english')
)

[nltk_data] Downloading package stopwords to C:\Users\Dell
[nltk_data]     G15\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# One document per movie: title followed by plot. Missing values are replaced
# by empty strings first, because NaN + str stays NaN and the vectorizer
# rejects NaN documents with a ValueError.
train_data_f['texto_completo'] = (
    train_data_f['title'].fillna('') + " " + train_data_f['plot'].fillna('')
)

# Bag-of-words counts over lower-cased text, ignoring Spanish and English
# stop words.
vector_count = CountVectorizer(
    lowercase=True,
    stop_words=stopwords_es + stopwords_en
)

X_count = vector_count.fit_transform(train_data_f['texto_completo'])

# (number of documents, vocabulary size)
print(X_count.shape)

(7895, 39455)


In [7]:
# TF-IDF weighting of the combined title + plot documents, using the merged
# Spanish and English stop-word lists and lower-cased text.
stopwords_bilingues = stopwords_es + stopwords_en
vector_tfidf = TfidfVectorizer(stop_words=stopwords_bilingues, lowercase=True)

documentos_train = train_data_f['texto_completo']
X_tfidf = vector_tfidf.fit_transform(documentos_train)

# (number of documents, vocabulary size)
print(X_tfidf.shape)

(7895, 39455)


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score,KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
import ast

In [9]:
# Encode each movie's genre list as a binary indicator row, one column per
# genre found in the training data.
listas_generos = train_data_f['generos']
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(listas_generos)

In [10]:
# One-vs-rest logistic regression: an independent binary classifier per genre,
# trained on the TF-IDF features.
modelo_final = OneVsRestClassifier(LogisticRegression(max_iter=500, random_state=42))
modelo_final.fit(X_tfidf, y_train)

# Build the test documents as title + plot. Missing values become empty
# strings so no NaN document reaches the vectorizer, which would raise a
# ValueError.
test_data['texto_completo'] = (
    test_data['title'].fillna('') + " " + test_data['plot'].fillna('')
)
# Reuse the vocabulary fitted on the training set; never refit on test data.
X_test_tfidf = vector_tfidf.transform(test_data['texto_completo'])

# Per-genre probabilities for every test movie.
y_pred_test = modelo_final.predict_proba(X_test_tfidf)

In [11]:
import joblib
# Persist the three artifacts needed for inference: the classifier, the fitted
# TF-IDF vectorizer and the label binarizer. Each one is written and then
# confirmed on stdout, in that order.
artefactos = [
    (modelo_final, 'TF-IDF.pkl', "Modelo guardado como 'TF-IDF.pkl'"),
    (vector_tfidf, 'vector_tfidf.pkl', "Vectorizador TF-IDF guardado como 'vector_tfidf.pkl'"),
    (mlb, 'mlb.pkl', "MultiLabelBinarizer guardado como 'mlb.pkl'"),
]
for objeto, ruta, mensaje in artefactos:
    joblib.dump(objeto, ruta)
    print(mensaje)


Modelo guardado como 'TF-IDF.pkl'
Vectorizador TF-IDF guardado como 'vector_tfidf.pkl'
MultiLabelBinarizer guardado como 'mlb.pkl'


In [12]:
# Reload the classifier from disk to check that the saved file is usable.
ruta_modelo = 'TF-IDF.pkl'
loaded_model = joblib.load(ruta_modelo)

In [13]:
# Smoke test: score a single hand-written English synopsis with the reloaded
# model and keep the three most probable genres.
texto_ejemplo_en = "A young detective investigates a series of mysterious murders in a small town. As he digs deeper, he discovers supernatural elements and a conspiracy that threatens the entire world."
df_ejemplo = pd.DataFrame({'texto_completo': [texto_ejemplo_en]})

# Vectorize with the in-memory TF-IDF vectorizer, then predict probabilities.
X_ejemplo = vector_tfidf.transform(df_ejemplo['texto_completo'])
y_pred_proba = loaded_model.predict_proba(X_ejemplo)

top_n = 3
probabilidades = y_pred_proba[0]
# argsort is ascending: take the last top_n positions and reverse them so the
# most probable genre comes first.
orden_ascendente = probabilidades.argsort()
top_indices = orden_ascendente[-top_n:][::-1]
top_probs = probabilidades[top_indices]

In [14]:
# Report the top genres, most probable first, as a 1-based ranking.
print("Géneros predichos para el texto:")
for posicion, (idx, prob) in enumerate(zip(top_indices, top_probs), start=1):
    # Column index -> genre name via the fitted label binarizer.
    genero = mlb.classes_[idx]
    # Two decimals: the original empty format spec printed the full float repr.
    print(f"{posicion}. {genero}: {prob * 100:.2f}%")

Géneros predichos para el texto:
1. Thriller: 68.78736094414201%
2. Mystery: 51.901778977591306%
3. Drama: 40.73851878797909%
