### Cargar datos

In [None]:
import pandas as pd
# Load the pre-processed titles; the CSV is comma-separated and decoded as ISO-8859-1.
datos = pd.read_csv(
    'titulos_procesados.csv',
    delimiter=',',
    encoding='ISO-8859-1',
)
# Preview the first rows as the cell output.
datos.head()

# Representación vectorial

#### Bolsa de palabras (BoW)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the titles. min_df=2 is forwarded to CountVectorizer
# (per the scikit-learn docs, an integer min_df drops terms that appear in fewer
# documents than that).
vectorizer = CountVectorizer(min_df=2)
titulox = datos["title"].tolist()
matriz_bow = vectorizer.fit_transform(titulox)

# Expand the count matrix into a DataFrame with one column per vocabulary term.
terminos_bow = vectorizer.get_feature_names_out()
bow = pd.DataFrame(data=matriz_bow.toarray(), columns=terminos_bow)
bow.head()

#### Frecuencia de término – frecuencia inversa de documento (TF-IDF)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights over the titles, using the same min_df=2 threshold as the BoW cell.
tfidf_vectorizer = TfidfVectorizer(min_df=2)
tfidf_matrix = tfidf_vectorizer.fit_transform(datos['title'])

# Dense DataFrame with one column per term kept by the vectorizer.
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf = pd.DataFrame(data=tfidf_matrix.toarray(), columns=feature_names)
primeras_filas_tfidf = tfidf.head()
print(primeras_filas_tfidf)

## Selección de variables

#### Seleccionar de acuerdo a la importancia de las variables según modelo Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Rank the BoW terms by Random Forest feature importance and keep the top ones.
# NOTE(review): the forest is fitted on every row of `bow`, i.e. before the
# train/test split performed later in the notebook, so the selection has seen
# the rows that end up in the test set — confirm this is acceptable.
N_VARIABLES_RF = 4000  # maximum number of terms to keep

# Fixed seed so the importance ranking (and therefore the selected terms) is
# the same on every Restart & Run All.
model = RandomForestClassifier(random_state=42)
model.fit(bow, datos["classification"])  # BoW is the reference representation

# Importance of each BoW column according to the fitted forest
importancia = model.feature_importances_

# Column positions sorted from most to least important, truncated to the top N
indices = np.argsort(importancia)[::-1][:N_VARIABLES_RF]
X_selected = bow.iloc[:, indices]  # the most important variables

# Re-vectorize the titles restricted to the selected terms. The vocabulary is
# passed as an explicit list of terms instead of the DataFrame itself (which
# only worked because iterating a DataFrame yields its column labels).
titulox = datos["title"]
vectorizer = CountVectorizer(vocabulary=X_selected.columns.tolist())
matriz_bow = vectorizer.fit_transform(titulox)
bowrf = pd.DataFrame(matriz_bow.toarray(), columns=vectorizer.get_feature_names_out())
print(bowrf.head())

#### Principal component analysis (PCA)

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

# Fit a PCA with no component limit on the BoW matrix to see how variance accumulates.
pca = PCA().fit(bow)

# Running total of the explained-variance ratio, one entry per component.
cumulative_variance_ratio = pca.explained_variance_ratio_.cumsum()
numero_componentes = range(1, cumulative_variance_ratio.size + 1)

# Cumulative explained variance against the number of components.
plt.figure()
plt.plot(numero_componentes, cumulative_variance_ratio, linestyle='--', marker='o')
plt.title('Varianza Explicada Acumulada por Número de Componentes')
plt.xlabel('Número de Componentes')
plt.ylabel('Varianza Explicada Acumulada')
plt.show()

In [None]:
# Aplicar PCA
from sklearn.decomposition import PCA
# Project the BoW matrix onto its leading principal components.
# NOTE(review): n_components=3000 was presumably read off the cumulative-variance plot — confirm.
pca = PCA(n_components=3000)  # number of principal components to keep
X_pca = pca.fit_transform(bow)
principal_components = pd.DataFrame(data=X_pca)

# Place the class label next to the components under the column name 'target'.
etiqueta = datos["classification"].rename('target')
final_df = pd.concat([principal_components, etiqueta], axis=1)

# Show the first rows of the result
print(final_df.head())

### Dividir datos en entrenamiento y prueba

In [None]:
from sklearn.model_selection import train_test_split

# Choose which feature representation feeds the models: keep exactly one `info` line active.
info = bow  # BoW
#info = tfidf  # TF-IDF
#info = principal_components  # PCA
#info = bowrf  # Random Forest importance selection

# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
etiquetas = datos["classification"]
X_train, X_test, y_train, y_test = train_test_split(
    info,
    etiquetas,
    test_size=0.1,
    random_state=70,
)

# Modelos utilizados

### Búsqueda de hiperparámetros

In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Base forest whose hyperparameters are tuned below.
modelo_rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Candidate values explored by the grid search.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

# 5-fold cross-validated search on the training split, scored by accuracy.
grid_search = GridSearchCV(modelo_rf, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Report the best combination found
mejores_hiperparametros = grid_search.best_params_
print(mejores_hiperparametros)

In [None]:
# Regresión logística

from sklearn.linear_model import LogisticRegression
# Base multinomial logistic regression; the grid below overrides these values.
modelorl = LogisticRegression(multi_class='multinomial', C=0.1, max_iter=100, penalty='l2', solver='lbfgs')

# The grid is split per solver because not every solver accepts every option:
# lbfgs does not support the l1 penalty, and liblinear does not support the
# multinomial formulation. In a single crossed grid those candidates fail to
# fit and are scored as NaN, so they are never really compared.
valores_c = [0.001, 0.01, 0.1, 1, 10]
valores_max_iter = [100, 500, 1000]
param_grid = [
    {
        'solver': ['lbfgs'],
        'penalty': ['l2'],
        'C': valores_c,
        'max_iter': valores_max_iter,
    },
    {
        'solver': ['liblinear'],
        'multi_class': ['ovr'],  # liblinear is only usable one-vs-rest
        'penalty': ['l1', 'l2'],
        'C': valores_c,
        'max_iter': valores_max_iter,
    },
]

# 5-fold cross-validated search on the training split, scored by accuracy.
grid_search = GridSearchCV(estimator=modelorl, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Report the best combination found
mejores_hiperparametros = grid_search.best_params_
print(mejores_hiperparametros)

In [None]:
# Support vector machine

from sklearn.svm import SVC
# Base SVM with a one-vs-rest decision function; C and kernel are tuned below.
modelo_svm = SVC(decision_function_shape='ovr')

# Regularization strengths and kernels to compare.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10],
    kernel=['linear', 'rbf', 'poly'],
)

# 5-fold cross-validated search on the training split, scored by accuracy.
grid_search = GridSearchCV(modelo_svm, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Report the best combination found
mejores_hiperparametros = grid_search.best_params_
print(mejores_hiperparametros)

In [None]:
# Naive Bayes

from sklearn.naive_bayes import MultinomialNB

# Smoothing values to compare for Multinomial Naive Bayes.
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0]
}

# Search on the shared X_train / y_train split produced by train_test_split,
# the same one every other model in this notebook is tuned on.
# NOTE(review): MultinomialNB rejects negative feature values, so this cell is
# expected to fail when `info` is the PCA representation — confirm.
grid_search = GridSearchCV(estimator=MultinomialNB(), param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Report the best smoothing value found
mejor_alpha = grid_search.best_params_['alpha']
print(mejor_alpha)

### Medir el desempeño de los modelos 

In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Final Random Forest with the hyperparameters chosen for evaluation.
# random_state is fixed (same seed as the forest used in the grid search) so
# the reported accuracy is reproducible across runs.
modelo_rf = RandomForestClassifier(max_depth=None, min_samples_split=5, n_estimators=200, random_state=42)
modelo_rf.fit(X_train, y_train)

# Accuracy on the held-out test split
y_pred = modelo_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

In [None]:
# Regresión logística multinomial

from sklearn.linear_model import LogisticRegression

# Multinomial logistic regression: fit on the training split, score on the test split.
modelrl = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    penalty='l2',
    C=0.1,
    max_iter=100,
).fit(X_train, y_train)
y_pred = modelrl.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

In [None]:
# Regresión logística ovr

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score

# One-vs-rest logistic regression: fit on the training split, score on the test split.
modelrl = LogisticRegression(
    multi_class='ovr',
    solver='lbfgs',
    penalty='l2',
    C=0.1,
    max_iter=100,
).fit(X_train, y_train)
y_pred = modelrl.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

In [None]:
# Support vector machine (kernel lineal)

from sklearn.svm import SVC

# Linear-kernel SVM: fit on the training split and report test accuracy.
modelo_svm = SVC(
    C=0.1,
    kernel='linear',
    decision_function_shape='ovr',
).fit(X_train, y_train)
y_pred = modelo_svm.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

In [None]:
# Support vector machine (polynomial kernel): fit on the training split and report test accuracy.
modelo_svm = SVC(
    C=0.1,
    kernel='poly',
    decision_function_shape='ovr',
).fit(X_train, y_train)
y_pred = modelo_svm.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

In [None]:
# Support vector machine (radial kernel): fit on the training split and report test accuracy.
modelo_svm = SVC(
    C=0.1,
    kernel='rbf',
    decision_function_shape='ovr',
).fit(X_train, y_train)
y_pred = modelo_svm.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

In [None]:
# Naive Bayes

from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with its default settings: fit on the training split
# and report test accuracy.
modelo_nb = MultinomialNB().fit(X_train, y_train)
y_pred = modelo_nb.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)