In [2]:
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [5]:
import re
import string

In [6]:
def limpia_texto(texto):
    texto = texto.lower()
    texto = re.sub(r'\d+', '', texto) # números
    texto = texto.translate(str.maketrans('', '', string.punctuation))  # signos de puntuación
    texto = re.sub(r'\s+', ' ', texto).strip() # espacios
    return ' '.join([p for p in texto.split() if p.isalpha()])

In [7]:
df = pd.read_csv("./dataset/df_completo.csv")

In [8]:

df['texto_limpio'] = df["text"].apply(limpia_texto)

In [9]:
df['texto_limpio']

0        donald trump just couldn t wish all americans ...
1        house intelligence committee chairman devin nu...
2        on friday it was revealed that former milwauke...
3        on christmas day donald trump announced that h...
4        pope francis used his annual christmas day mes...
                               ...                        
44674    brussels reuters nato allies on tuesday welcom...
44675    london reuters lexisnexis a provider of legal ...
44676    minsk reuters in the shadow of disused soviete...
44677    moscow reuters vatican secretary of state card...
44678    jakarta reuters indonesia will buy sukhoi figh...
Name: texto_limpio, Length: 44679, dtype: object

In [10]:
def random_forest():
    model = RandomForestClassifier(random_state=42)
    return model

In [11]:
def svc():
    model = SVC(random_state=42)
    return model

In [12]:
def logistic():
    model = LogisticRegression(random_state=42)
    return model

In [13]:
modelos = {
    "support_vector_classification": svc(),
    "random_forest_classifier": random_forest(),
    "logistic_regression": logistic(),
}

In [14]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['texto_limpio'])
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.3, random_state=42)

In [15]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [17]:
for nombre, modelo in modelos.items():
    cv_scores = cross_val_score(modelo, X_test, y_test, cv=cv, scoring='f1')
    print(f"{nombre}: Accuracy = {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

support_vector_classification: Accuracy = 0.9851 ± 0.0021
random_forest_classifier: Accuracy = 0.9782 ± 0.0026
logistic_regression: Accuracy = 0.9785 ± 0.0010


In [18]:
path = "./dataset/"
df.to_csv(path + "dataset_limpio.csv", index=False)