In [3]:
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
import nltk
#!pip install spacy
import spacy
#!pip install es_core_news_sm
import es_core_news_sm

# Download the NLTK 'punkt' and 'stopwords' resources
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Load the Spanish spaCy model that was imported as a package above
nlp = es_core_news_sm.load()

ValueError: 'in' is not a valid parameter name

In [None]:
# Load the data: three tab-separated files without a header row
def _read_split(path):
    """Read one header-less, tab-separated file into an id/label/tweet DataFrame."""
    return pd.read_csv(path, sep='\t', header=None, names=['id', 'label', 'tweet'])


train_df = _read_split('training.txt')
dev_df = _read_split('development.txt')
test_df = _read_split('test.txt')
train_df.head()

Diccionario de polaridad: lo introduciremos en el pipeline como una feature adicional de nuestro clasificador.

In [None]:
def load_polarity_dict(file_path):
    """Load a word -> polarity lexicon from a tab-separated text file.

    Only lines with exactly two tab-separated fields (word, polarity) are
    used. The polarity may be a number (e.g. ``0.75``) or one of the textual
    labels ``positive`` / ``negative`` (case-insensitive), which are mapped to
    ``1.0`` and ``-1.0``. Every other line (blank lines, headers, lines with
    an unrecognised polarity) is skipped.

    Parameters
    ----------
    file_path : str or path-like
        Location of the UTF-8 encoded lexicon file.

    Returns
    -------
    dict
        Maps each word to its polarity as a float. If a word appears more
        than once, the last occurrence wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Without this mapping a label-based lexicon would be skipped line by
    # line by the float() fallback below and the result would be empty.
    label_values = {'positive': 1.0, 'negative': -1.0}

    polarity_dict = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) != 2:
                continue
            word, polarity = parts

            label = polarity.strip().lower()
            if label in label_values:
                polarity_dict[word] = label_values[label]
                continue

            try:
                polarity_dict[word] = float(polarity)
            except ValueError:
                # Neither a number nor a known label: ignore this entry.
                continue
    return polarity_dict

# Path of the polarity lexicon, relative to the notebook's working directory
POLARITY_LEXICON_PATH = "../corpus/ElhPolar_esV1.lex"
polarity_dict = load_polarity_dict(POLARITY_LEXICON_PATH)

Preprocesamiento de los textos, en el que realizaremos las siguientes tareas:
1. Eliminamos las menciones
2. Eliminamos los caracteres especiales que no nos sirven
3. Convertimos a minúsculas, para que la misma palabra se reconozca siempre igual, p. ej.: CASA, Casa, casa
4. Tokenizamos los textos en palabras
5. Lematizamos los textos con spaCy y eliminamos las stopwords y los signos de puntuación

In [None]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Clean and lemmatize raw tweets so they can be vectorized.

    Parameters
    ----------
    stopwords : collection of str
        Words to drop; membership is tested with ``in`` against the
        lower-cased token text.
    nlp : callable
        spaCy-style pipeline: ``nlp(text)`` must return an iterable of tokens
        exposing ``text``, ``lemma_``, ``is_punct``, ``is_stop`` and
        ``is_space``.
    """

    # Twitter-style mentions such as "@usuario".
    _MENTION_RE = re.compile(r'@\w+')
    # Everything that is not a Spanish letter or whitespace. "ü"/"Ü" are part
    # of the whitelist so that words like "vergüenza" are not mangled.
    _NON_LETTER_RE = re.compile(r'[^a-zA-ZáéíóúüÁÉÍÓÚÜñÑ\s]')

    def __init__(self, stopwords, nlp):
        self.stopwords = stopwords
        self.nlp = nlp

    def preprocess(self, text):
        """Return the cleaned, lemmatized form of one text.

        Steps: strip @mentions, delete every character that is not a Spanish
        letter or whitespace, lower-case, run ``self.nlp`` and keep the lemma
        of each token that is not a stopword, punctuation or whitespace.

        Parameters
        ----------
        text : str
            Raw text. Any non-string value (e.g. ``None`` or the NaN pandas
            uses for a missing cell) is treated as an empty text instead of
            raising ``TypeError``.

        Returns
        -------
        str
            The kept lemmas joined by single spaces (possibly empty).
        """
        if not isinstance(text, str):
            return ''

        text = self._MENTION_RE.sub('', text)
        text = self._NON_LETTER_RE.sub('', text)
        text = text.lower()
        doc = self.nlp(text)
        lemmas = [
            token.lemma_
            for token in doc
            if token.text not in self.stopwords
            and not token.is_punct
            and not token.is_stop
            # Runs of spaces left behind by the deletions above come back as
            # whitespace-only tokens; they carry no information.
            and not token.is_space
        ]
        return ' '.join(lemmas)

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Apply :meth:`preprocess` to every text in ``X``.

        Parameters
        ----------
        X : pandas.Series or sequence of str
            Texts to clean. A Series keeps its index; any other sequence is
            wrapped in a new Series first.

        Returns
        -------
        pandas.Series
            The cleaned texts.
        """
        texts = X if isinstance(X, pd.Series) else pd.Series(X)
        return texts.apply(self.preprocess)

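Clase para extraer el score de polaridad total de cada texto: la suma de las polaridades de sus palabras según el diccionario (las palabras que no están en el diccionario cuentan como 0).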

In [None]:
class PolarityScoreExtractor(BaseEstimator, TransformerMixin):
    """Transformer that turns each text into one number: its summed lexicon polarity.

    Parameters
    ----------
    polarity_dict : dict
        Maps a word to its polarity value; words that are missing from the
        dictionary contribute 0.
    """

    def __init__(self, polarity_dict):
        self.polarity_dict = polarity_dict

    def polarity_score(self, text):
        """Add up the polarity of every whitespace-separated word in ``text``."""
        total = 0
        for token in text.split():
            total = total + self.polarity_dict.get(token, 0)
        return total

    def fit(self, X, y=None):
        """Stateless transformer: there is nothing to fit, so return ``self``."""
        return self

    def transform(self, X):
        """Score every text in the Series ``X`` and return a one-column DataFrame."""
        scores = X.apply(self.polarity_score)
        return pd.DataFrame(scores)


In [None]:
# Spanish stopword list from NLTK
spanish_stopwords = set(stopwords.words('spanish'))
preprocessor = TextPreprocessor(spanish_stopwords, nlp)

# Clean the tweet column of every split; the raw text is overwritten in place
for split_df in (train_df, dev_df, test_df):
    split_df['tweet'] = preprocessor.transform(split_df['tweet'])

In [None]:
# Stack the training and development splits row-wise into a single frame
splits_to_merge = (train_df, dev_df)
combined_df = pd.concat(splits_to_merge, axis=0)
combined_df.info()

In [None]:
# Peek at the first five rows of the merged frame
combined_df.head(n=5)

In [None]:
# Model inputs (preprocessed tweets) and targets (labels)
X_train, y_train = combined_df['tweet'], combined_df['label']
X_test = test_df['tweet']

# Number of non-null training tweets
X_train.count()

In [None]:
# Build the model.
# NOTE(review): no visible cell of this notebook defines `pipeline`, so this
# cell raised NameError on a fresh kernel. The definition below is
# reconstructed from the notebook's imports and prose (text features plus the
# lexicon polarity score feeding a random forest) — confirm the vectorizer
# choice and the hyperparameters against the original experiment.
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tfidf', TfidfVectorizer()),
        ('polarity', PolarityScoreExtractor(polarity_dict)),
    ])),
    # Fixed seed so that repeated runs build the same forest.
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Train the model on all labelled data (train + dev)
pipeline.fit(X_train, y_train)

# Cross-validation: cross_val_score fits fresh clones of the pipeline on each
# fold, so the fitted model above is not modified by this call.
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='f1_weighted')
print(f'Cross-validation F1 scores: {cv_scores}')
print(f'Mean cross-validation F1 score: {cv_scores.mean()}')



In [None]:
# Predict a label for every test tweet
y_pred_test = pipeline.predict(X_test)

# Pair each test id with its predicted label
results = test_df[['id']].assign(label=y_pred_test)

In [None]:
# Save the DataFrame as a tab-separated text file in the required format
# (no index column, no header row)
results.to_csv(path_or_buf='resultado2.txt', sep='\t', header=False, index=False)