In [2]:
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import nltk

nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/josecarlosavilapalazon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/josecarlosavilapalazon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Cargar los datos
train_df = pd.read_csv('training.txt', sep='\t', header=None, names=['id', 'label', 'tweet'])
dev_df = pd.read_csv('development.txt', sep='\t', header=None, names=['id', 'label', 'tweet'])
test_df = pd.read_csv('test.txt', sep='\t', header=None, names=['id', 'label', 'tweet'])
train_df.head()

Unnamed: 0,id,label,tweet
0,768213876278165504,NONE,-Me caes muy bien -Tienes que jugar más parti...
1,768213567418036224,N,@myendlesshazza a. que puto mal escribo b. me...
2,768212591105703936,N,@estherct209 jajajaja la tuya y la d mucha gen...
3,768221670255493120,P,Quiero mogollón a @AlbaBenito99 pero sobretodo...
4,768221021300264964,N,Vale he visto la tia bebiendose su regla y me ...


Diccionario de polaridad, lo introduciremos al pipeline como otra feature incluida en nuestro clasificador

In [4]:
def load_polarity_dict(file_path):
    polarity_dict = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                word, polarity = parts
                try:
                    polarity_dict[word] = float(polarity)
                except ValueError:
                    continue  
    return polarity_dict

polarity_dict = load_polarity_dict("../corpus/ElhPolar_esV1.lex")

Preproceso de los texto en los que realizaremos las siguientes labores:
1. eliminamos las menciones
2. eliminamos caracteres especiales que no nos sirven
3. convertimos a minusculas, para mayor facilidad de encontrar la misma palabra ej: CASA Casa casa
4. Tokenizar los textos en palabras y stopwords
5. Lemmatizar los textos con spaCy(morfologia de las palabras) o Steeming(sacar la raiz de las palabras)--ha funcionado mejor stemming en este caso

In [5]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, stopwords, stemmer):
        self.stopwords = stopwords
        self.stemmer = stemmer

    def preprocess(self, text):
        text = re.sub(r'@\w+', '', text)  # Eliminar menciones
        text = re.sub(r'[^a-zA-ZáéíóúÁÉÍÓÚñÑ\s]', '', text)  # Eliminar caracteres especiales
        text = text.lower()  # Convertir a minúsculas
        tokens = word_tokenize(text)  # Tokenización
        tokens = [t for t in tokens if t not in self.stopwords]  # Eliminar stopwords
        tokens = [self.stemmer.stem(t) for t in tokens]  # Aplicar stemming
        return ' '.join(tokens)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.apply(self.preprocess)


Clase para extraer el score total de los textos

In [6]:
class PolarityScoreExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, polarity_dict):
        self.polarity_dict = polarity_dict

    def polarity_score(self, text):
        words = text.split()
        score = sum(self.polarity_dict.get(word, 0) for word in words)
        return score

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.DataFrame(X.apply(self.polarity_score))


In [7]:
# stopwords and stemming
spanish_stopwords = set(stopwords.words('spanish'))
spanish_stemmer = SnowballStemmer('spanish')

# Preprocess
preprocessor = TextPreprocessor(spanish_stopwords, spanish_stemmer)
train_df['tweet'] = preprocessor.transform(train_df['tweet'])
dev_df['tweet'] = preprocessor.transform(dev_df['tweet'])
test_df['tweet'] = preprocessor.transform(test_df['tweet'])

In [8]:
# Concat dev and train
combined_df = pd.concat([train_df, dev_df])
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1514 entries, 0 to 505
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      1514 non-null   int64 
 1   label   1514 non-null   object
 2   tweet   1514 non-null   object
dtypes: int64(1), object(2)
memory usage: 47.3+ KB


In [9]:
combined_df.head()

Unnamed: 0,id,label,tweet
0,768213876278165504,NONE,caes bien jug part lol russel conmig tan otak ...
1,768213567418036224,N,put mal escrib b sig surr help qued rar comete...
2,768212591105703936,N,jajajaj d much gent segur pued melen muer
3,768221670255493120,P,quier mogollon sobretod rap contest wasaps
4,768221021300264964,N,val vist tia beb regl hs dad muchs grim


In [10]:
# Separar características y etiquetas
X_train = combined_df['tweet']
y_train = combined_df['label']
X_test = test_df['tweet']


In [11]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tfidf', TfidfVectorizer()),  # Vectorización
        ('polarity', PolarityScoreExtractor(polarity_dict))  # Extracción de polaridad
    ])),
    ('clf', LogisticRegression(C=0.0419, class_weight='balanced', random_state=42))  # Clasificador
])
#'clf', SVC(C=0.0419, class_weight='balanced', dual=False, random_state=22)
#'clf', RandomForestClassifier(**rf_params)

In [12]:
# Entrenar el modelo con todos los datos de entrenamiento
pipeline.fit(X_train, y_train)

# Validación cruzada
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='f1_weighted')
print(f'Cross-validation F1 scores: {cv_scores}')
print(f'Mean cross-validation F1 score: {cv_scores.mean()}')

Cross-validation F1 scores: [0.50040467 0.46998999 0.47029639 0.46957153 0.52727654]
Mean cross-validation F1 score: 0.48750782422784544


Utilizando lemmatizacion conseguimos un F1 score de 0.43 aprox, por ello hemos obtado por utilizar Stemming.

In [13]:
# Predict test
y_pred_test = pipeline.predict(X_test)

# Results in df
results = pd.DataFrame({'id': test_df['id'], 'label': y_pred_test})

In [14]:
# Guardar el DataFrame en un archivo de texto con el formato requerido
results.to_csv('resultado.txt', sep='\t', index=False, header=False)