## **MULTINOMIAL NAIVE BAYES 2 LABELS CON PROMEDIO DE PALABRAS TEXT/LABEL TFIDFVECTORIZER**


In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from google.colab import drive
import joblib

# Mount Google Drive at /content/drive for this Colab session
drive.mount('/content/drive')

# Download the NLTK resources used by the preprocessing below:
# the stop-word lists and the 'punkt' tokenizer data
# (presumably what word_tokenize relies on — confirm for the installed NLTK version)
nltk.download('stopwords')
nltk.download('punkt')


Mounted at /content/drive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Initialise the English Snowball stemmer once; it is reused for every document
stemmer = SnowballStemmer('english')

# Build the stop-word lookup a single time. Calling stopwords.words('english')
# inside the per-token filter re-reads the list and scans it linearly for
# every token; a frozenset gives constant-time membership tests instead.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stem_text(text):
    """Tokenize *text*, drop English stop words and stem the remaining tokens.

    Stop-word matching is case-insensitive (tokens are lower-cased for the
    lookup only). The surviving tokens are passed to the Snowball stemmer
    and joined back together with single spaces.

    Args:
        text: The raw string to preprocess.

    Returns:
        A single string containing the stemmed tokens separated by spaces.
    """
    return ' '.join(
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token.lower() not in _ENGLISH_STOPWORDS
    )

# Read the first 1800 rows of the training CSV, keeping only the columns at
# positions 1-3 (used below as 'keyword', 'text' and 'label')
data = pd.read_csv(
    'SMM4H_2024_Task3_Training_1800.csv',
    nrows=1800,
    usecols=[1, 2, 3],
    engine='python',
)

# Stem the 'keyword' and 'text' columns (stop words are removed by stem_text)
for column_name in ('keyword', 'text'):
    data[column_name] = data[column_name].apply(stem_text)

# Binarise the target: 0 stays 0, every other value becomes 1
data['label'] = data['label'].where(data['label'] == 0, 1)

# 80 % train, then the remaining 20 % is halved into validation and test;
# both splits are stratified on the binary label and use a fixed seed
train_data, temp_data = train_test_split(
    data,
    test_size=0.2,
    stratify=data['label'],
    random_state=42,
)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    stratify=temp_data['label'],
    random_state=42,
)

def _count_words(text):
    """Return the number of whitespace-separated tokens in *text*."""
    return len(text.split())


# Per-document word count of the stemmed text, kept on `data` for inspection
data['word_count'] = data['text'].apply(_count_words)

# Descriptive statistic only: average document length for each class
class_word_counts = data.groupby('label')['word_count'].mean()
print(class_word_counts)

# The model feature must be computable without knowing the label. Looking
# the value up from the row's own label (the per-class average) makes the
# feature a copy of the target and yields trivially perfect scores, so each
# document's OWN word count is used instead.
# The scaling bounds come from the training split only, so validation and
# test rows do not influence the feature definition.
train_word_counts = train_data['text'].apply(_count_words)
min_word_count = train_word_counts.min()
max_word_count = train_word_counts.max()
# Guard against a zero range (every training document having the same length)
word_count_range = max(max_word_count - min_word_count, 1)


def _normalized_word_count(frame):
    """Min-max scale each row's word count using the training-split bounds."""
    counts = frame['text'].apply(_count_words)
    scaled = (counts - min_word_count) / word_count_range
    # Rows outside the training range would fall below 0 or above 1;
    # MultinomialNB requires non-negative inputs, so clip to [0, 1].
    return scaled.clip(lower=0.0, upper=1.0)


# assign() returns new frames, which avoids writing into slices of `data`
train_data = train_data.assign(normalized_word_count=_normalized_word_count(train_data))
val_data = val_data.assign(normalized_word_count=_normalized_word_count(val_data))
test_data = test_data.assign(normalized_word_count=_normalized_word_count(test_data))

# Separate model inputs (stemmed text + word-count feature) from the target
# for each of the three splits
feature_columns = ['text', 'normalized_word_count']

X_train, y_train = train_data[feature_columns], train_data['label']
X_val, y_val = val_data[feature_columns], val_data['label']
X_test, y_test = test_data[feature_columns], test_data['label']

# Show the test inputs as a quick visual check
print('test_data', X_test)

# TF-IDF text representation feeding a multinomial Naive Bayes classifier
vectorizer = TfidfVectorizer()
model = MultinomialNB()


def _append_word_count(tfidf_matrix, features):
    """Return the dense TF-IDF matrix with the word-count column appended last."""
    word_count_column = np.asarray(features['normalized_word_count'])
    return np.column_stack((tfidf_matrix.toarray(), word_count_column))


# Fit the vocabulary on the training text only, then train the classifier
X_train_text = vectorizer.fit_transform(X_train['text'])
X_train_combined = _append_word_count(X_train_text, X_train)
model.fit(X_train_combined, y_train)

# Validation split: reuse the fitted vectorizer (transform, not fit)
X_val_text = vectorizer.transform(X_val['text'])
X_val_combined = _append_word_count(X_val_text, X_val)
predicted_labels = model.predict(X_val_combined)

print("Resultados en datos de validación:")
print(classification_report(y_val, predicted_labels))

# Test split: same procedure as for validation
X_test_text = vectorizer.transform(X_test['text'])
X_test_combined = _append_word_count(X_test_text, X_test)
predicted_labels_test = model.predict(X_test_combined)

print("\nResultados en datos de prueba:")
print(classification_report(y_test, predicted_labels_test))


label
0    111.706454
1    150.692078
Name: word_count, dtype: float64
test_data                                                    text  normalized_word_count
1573  feel ya buddi , spent day loung around home , ...                    1.0
460   sure tell regard possibl swim problem futur , ...                    0.0
315   although subject frustrat , n't let beauti thi...                    0.0
1659  struggl get outsid . start thing : 'm 21 year ...                    1.0
1179  recent went run / walk . took like week actual...                    1.0
...                                                 ...                    ...
161   sever anxiety/depress . 'm 20 year old dude di...                    0.0
654   anxieti roof ! . past week two get rid one dog...                    0.0
957   n't realli tast music . younger probabl tast w...                    0.0
744                         ’ outsid though . ’ peopl .                    0.0
619   pretti much , 's exact get thing done make c

In [None]:
print(len(data['text']))

# Vectorise every document once with the vectorizer fitted on the training split
X_all_data_text = vectorizer.transform(data['text'])

# Reuse the word-count feature the three splits already carry instead of
# deriving it a second time, so this evaluation is guaranteed to see exactly
# the feature definition the model was trained with. The splits partition
# `data` and keep its row index, so the assignment aligns on that index.
data['normalized_word_count'] = pd.concat(
    [train_data, val_data, test_data]
)['normalized_word_count']

# Append the word-count column to the dense TF-IDF matrix
X_all_data_combined = np.hstack(
    (
        X_all_data_text.toarray(),
        np.array(data['normalized_word_count']).reshape(-1, 1),
    )
)

# Predict a label for every row
predicted_labels_all_data = model.predict(X_all_data_combined)

# NOTE(review): this report includes the rows the model was trained on, and
# a word-count feature that is derived from the true label would make the
# scores trivially perfect. Treat these numbers as a sanity check, not as an
# estimate of generalisation.
print("\nResultados en todos los datos:")
print(classification_report(data['label'], predicted_labels_all_data))


1800

Resultados en todos los datos:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1131
           1       1.00      1.00      1.00       669

    accuracy                           1.00      1800
   macro avg       1.00      1.00      1.00      1800
weighted avg       1.00      1.00      1.00      1800



In [None]:
# Number of first-stage predictions produced for `data`
print(len(predicted_labels_all_data))

# Keep the rows whose stored (binarised) label is 1. Note that the filter
# uses the 'label' column of `data`, not `predicted_labels_all_data`.
new_data_class_2 = data.loc[data['label'].eq(1)]
print(new_data_class_2)

1800
                                   keyword  \
1131                                  walk   
1132                         outsid , bike   
1133                                  walk   
1134                                   run   
1135               run , run , hors , walk   
...                                    ...   
1795                   pool , beach , pool   
1796                      outsid , outdoor   
1797                                   jog   
1798                    walk , swim , pool   
1799  roller blade , outsid , roller blade   

                                                   text  label  word_count  \
1131  feel like text send back forth pretti flirtati...      1         111   
1132  'm gon na pokemon thing get go outsid , bike ....      1          50   
1133  someth work expos anxieti . even though 'm sta...      1         266   
1134  absolut ! pleas encourag son . , make son exer...      1          93   
1135  agre feel good thing . use go run track field 

In [None]:
# Keep the rows whose stored (binarised) label is 0. The filter uses the
# 'label' column of `data`, not the model predictions.
new_data = data.loc[data['label'].eq(0)]
print(new_data)

                 keyword                                               text  \
0                    run  21/m . want experi young love , ve never relat...   
1                 outsid  issu talk girl enjoy , sinc ignor pms most .. ...   
2     run , swim , climb  need advic free social activ . ve come shell b...   
3                   walk  spoke today .. week ago met girl colleg biolog...   
4                 outsid  get social anxieti ? . hello , day girlfriend ...   
...                  ...                                                ...   
1126           basketbal  stiff . social anxieti make stiff ’ like muscl...   
1127                 run  struggl set boundari friend alway come back bi...   
1128                pool  think 'm make progress , realis far still go ....   
1129                 run  help ? : ( . ’ know make friend . ’ 21 year ol...   
1130       outsid , walk  random peopl take insecur ? . outsid break lad...   

      label  word_count  normalized_word_count  
0 

In [None]:
# Persist the fitted classifier and the fitted vectorizer to local files;
# both are needed together to score new text
joblib.dump(model, 'naive_bayes_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

# Copy both files into the mounted Google Drive folder
# (lines starting with '!' are IPython shell escapes, not plain Python)
!cp naive_bayes_model.pkl '/content/drive/My Drive/'
!cp vectorizer.pkl '/content/drive/My Drive/'



---

## **SENTIMENT ANALYSIS**

## **MODEL: cardiffnlp/twitter-roberta-base-sentiment**


In [None]:
import pandas as pd
from sklearn.metrics import classification_report
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# Reload the CSV (this time including column 0) so that `data` holds the
# labels as stored in the file rather than the binarised ones
data = pd.read_csv('SMM4H_2024_Task3_Training_1800.csv', nrows=1800, usecols=[0, 1, 2, 3], engine='python')

# Wrap every class-1 text as "('<text>')" before it is sent to the classifier
formatted_texts = new_data_class_2['text'].apply(lambda text: f"('{text}')")

# Build the sentiment classifier
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
sentiment_task = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

# Pipeline output label -> numeric class used in this notebook; any other
# label (and any failure) falls back to class 3.
# NOTE(review): LABEL_0/1/2 presumably mean negative/neutral/positive for
# this checkpoint — confirm against the model card.
SENTIMENT_TO_CLASS = {'LABEL_2': 1, 'LABEL_1': 2}
FALLBACK_CLASS = 3

predictions_5 = []
for text_number, text in enumerate(formatted_texts, start=1):
    try:
        # Tokenize with truncation so the input never exceeds 512 tokens
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Turn the (possibly truncated) token ids back into a string
        text_input = tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
        # Classify the truncated string
        output_5 = sentiment_task(text_input)
        label = output_5[0]['label']
    except Exception as e:
        # Best effort: report the failure and record the fallback class so
        # that predictions_5 stays aligned with formatted_texts
        print(f"Error al procesar texto {text}: {e}")
        predictions_5.append(FALLBACK_CLASS)
    else:
        predictions_5.append(SENTIMENT_TO_CLASS.get(label, FALLBACK_CLASS))
        # Only report an output that belongs to THIS text; after a failure
        # `output_5` would be stale (or undefined on the first iteration)
        print(f"Texto {text_number}: {output_5}")

print(predictions_5)


Texto 1: [{'label': 'LABEL_2', 'score': 0.6311589479446411}]
Texto 2: [{'label': 'LABEL_2', 'score': 0.7894551753997803}]
Texto 3: [{'label': 'LABEL_0', 'score': 0.5005853772163391}]
Texto 4: [{'label': 'LABEL_2', 'score': 0.4797581732273102}]
Texto 5: [{'label': 'LABEL_1', 'score': 0.54304438829422}]
Texto 6: [{'label': 'LABEL_1', 'score': 0.7196619510650635}]
Texto 7: [{'label': 'LABEL_1', 'score': 0.5181955695152283}]
Texto 8: [{'label': 'LABEL_0', 'score': 0.593110978603363}]
Texto 9: [{'label': 'LABEL_1', 'score': 0.5358076095581055}]
Texto 10: [{'label': 'LABEL_0', 'score': 0.5516340136528015}]
Texto 11: [{'label': 'LABEL_2', 'score': 0.980198085308075}]
Texto 12: [{'label': 'LABEL_2', 'score': 0.9586889743804932}]
Texto 13: [{'label': 'LABEL_1', 'score': 0.5506689548492432}]
Texto 14: [{'label': 'LABEL_1', 'score': 0.5819634199142456}]
Texto 15: [{'label': 'LABEL_2', 'score': 0.5148200988769531}]
Texto 16: [{'label': 'LABEL_1', 'score': 0.49596545100212097}]
Texto 17: [{'label':

In [None]:
from collections import Counter

# Ground-truth labels for exactly the rows that were sent to the sentiment
# model. Selecting by the subset's own index (instead of a hard-coded
# "last 669 rows") keeps labels and predictions aligned even if the number
# or position of class-1 rows changes.
y_true = data.loc[new_data_class_2.index, 'label']
print(y_true)

# Count how many texts were assigned to each predicted class
class_counts = dict(sorted(Counter(predictions_5).items()))

# Show the per-class counts
print("Conteo de cada clase:")
for label, count in class_counts.items():
    print(f"Clase {label}: {count} instancias")

# Compare the sentiment-derived classes against the labels stored in `data`
print("\nClassification Report:")
print(classification_report(y_true, predictions_5))

1131    1
1132    1
1133    1
1134    1
1135    1
       ..
1795    3
1796    3
1797    3
1798    3
1799    3
Name: label, Length: 669, dtype: int64
Conteo de cada clase:
Clase 1: 89 instancias
Clase 2: 362 instancias
Clase 3: 218 instancias

Classification Report:
              precision    recall  f1-score   support

           1       0.48      0.27      0.35       160
           2       0.61      0.56      0.58       395
           3       0.27      0.51      0.35       114

    accuracy                           0.48       669
   macro avg       0.45      0.45      0.43       669
weighted avg       0.52      0.48      0.49       669



In [None]:
# Pair every class-1 text with its sentiment-derived class, keeping the
# original row index of the text
new_data_with_predictions = new_data_class_2[['text']].assign(label=predictions_5)

# Show the new frame, followed by the class-0 rows for comparison
print(new_data_with_predictions)

print(new_data)

                                                   text  label
1131  feel like text send back forth pretti flirtati...      1
1132  'm gon na pokemon thing get go outsid , bike ....      1
1133  someth work expos anxieti . even though 'm sta...      3
1134  absolut ! pleas encourag son . , make son exer...      1
1135  agre feel good thing . use go run track field ...      2
...                                                 ...    ...
1795  social anxieti : essenti . look around , felt ...      2
1796  eye contact . public . ca n't . hi , 'm 17 tho...      3
1797  look light , find . look dark , ever see.nnmi ...      2
1798  feel like sa ruin life . hi , hope okay post ....      3
1799  got social anxieti tri roller blade . 24 year ...      2

[669 rows x 2 columns]
                 keyword                                               text  \
0                    run  21/m . want experi young love , ve never relat...   
1                 outsid  issu talk girl enjoy , sinc ignor pm

In [None]:
# Stack the class-0 rows (label kept as 0) on top of the class-1 rows with
# their sentiment-derived labels; only 'text' and 'label' are carried over
pred_all_texts = pd.concat([new_data[['text', 'label']], new_data_with_predictions])

# Show the combined predictions and the reference data
print(pred_all_texts)
print(data)

# classification_report compares its two inputs position by position and
# ignores the pandas index. The concatenation above is only in the same
# order as `data` when the file happens to be sorted by label, so align the
# predictions on the row index explicitly (a missing row raises KeyError).
aligned_predictions = pred_all_texts['label'].loc[data.index]

print("\nResultados en todos los datos:")
print(classification_report(data['label'], aligned_predictions))

                                                   text  label
0     21/m . want experi young love , ve never relat...      0
1     issu talk girl enjoy , sinc ignor pms most .. ...      0
2     need advic free social activ . ve come shell b...      0
3     spoke today .. week ago met girl colleg biolog...      0
4     get social anxieti ? . hello , day girlfriend ...      0
...                                                 ...    ...
1795  social anxieti : essenti . look around , felt ...      2
1796  eye contact . public . ca n't . hi , 'm 17 tho...      3
1797  look light , find . look dark , ever see.nnmi ...      2
1798  feel like sa ruin life . hi , hope okay post ....      3
1799  got social anxieti tri roller blade . 24 year ...      2

[1800 rows x 2 columns]
           id                                keyword  \
0      3u2w5k                                    run   
1      3xbury                                outside   
2      3y743u                       run, swim, clim