# ML_sentiment_analysis

Notebook para el entrenamiento de los modelos de ML para análisis de sentimientos

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
import joblib
import os

In [None]:
# Create the ml_sentiment_model working directory; exist_ok avoids an error
# when the directory is already present.
# NOTE(review): the save cells in this notebook write under
# "bert_sentiment_model", not this directory -- confirm which one is intended.
os.makedirs(name="/kaggle/working/ml_sentiment_model", exist_ok=True)

In [None]:
# Load the training and test CSV files; both are decoded as ISO-8859-1.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/sentiment-analysis-dataset/train.csv',
    encoding='ISO-8859-1',
)
test_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/sentiment-analysis-dataset/test.csv',
    encoding='ISO-8859-1',
)


In [None]:
def clean_text(text):
    """Normalise a raw text snippet before it is vectorised.

    Removes HTML tags, URLs and @mentions, drops the ``#`` character (the
    hashtag word itself is kept), then trims surrounding whitespace and
    lower-cases the result. Whitespace left behind by removed tokens is
    not collapsed.

    Args:
        text: The raw text. Missing values (``None`` or a float NaN) are
            treated as an empty string; any other non-string value is
            converted with ``str()`` first.

    Returns:
        The cleaned, lower-cased string.
    """
    import re

    # re.sub raises TypeError on non-string input, so missing values and
    # other types are normalised up front.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # Remove URLs
    text = re.sub(r"@\w+", '', text)  # Remove mentions
    text = re.sub(r"#", '', text)  # Remove hashtag symbol only
    return text.strip().lower()

In [None]:
# Training split: replace missing text with an empty string, clean it, and
# drop rows that have no sentiment label.
df['text'] = df['text'].fillna('')
df['text'] = df['text'].astype(str).apply(clean_text)
df = df.dropna(subset=['sentiment'])

# Test split: same treatment (rows without a label are dropped first).
test_df = test_df.dropna(subset=['sentiment'])
test_df['text'] = test_df['text'].fillna('')
test_df['text'] = test_df['text'].astype(str).apply(clean_text)

# Fit the label encoder on the training labels only.
train_label_encoder = LabelEncoder()
df['sentiment_label'] = train_label_encoder.fit_transform(df['sentiment'])

# Encode the test labels with the encoder fitted on the training data so the
# integer codes always mean the same class in both splits. Fitting a second
# encoder on the test labels would assign different codes whenever the two
# label sets differ. transform() raises ValueError if the test split contains
# a label that was never seen during training.
# test_label_encoder is kept as an alias for any code that still refers to it.
test_label_encoder = train_label_encoder
test_df['sentiment_label'] = train_label_encoder.transform(test_df['sentiment'])

# SVM

In [None]:
# TF-IDF features feeding an RBF-kernel support vector classifier.
svm_pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("svc", SVC(kernel='rbf', gamma='scale', C=1.6096091371852976)),
])

# Train on the cleaned training text and its encoded sentiment labels.
svm_pipeline.fit(df['text'], y=df['sentiment_label'])

In [None]:
# Predict on the test text and print the per-class metrics for the SVM.
y_pred = svm_pipeline.predict(test_df['text'])
print(
    classification_report(
        y_true=test_df['sentiment_label'],
        y_pred=y_pred,
        target_names=train_label_encoder.classes_,
    )
)

In [None]:
# Confusion matrix of the SVM predictions, labelled with the class names.
cm = confusion_matrix(y_true=test_df['sentiment_label'], y_pred=y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=train_label_encoder.classes_)
disp.plot()

In [None]:
# Make sure the SVM output directory exists before writing to it.
# NOTE(review): this path is under "bert_sentiment_model" while the notebook
# also creates "ml_sentiment_model" earlier -- confirm the intended location.
os.makedirs(name="/kaggle/working/bert_sentiment_model/svm", exist_ok=True)

# Persist the whole fitted pipeline (vectorizer + classifier) with joblib.
joblib.dump(value=svm_pipeline, filename='/kaggle/working/bert_sentiment_model/svm/svm.joblib')

# Random Forest

In [None]:
# TF-IDF features feeding a random forest of 120 trees.
# random_state is pinned so that repeated runs build the same forest; without
# it the reported metrics and the saved model change from run to run.
rf_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("rf", RandomForestClassifier(n_estimators=120,
                                 min_samples_split=9,
                                 min_samples_leaf=1,
                                 max_features='sqrt',
                                 random_state=42))
])

# Train on the cleaned training text and its encoded sentiment labels.
rf_pipeline.fit(df['text'], df['sentiment_label'])

In [None]:
# Predict on the test text and print the per-class metrics for the random forest.
y_pred = rf_pipeline.predict(test_df['text'])
print(
    classification_report(
        y_true=test_df['sentiment_label'],
        y_pred=y_pred,
        target_names=train_label_encoder.classes_,
    )
)

In [None]:
# Confusion matrix of the random forest predictions, labelled with the class names.
cm = confusion_matrix(y_true=test_df['sentiment_label'], y_pred=y_pred)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=train_label_encoder.classes_,
)
disp.plot()

In [None]:
# Make sure the random forest output directory exists before writing to it.
os.makedirs(name="/kaggle/working/bert_sentiment_model/rf", exist_ok=True)

# Persist the whole fitted pipeline (vectorizer + classifier) with joblib.
joblib.dump(value=rf_pipeline, filename='/kaggle/working/bert_sentiment_model/rf/rf.joblib')

# Regresión Logística

In [None]:
# TF-IDF features feeding an L2-penalised logistic regression trained with
# the saga solver (at most 958 iterations).
# saga shuffles the data, so random_state is pinned to make the fitted
# coefficients and the reported metrics reproducible across runs.
lr_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("lr", LogisticRegression(
                            C=1.2990598194518517,
                            penalty='l2',
                            solver='saga',
                            max_iter=958,
                            random_state=42,))
])

# Train on the cleaned training text and its encoded sentiment labels.
lr_pipeline.fit(df['text'], df['sentiment_label'])

In [None]:
# Predict on the test text and print the per-class metrics for logistic regression.
y_pred = lr_pipeline.predict(test_df['text'])
print(
    classification_report(
        y_true=test_df['sentiment_label'],
        y_pred=y_pred,
        target_names=train_label_encoder.classes_,
    )
)

In [None]:
# Confusion matrix of the logistic regression predictions, labelled with the class names.
cm = confusion_matrix(y_true=test_df['sentiment_label'], y_pred=y_pred)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=train_label_encoder.classes_,
)
disp.plot()

In [None]:
# Make sure the logistic regression output directory exists before writing to it.
os.makedirs(name="/kaggle/working/bert_sentiment_model/lr", exist_ok=True)

# Persist the whole fitted pipeline (vectorizer + classifier) with joblib.
joblib.dump(value=lr_pipeline, filename='/kaggle/working/bert_sentiment_model/lr/lr.joblib')

# KNN

In [None]:
# TF-IDF features feeding a k-nearest-neighbours classifier
# (49 neighbours, distance-weighted votes, p=2).
knn_pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("knn", KNeighborsClassifier(n_neighbors=49, weights='distance',
                                 algorithm='auto', leaf_size=45, p=2)),
])
# Train on the cleaned training text and its encoded sentiment labels.
knn_pipeline.fit(df['text'], y=df['sentiment_label'])

In [None]:
# Predict on the test text and print the per-class metrics for KNN.
y_pred = knn_pipeline.predict(test_df['text'])
print(
    classification_report(
        y_true=test_df['sentiment_label'],
        y_pred=y_pred,
        target_names=train_label_encoder.classes_,
    )
)

In [None]:
# Confusion matrix of the KNN predictions, labelled with the class names.
cm = confusion_matrix(y_true=test_df['sentiment_label'], y_pred=y_pred)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=train_label_encoder.classes_,
)
disp.plot()

In [None]:
# Make sure the KNN output directory exists before writing to it.
os.makedirs(name="/kaggle/working/bert_sentiment_model/knn", exist_ok=True)

# Persist the whole fitted pipeline (vectorizer + classifier) with joblib.
joblib.dump(value=knn_pipeline, filename='/kaggle/working/bert_sentiment_model/knn/knn.joblib')

# Naive Bayes

In [None]:
# TF-IDF features feeding a multinomial naive Bayes classifier
# (smoothing alpha ~1.03, fit_prior disabled).
nb_pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("NB", MultinomialNB(fit_prior=False, alpha=1.02832361139844)),
])
# Train on the cleaned training text and its encoded sentiment labels.
nb_pipeline.fit(df['text'], y=df['sentiment_label'])

In [None]:
# Predict on the test text and print the per-class metrics for naive Bayes.
y_pred = nb_pipeline.predict(test_df['text'])
print(
    classification_report(
        y_true=test_df['sentiment_label'],
        y_pred=y_pred,
        target_names=train_label_encoder.classes_,
    )
)

In [None]:
# Confusion matrix of the naive Bayes predictions, labelled with the class names.
cm = confusion_matrix(y_true=test_df['sentiment_label'], y_pred=y_pred)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=train_label_encoder.classes_,
)
disp.plot()

In [None]:
# Make sure the naive Bayes output directory exists before writing to it.
os.makedirs(name="/kaggle/working/bert_sentiment_model/nb", exist_ok=True)

# Persist the whole fitted pipeline (vectorizer + classifier) with joblib.
joblib.dump(value=nb_pipeline, filename='/kaggle/working/bert_sentiment_model/nb/nb.joblib')