In [2]:
!pip install --quiet xgboost tqdm

In [1]:
# Saving objects
import joblib
# Data handling & Exploratory Data Analysis (EDA):
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
# Metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [2]:
# Load the already-preprocessed dataset and report how many rows were read.
df = pd.read_csv('../data/cyberbullying_preprocessed.csv')
print('Número de datos cargados: {num}'.format(num=df.shape[0]))

Número de datos cargados: 80909


In [3]:
# Separate the frame into the model input and the target:
#     X: the text messages ('text_preprocessed' column)
#     y: the labels ('label' column)
X, y = df['text_preprocessed'], df['label']

# Preview the first rows of both series side by side.
preview = pd.DataFrame({'X': X, 'y': y})
preview.head()

Unnamed: 0,X,y
0,word food crapilicious,0
1,white,0
2,classy whore red velvet cupcake,0
3,meh thank head concern angry dude twitter,0
4,isis account pretend kurdish account like isla...,0


# Dividimos los datos en dos conjuntos: entrenamiento y test
- El conjunto de entrenamiento se utiliza para ajustar el modelo
- El conjunto de test se utiliza para evaluar el modelo
- La proporción de los datos que se utilizan para el test es del 20%
- La semilla aleatoria se fija en 1 (`random_state=1`) para que los resultados sean reproducibles
- Se imprime el número de mensajes que se utilizarán para el entrenamiento y el test

In [4]:
# Hold out 20% of the rows for evaluation. random_state=1 makes the split
# repeatable and stratify=y splits in a stratified fashion using the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

# Report the size of each partition.
for caption, texts in (
    ('Número de Mensajes para el entrenamiento: {num}', X_train),
    ('Número de Mensajes para el test: {num}', X_test),
):
    print(caption.format(num=texts.shape[0]))

# Side-by-side preview of the first five items of every partition.
preview_columns = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
}
pd.DataFrame(
    {name: values[:5].tolist() for name, values in preview_columns.items()}
).head()

Número de Mensajes para el entrenamiento: 64727
Número de Mensajes para el test: 16182


Unnamed: 0,X_train,y_train,X_test,y_test
0,love zombie movie someday yes model little bit...,0,human race stop gay retard rape insult joke pl...,1
1,remove liar racist confidence learn racism kno...,1,woman lack require woman witness replace man,1
2,cheap meat black eat lip black canadian whore ...,1,definantly,0
3,damn evil minute ipa beckon fridge right,1,oooh sucks ibuprofin,1
4,like patti labelle sing go arsenio damn song s...,0,matter,0


# Modelos de Machine Learning para entrenar.
- Regresión Logística
- K-Nearest Neighbors
- Decision Tree
- Support Vector Machine (Linear Kernel)
- Support Vector Machine (RBF Kernel)
- Neural Network
- Random Forest
- Gradient Boosting
- XGBoost


In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def evaluate(actual, predicted, model_name):
    """Score a set of predictions and bundle the metrics with the model's name.

    Args:
        actual: Ground-truth labels.
        predicted: Labels predicted by the model, aligned with ``actual``.
        model_name: Name stored under the ``'Model'`` key of the result.

    Returns:
        dict: ``'Model'``, ``'Precision'``, ``'Recall'``, ``'F1'`` and
        ``'Accuracy'``, in that order. Precision, recall and F1 are computed
        with ``pos_label=1``.
    """
    # Metrics that are reported for the positive class (label 1).
    positive_class_metrics = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )

    summary = {'Model': model_name}
    for metric_name, metric_fn in positive_class_metrics:
        summary[metric_name] = metric_fn(actual, predicted, pos_label=1)
    summary['Accuracy'] = accuracy_score(actual, predicted)
    return summary


# Candidate classifiers keyed by display name. The name is used as the 'Model'
# value in the results table and, lower-cased with spaces replaced by
# underscores, as the file name of the saved pipeline.
# Only the uncommented entries are trained by the training loop. The disabled
# ones were presumably trained in an earlier run and their scores kept in
# ../data/models_results.csv -- TODO confirm before deleting them.
models = {
    # 'Logistic Regression': LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000),
    # 'K-Nearest Neighbors': KNeighborsClassifier(),
    # 'Decision Tree': DecisionTreeClassifier(),
    # probability=True is required for predict_proba; random_state pins the
    # data shuffling scikit-learn performs for those probability estimates, so
    # the reported scores are repeatable between runs.
    'Support Vector Machine (Linear Kernel)': SVC(kernel='linear', probability=True, random_state=1),
    'Support Vector Machine (RBF Kernel)': SVC(kernel='rbf', probability=True, random_state=1),
    # 'Random Forest': RandomForestClassifier(random_state=42),
    # 'Gradient Boosting': GradientBoostingClassifier(),
    # 'XGBoost': XGBClassifier(),
    # 'Neural Network': MLPClassifier(hidden_layer_sizes=(150,100,50), activation='relu', solver='adam', max_iter=300, random_state=1),
}

In [11]:
from pathlib import Path

from tqdm import tqdm

# Column layout of the metrics table when no previous results exist.
RESULT_COLUMNS = ['Model', 'Precision', 'Recall', 'F1', 'Accuracy']
RESULTS_PATH = Path('../data/models_results.csv')
MODELS_DIR = Path('../models')

# Create the output directory before any training starts, so a missing folder
# cannot make joblib.dump fail only after a long fit has finished.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Start from the metrics saved by earlier runs, if there are any.
try:
    previous_results = pd.read_csv(RESULTS_PATH)
except FileNotFoundError:
    previous_results = pd.DataFrame(columns=RESULT_COLUMNS)

# One record per model name, in the order of the saved table. Assigning to an
# existing key keeps its position, so re-trained models are updated in place
# and new models are added at the end. (If the CSV ever held duplicate model
# names, only the last row for each name is kept.)
results_by_model = {row['Model']: row for row in previous_results.to_dict('records')}

for model_name, model in tqdm(models.items(), desc="Entrenando modelos", unit="modelo"):
    # 1. Build a pipeline that vectorises the text and then trains the model.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('classify', model)
    ])

    # 2. Fit on the training partition.
    pipeline.fit(X_train, y_train)

    # 3. Predict the held-out test partition.
    y_pred = pipeline.predict(X_test)

    # 4. Score the predictions.
    result = evaluate(y_test, y_pred, model_name)

    # 5. Record the metrics; any other columns of an existing row are preserved.
    results_by_model[model_name] = {**results_by_model.get(model_name, {}), **result}

    # 6. Persist the fitted pipeline.
    joblib.dump(pipeline, MODELS_DIR / f'{model_name.lower().replace(" ", "_")}.pkl')

# Build the table once from the collected records instead of growing a
# DataFrame row by row through the private DataFrame._append method.
if results_by_model:
    models_results = pd.DataFrame(list(results_by_model.values()))
else:
    models_results = pd.DataFrame(columns=RESULT_COLUMNS)

# Save the metrics table so later runs can pick it up.
models_results.to_csv(RESULTS_PATH, index=False)
print('Modelos entrenados y guardados.')

Entrenando modelos: 100%|██████████| 2/2 [1:00:05<00:00, 1802.73s/modelo]

Modelos entrenados y guardados.





> The F1-Score is a metric that considers both precision and recall, making it a good overall performance measure, especially in cases where there is an imbalance between classes.
https://penscola.medium.com/end-to-end-machine-learning-project-using-fastapi-b81e2fe150d3

In [14]:
# Rank the models best-first by F1. The sorted copy is rebound to the name
# rather than sorting with inplace=True, so the cell produces its result from
# its input without mutating the frame built by the training cell.
models_results = models_results.sort_values(by='F1', ascending=False, ignore_index=True)
models_results

Unnamed: 0,Model,Precision,Recall,F1,Accuracy
0,Random Forest,0.839313,0.826818,0.833019,0.797738
1,Support Vector Machine (RBF Kernel),0.848239,0.804942,0.826024,0.793103
2,Neural Network,0.80193,0.841807,0.821384,0.776604
3,Logistic Regression,0.839142,0.792485,0.815147,0.780682
4,Support Vector Machine (Linear Kernel),0.853941,0.775673,0.812928,0.782165
5,Decision Tree,0.825427,0.798258,0.811615,0.773885
6,XGBoost,0.908856,0.712984,0.799092,0.781238
7,Gradient Boosting,0.912814,0.683917,0.781959,0.767272
8,K-Nearest Neighbors,0.773912,0.700628,0.735449,0.692436


In [15]:
# Evaluamos los modelos con un mensaje de prueba
predictions_df = pd.DataFrame(columns=['Model', 'Prediction', 'Score'])
for model_name in models.keys():
    model = joblib.load(f'../models/{model_name.lower().replace(" ", "_")}.pkl')
    prediction = model.predict(['I hate you, nigga'])
    prediction = 'Bullying' if prediction[0] == 1 else 'Not Bullying'
    score = model.predict_proba(['I hate you'])[0][1]
    # Indexamos el resultado en un Dataframe para visualizarlo al finalizar el ciclo
    predictions_df = predictions_df._append({'Model': model_name, 'Prediction': prediction, 'Score': score}, ignore_index=True)
predictions_df

  predictions_df = predictions_df._append({'Model': model_name, 'Prediction': prediction, 'Score': score}, ignore_index=True)


Unnamed: 0,Model,Prediction,Score
0,Support Vector Machine (Linear Kernel),Bullying,0.236725
1,Support Vector Machine (RBF Kernel),Bullying,0.288884
