## Procesamiento de Datos

In [1]:
import time
start=time.time()

In [2]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.stem import WordNetLemmatizer 
import codecs
import pandas as pd
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#Algoritmos de clasificación
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

In [4]:
#Preparación y evaluación de los modelos
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
#visualización de datos
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *
import plotly.io as pio
pio.renderers.default = "colab"
import seaborn as sb
import matplotlib.pyplot as plt

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/MAESTRÍA/TETRAMESTRE 4/PROCESAMIENTO Y CLASIFICACIÓN/MINIPROYECTO 1/BDSuicidios3.csv")
df.head()

Unnamed: 0,Text,Clasificación
0,Ex Wife Threatening SuicideRecently I left my ...,Suicidio
1,i need helpjust help me im crying so hard,Suicidio
2,Honetly idkI dont know what im even doing here...,Suicidio
3,My life is over at 20 years oldHello all. I am...,Suicidio
4,Can you imagine getting old? Me neither.Wrinkl...,Suicidio


In [8]:
df_counts= pd.DataFrame(df[['Clasificación']].value_counts()).reset_index().rename(columns={0:'count','Clasificación':'class'})
df_counts

Unnamed: 0,class,count
0,No Suicidio,1000
1,Suicidio,1000


In [9]:
px.pie(data_frame=df_counts,names='class',values='count',hole=.5,color_discrete_sequence=['steelblue','lightgray'])

In [10]:
df.shape

(2000, 2)

In [11]:
import re

# Define a function to clean the text
def clean(text):
    # Removes all special characters and numericals leaving the alphabets
    text = re.sub('[^A-Za-z]+', ' ', text) 
    return text

# Cleaning the text in the review column
df['Cleaned Posts'] = df['Text'].apply(clean)
df.head()

Unnamed: 0,Text,Clasificación,Cleaned Posts
0,Ex Wife Threatening SuicideRecently I left my ...,Suicidio,Ex Wife Threatening SuicideRecently I left my ...
1,i need helpjust help me im crying so hard,Suicidio,i need helpjust help me im crying so hard
2,Honetly idkI dont know what im even doing here...,Suicidio,Honetly idkI dont know what im even doing here...
3,My life is over at 20 years oldHello all. I am...,Suicidio,My life is over at years oldHello all I am a y...
4,Can you imagine getting old? Me neither.Wrinkl...,Suicidio,Can you imagine getting old Me neither Wrinkle...


In [12]:
features = df.iloc[:, 0].values
labels = df.iloc[:, 1].values

In [13]:
processed_features = []

for sentence in range(0, len(features)):
    # Remove all the special characters
    processed_feature = re.sub(r'\W', ' ', str(features[sentence]))
    # remove all single characters
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
    # Remove single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 
    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)
    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)
    # Converting to Lowercase
    processed_feature = processed_feature.lower()

    processed_features.append(processed_feature)

In [14]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
processed_features = vectorizer.fit_transform(processed_features).toarray()

## Clasificación de Texto

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.2, random_state=123)

In [16]:
def build_model(model,params,x_train,x_test,y_train,y_test):
  a = time.time()

  #Se realiza una búsqueda en malla con los parámetros indicados para obtener la mejor combinación
  training = GridSearchCV(estimator = model, 
                               param_grid = params, 
                               cv = 3, 
                               return_train_score=True).fit(X_train,y_train)

  #Se obtienen los mejores parámetros                              
  best_params = training.best_params_
  cv_results = pd.DataFrame(training.cv_results_)

  #Se obtienen los resultados del cv
  mean_test_score = cv_results[cv_results['params']==best_params]['mean_test_score']
  std_test_score = cv_results[cv_results['params']==best_params]['std_test_score']

  #Se hacen las predicciones
  pred = training.predict(X_test)

  #Se obtienen las métricas de evaluación
  acc_score = metrics.accuracy_score(y_test, pred)
  class_report = metrics.classification_report(y_test, pred)
  conf_matr = metrics.confusion_matrix(y_test, pred)

  print('================================================================================')
  print('Model:',model)
  print('--------------------------------------------------------------------------------')
  print('Best Params:')
  print(best_params)
  print('--------------------------------------------------------------------------------')
  print('Accuracy Score:',acc_score)
  print('--------------------------------------------------------------------------------')
  print('Classification Report:')
  print(class_report)
  print('--------------------------------------------------------------------------------')
  print('Confusion Matrix:')
  print(conf_matr)
  b = time.time()
  return pred,best_params,mean_test_score,std_test_score,acc_score,class_report,conf_matr,b-a

### Clasificación de texto con Random Forest

In [17]:
#Random Forest Classifier
RFC = RandomForestClassifier()
RFC_params = {'n_estimators':[10,25,50,100,200],
              'max_features':['auto', 'sqrt'],
              'random_state':[123]}
RFC_model = build_model(RFC,RFC_params,X_train,X_test,y_train,y_test)

Model: RandomForestClassifier()
--------------------------------------------------------------------------------
Best Params:
{'max_features': 'auto', 'n_estimators': 200, 'random_state': 123}
--------------------------------------------------------------------------------
Accuracy Score: 0.8725
--------------------------------------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

 No Suicidio       0.85      0.89      0.87       193
    Suicidio       0.89      0.86      0.87       207

    accuracy                           0.87       400
   macro avg       0.87      0.87      0.87       400
weighted avg       0.87      0.87      0.87       400

--------------------------------------------------------------------------------
Confusion Matrix:
[[172  21]
 [ 30 177]]


### Clasificación de texto con KNeighbors

In [18]:
#K Nearest Neighbors
KNN = KNeighborsClassifier()
KNN_params = {'metric':['euclidean', 'manhattan', 'minkowski'],
              'weights':['uniform', 'distance'],
              'n_neighbors':[2,4,6,8,10]}
KNN_model = build_model(KNN,KNN_params,X_train,X_test,y_train,y_test)

Model: KNeighborsClassifier()
--------------------------------------------------------------------------------
Best Params:
{'metric': 'euclidean', 'n_neighbors': 10, 'weights': 'distance'}
--------------------------------------------------------------------------------
Accuracy Score: 0.7825
--------------------------------------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

 No Suicidio       0.69      0.98      0.81       193
    Suicidio       0.98      0.59      0.74       207

    accuracy                           0.78       400
   macro avg       0.83      0.79      0.78       400
weighted avg       0.84      0.78      0.77       400

--------------------------------------------------------------------------------
Confusion Matrix:
[[190   3]
 [ 84 123]]


### Clasificación de texto con Naive Bayes

In [19]:
#Multinomial Naive Bayes Classifier
MNB = MultinomialNB()
MNB_params = {'alpha':[.01, .1, .5, 1, 10]}
MNB_model = build_model(MNB,MNB_params,X_train,X_test,y_train,y_test)

Model: MultinomialNB()
--------------------------------------------------------------------------------
Best Params:
{'alpha': 0.5}
--------------------------------------------------------------------------------
Accuracy Score: 0.8525
--------------------------------------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

 No Suicidio       0.92      0.76      0.83       193
    Suicidio       0.81      0.94      0.87       207

    accuracy                           0.85       400
   macro avg       0.86      0.85      0.85       400
weighted avg       0.86      0.85      0.85       400

--------------------------------------------------------------------------------
Confusion Matrix:
[[146  47]
 [ 12 195]]


### Clasificación de texto con Support Vector Machine

In [20]:
#Support Vector Machines
SVM = SVC()
SVM_params = {'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
               'C': [0.1, 1, 100, 1000],
              'random_state':[123]}
SVM_model = build_model(SVM,SVM_params,X_train,X_test,y_train,y_test)

Model: SVC()
--------------------------------------------------------------------------------
Best Params:
{'C': 1, 'kernel': 'sigmoid', 'random_state': 123}
--------------------------------------------------------------------------------
Accuracy Score: 0.89
--------------------------------------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

 No Suicidio       0.85      0.93      0.89       193
    Suicidio       0.93      0.85      0.89       207

    accuracy                           0.89       400
   macro avg       0.89      0.89      0.89       400
weighted avg       0.89      0.89      0.89       400

--------------------------------------------------------------------------------
Confusion Matrix:
[[180  13]
 [ 31 176]]


### Clasificación de texto con Stochastic Gradient Descent

In [21]:
#Stochastic Gradient Descent Classifier
SGD = SGDClassifier()
SGD_params = {'penalty':['l2','l1'],
               'max_iter': [50,100,1000],
              'random_state':[123]}
SGD_model = build_model(SGD,SGD_params,X_train,X_test,y_train,y_test)

Model: SGDClassifier()
--------------------------------------------------------------------------------
Best Params:
{'max_iter': 50, 'penalty': 'l1', 'random_state': 123}
--------------------------------------------------------------------------------
Accuracy Score: 0.8675
--------------------------------------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

 No Suicidio       0.83      0.91      0.87       193
    Suicidio       0.91      0.83      0.87       207

    accuracy                           0.87       400
   macro avg       0.87      0.87      0.87       400
weighted avg       0.87      0.87      0.87       400

--------------------------------------------------------------------------------
Confusion Matrix:
[[175  18]
 [ 35 172]]


## Comparativo de modelos

In [22]:
resultados = pd.DataFrame({'model':['Random Forest','SGD','SVM','KNN','MultiNB'],
                          'accuracy':[RFC_model[4],SGD_model[4],SVM_model[4],KNN_model[4],MNB_model[4]],
                           'mean_accuracy':[RFC_model[2].values[0],SGD_model[2].values[0],SVM_model[2].values[0],KNN_model[2].values[0],MNB_model[2].values[0]],
                           'std_accuracy':[RFC_model[3].values[0],SGD_model[3].values[0],SVM_model[3].values[0],KNN_model[3].values[0],MNB_model[3].values[0]],
                           'exec_time':[RFC_model[7],SGD_model[7],SVM_model[7],KNN_model[7],MNB_model[7]]})
resultados

Unnamed: 0,model,accuracy,mean_accuracy,std_accuracy,exec_time
0,Random Forest,0.8725,0.864996,0.009367,37.611488
1,SGD,0.8675,0.860626,0.003157,3.869216
2,SVM,0.89,0.885,0.013032,113.941996
3,KNN,0.7825,0.81562,0.005954,113.508094
4,MultiNB,0.8525,0.853755,0.014725,0.651078


In [23]:
px.scatter(resultados,x='exec_time',y='mean_accuracy',size='std_accuracy',color='model',opacity=0.7)

In [24]:
px.bar(resultados,x='accuracy',color='model',opacity=0.7)

In [25]:
end=time.time()
print("Tiempo de ejecución:",end-start,'segundos')

Tiempo de ejecución: 275.64686012268066 segundos
