In [274]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import GaussianNB

from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt



## SOBRA


In [275]:
sport_df = pd.read_excel('glossary/deportes.xlsx', names = ['word', 'rank'] , usecols = [0,1])
sport_df['class'] = 0

health_df = pd.read_excel('glossary/salud.xlsx', names = ['word', 'rank'] , usecols = [0,1])
health_df['class'] = 1

politics_df = pd.read_excel('glossary/politica.xlsx', names = ['word', 'rank'] , usecols = [0,1])
politics_df['class'] = 2

In [276]:
glossariy_df = pd.concat([sport_df, health_df, politics_df])
glossariy_df = glossariy_df.drop_duplicates(subset='word', keep = False)
glossariy_df.reset_index(drop=True, inplace=True)

In [277]:
df_list = [glossariy_df[glossariy_df['class'] == 0][:100], glossariy_df[glossariy_df['class'] == 1][:100], glossariy_df[glossariy_df['class'] == 2][:100]]

glossariy_df = pd.concat(df_list, ignore_index=True)

## --------------------------------------------------

In [278]:
train_df = pd.read_excel('./corpus/train.xlsx', usecols=[1,2])
test_df = pd.read_excel('./corpus/test.xlsx', usecols=[1,2])

1.- Poner textos de train y test procesados sin stop words en un csv anotando al aldo su clase, divididos
2.- Glosario solo con palabras y sus respectivas clses


Multinomial – It is used for Discrete Counts. The one we described in the example above is an example of Multinomial Type Naïve Bayes.
https://hands-on.cloud/implementing-naive-bayes-classification-using-python/

In [279]:
train_df

Unnamed: 0,document,class
0,san josé oct efe presidente costa rica rodrigo...,2
1,agencia española medicamentos productos sanita...,1
2,ministro japón naoto kan influyente parlamenta...,2
3,dolor síntoma señal alertaba enfermedad acerca...,1
4,erróneo antibióticos problemas graves enfrenta...,1
...,...,...
85,carrera salvaje caeleb dressel confirmó rey pi...,0
86,kim jongun recuperó atención desarrollo arsena...,2
87,australia puertas éxito finales copa davis cae...,0
88,líder norcoreano kim jongun decidido acelerar ...,2


In [280]:
myvocabulary = list(glossariy_df['word'])

In [281]:
myvocabulary

['equipos',
 'juegos',
 'arco',
 'tiro',
 'federación',
 'balón',
 'oro',
 'set',
 'brown',
 'serbia',
 'individual',
 'gol',
 'raptors',
 'alvariño',
 'asistencias',
 'tokio',
 'bull',
 'quinto',
 'plata',
 'kyrgios',
 'alcaraz',
 'clasificación',
 'siakam',
 'mclaren',
 'audi',
 'open',
 'semifinales',
 'semifinal',
 'rebotes',
 'olímpicos',
 'an',
 'parís',
 'olímpico',
 'bronce',
 'campeonato',
 'jugadores',
 'podría ser',
 'rfeta',
 'juegos olímpicos',
 'ganó',
 'acabó',
 'afición',
 'podio',
 'miguel',
 'piscina',
 'medallas',
 'título',
 'eurobasket',
 'competición',
 'medalla',
 'estilos',
 'tiros',
 'lesión',
 'dressel',
 'cuarto',
 'plazas',
 'lewis',
 'australia',
 'us open',
 'us',
 'izquierdo',
 'victorias',
 'circuito',
 'miguel alvariño',
 'milak',
 'celtics',
 'albacete',
 'mirandés',
 'perdió',
 'campeón',
 'tenista',
 'contenta',
 'nurse',
 'ronda',
 'actitud',
 'garnacho',
 'nadal',
 'entrenador',
 'nadador',
 'montes',
 'raducanu',
 'torneos',
 'anteriores',
 'copa 

In [282]:
train_df.iloc[:,1]

0     2
1     1
2     2
3     1
4     1
     ..
85    0
86    2
87    0
88    2
89    0
Name: class, Length: 90, dtype: int64

## Generacion de la matriz TF-IDF a partir del glosario extraído

In [283]:
vectorizer = TfidfVectorizer(vocabulary = myvocabulary)
x_train = vectorizer.fit_transform(train_df.iloc[:,0])
x_train = x_train.toarray()

x_test  = vectorizer.transform(test_df.iloc[:,0])
x_test = x_test.toarray()

y_train = train_df.iloc[:,1]
y_test  = test_df.iloc[:,1]


In [284]:
df_x_train = pd.DataFrame(x_train)
df_x_train.columns = myvocabulary
y_test

0     0
1     1
2     2
3     0
4     1
     ..
85    0
86    1
87    1
88    0
89    2
Name: class, Length: 90, dtype: int64

In [285]:
df_y_train = train_df.iloc[:,1]
df_y_train

0     2
1     1
2     2
3     1
4     1
     ..
85    0
86    2
87    0
88    2
89    0
Name: class, Length: 90, dtype: int64

In [286]:
target_names = ['Deportes', 'Salud', 'Politica']

#### Entrenamiento del modelo y obtencion de la precisión del mismo

In [287]:
def try_model(model, x_train, y_train, x_test, y_test, model_name):
    model.fit(x_train, y_train)
    predicciones = model.predict(x_test)
    print(accuracy_score(predicciones, y_test))
    print(predicciones)
    predicciones_prob = get_prediction_metrics(model, model_name, x_test) 
    get_report(predicciones, target_names, y_test, model_name)
    #get_confussion_matrix(model, y_test, predicciones)
    return predicciones, predicciones_prob
  

In [288]:
def get_prediction_metrics(model, model_name, x_test):
  predicciones_prob = model.predict_proba(x_test)
  print(predicciones_prob)
  predicciones_rounded = [np.round(x,2) for x in predicciones_prob]

  df_predicciones = pd.DataFrame(predicciones_rounded)
  df_predicciones.columns=['Deportes', 'Salud', 'Politica']
  df_predicciones.index.name = 'Documento'
  df_predicciones.to_excel(f"./resultados/Naive Bayes/predicciones-{model_name}.xlsx")
  return predicciones_prob


#### Obtención de informe sobre los reusltados de la clasificación

In [289]:
def get_report(predicciones, target_names, y_test, model_name):
  informe = classification_report(y_test, predicciones, target_names=target_names, digits=3)
  print(informe)


  informe = classification_report(y_test, predicciones, target_names=target_names, digits=3, output_dict=True)
  df_informe = pd.DataFrame(informe).transpose()

  df_informe.to_excel(f"./resultados/Naive Bayes/informe--{model_name}.xlsx", index=True)

  print(df_informe)

#### Obtención de la matriz de confusion del modelo


In [290]:
def get_confussion_matrix(model, y_test, predicciones):
  cm = confusion_matrix(y_test, predicciones, labels=model.classes_)
  disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
  disp.plot()
  plt.show()


In [291]:
def predictions_to_excel(predicciones, prob_predicciones):
    df_predicciones = pd.DataFrame()
    df_predicciones["predicted_class"] = []
    df_predicciones["probability"] = []
    
    for i in range(len(predicciones)):
        df_predicciones.loc[len(df_predicciones)] = [predicciones[i], prob_predicciones[i][predicciones[i]]]
    return df_predicciones
    

### Multinomial Naive Bayes

In [292]:
multinomial_naive_bayes = MultinomialNB()
predicciones,prob_predicciones = try_model(multinomial_naive_bayes, x_train, y_train, x_test, y_test,'multinomial')

df_predicciones = predictions_to_excel(predicciones,prob_predicciones)
print(y_test)
print(predicciones)
print(df_predicciones)

0.9666666666666667
[0 1 2 0 1 2 0 0 1 0 1 2 0 2 1 0 2 0 2 0 1 0 0 1 0 0 2 0 0 2 2 2 2 2 1 1 1
 0 1 0 2 2 1 0 2 0 0 0 0 2 1 0 1 2 2 1 0 0 1 1 2 0 2 1 1 1 1 0 1 2 0 2 1 2
 1 2 2 1 1 2 1 1 1 1 0 0 1 1 0 2]
[[0.64065205 0.1806584  0.17868955]
 [0.2259859  0.56770513 0.20630897]
 [0.1738096  0.22721534 0.59897506]
 [0.7568656  0.11738275 0.12575165]
 [0.12380309 0.75402941 0.1221675 ]
 [0.21423952 0.22171436 0.56404612]
 [0.79092991 0.10462005 0.10445004]
 [0.57467917 0.24830853 0.1770123 ]
 [0.19282324 0.49726445 0.3099123 ]
 [0.78913535 0.10108455 0.1097801 ]
 [0.14643343 0.71498299 0.13858358]
 [0.19895084 0.2006395  0.60040966]
 [0.67961513 0.15901385 0.16137101]
 [0.20405945 0.20935603 0.58658452]
 [0.23061515 0.52718156 0.2422033 ]
 [0.53237697 0.2450365  0.22258653]
 [0.19121797 0.22494283 0.5838392 ]
 [0.71295824 0.14362852 0.14341324]
 [0.13248916 0.17037964 0.6971312 ]
 [0.74347701 0.12969763 0.12682536]
 [0.16518409 0.6527458  0.18207012]
 [0.68723609 0.15648003 0.15628388]
 [0.7

In [293]:
test_df =  pd.read_excel('./corpus/test.xlsx', usecols=[1,2,3])
df_predcciones_total = pd.concat([test_df,df_predicciones],axis=1).ffill()                             
df_predcciones_total.to_excel("./corpus/test-predicciones.xlsx")

### Gaussian Naive Bayes

In [294]:

gaussian_naive_bayes = GaussianNB()
try_model(gaussian_naive_bayes, x_train, y_train, x_test, y_test, 'gaussiano')

0.9444444444444444
[0 1 2 0 1 2 0 0 1 0 1 2 0 2 1 0 2 0 2 0 1 0 0 1 0 0 2 0 2 2 2 2 2 2 1 1 1
 0 1 0 2 2 1 0 2 0 0 0 0 2 1 0 1 2 2 1 0 0 1 1 2 0 2 1 1 1 1 0 1 2 0 2 1 2
 1 2 2 1 1 2 1 1 0 1 0 0 1 1 0 2]
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 

(array([0, 1, 2, 0, 1, 2, 0, 0, 1, 0, 1, 2, 0, 2, 1, 0, 2, 0, 2, 0, 1, 0,
        0, 1, 0, 0, 2, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 0, 1, 0, 2, 2, 1, 0,
        2, 0, 0, 0, 0, 2, 1, 0, 1, 2, 2, 1, 0, 0, 1, 1, 2, 0, 2, 1, 1, 1,
        1, 0, 1, 2, 0, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1,
        0, 2], dtype=int64),
 array([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 1., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0.

### Brenouilli Naive Bayes

In [295]:

bernouilli_naive_bayes = BernoulliNB()
try_model(bernouilli_naive_bayes, x_train, y_train, x_test, y_test,  'brenoulli')

0.9333333333333333
[0 1 2 0 1 2 0 1 1 0 1 2 0 2 1 1 2 0 2 0 1 0 0 1 0 0 2 0 2 2 2 2 2 2 1 1 1
 1 1 0 2 2 1 0 2 0 0 0 0 2 1 0 1 2 2 1 0 0 1 1 2 0 2 1 1 1 1 0 1 2 0 2 1 2
 1 2 2 1 1 2 1 1 1 1 0 0 1 1 0 2]
[[9.99996308e-01 2.50350572e-06 1.18830869e-06]
 [1.22287876e-03 9.98107666e-01 6.69455650e-04]
 [8.04177500e-08 3.45516883e-05 9.99965368e-01]
 [9.99997201e-01 6.75178478e-07 2.12400072e-06]
 [3.45720688e-08 9.99999743e-01 2.22546187e-07]
 [3.08035586e-03 8.95175560e-02 9.07402088e-01]
 [9.99999998e-01 1.23525316e-09 1.21173343e-09]
 [3.33107734e-01 6.60046046e-01 6.84621959e-03]
 [9.19055435e-07 7.20873539e-01 2.79125542e-01]
 [1.00000000e+00 3.12623954e-12 1.76052138e-11]
 [6.19233674e-08 9.99999829e-01 1.08712110e-07]
 [2.47736456e-07 1.04765689e-06 9.99998705e-01]
 [9.99999956e-01 1.43421880e-08 3.01480863e-08]
 [5.71926354e-06 8.04223945e-05 9.99913858e-01]
 [1.31101563e-04 9.96131527e-01 3.73737189e-03]
 [1.19344348e-01 7.91235176e-01 8.94204765e-02]
 [2.55093576e-07 2.37734273e-

(array([0, 1, 2, 0, 1, 2, 0, 1, 1, 0, 1, 2, 0, 2, 1, 1, 2, 0, 2, 0, 1, 0,
        0, 1, 0, 0, 2, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 2, 2, 1, 0,
        2, 0, 0, 0, 0, 2, 1, 0, 1, 2, 2, 1, 0, 0, 1, 1, 2, 0, 2, 1, 1, 1,
        1, 0, 1, 2, 0, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 0, 0, 1, 1,
        0, 2], dtype=int64),
 array([[9.99996308e-01, 2.50350572e-06, 1.18830869e-06],
        [1.22287876e-03, 9.98107666e-01, 6.69455650e-04],
        [8.04177500e-08, 3.45516883e-05, 9.99965368e-01],
        [9.99997201e-01, 6.75178478e-07, 2.12400072e-06],
        [3.45720688e-08, 9.99999743e-01, 2.22546187e-07],
        [3.08035586e-03, 8.95175560e-02, 9.07402088e-01],
        [9.99999998e-01, 1.23525316e-09, 1.21173343e-09],
        [3.33107734e-01, 6.60046046e-01, 6.84621959e-03],
        [9.19055435e-07, 7.20873539e-01, 2.79125542e-01],
        [1.00000000e+00, 3.12623954e-12, 1.76052138e-11],
        [6.19233674e-08, 9.99999829e-01, 1.08712110e-07],
        [2.47736456e-07, 1.04765689e-