<a href="https://colab.research.google.com/github/juanknebel/text-mining-2020/blob/master/2_Clasificacion_de_textos_(Bayes).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Clasificación de textos: Naive Bayes 

In [0]:
%matplotlib inline 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import random

from nltk import word_tokenize, sent_tokenize

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV,StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.dummy import DummyClassifier
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.corpus import movie_reviews

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
# Category labels defined by the movie_reviews corpus.
movie_reviews.categories()

In [0]:
# File ids of the reviews in the 'neg' category; preview the first ten.
neg_review_fileids = movie_reviews.fileids('neg')
neg_review_fileids[:10]

In [0]:
# Print the raw text of the third entry in neg_review_fileids.
print(movie_reviews.raw(neg_review_fileids[2]))

In [0]:
# Collect every review of the corpus as a (raw_text, category) pair,
# walking the categories in the order the corpus reports them.
documents = [
    (movie_reviews.raw(fileid), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# Two-column frame: the review text and its category label.
df = pd.DataFrame(documents, columns=['review', 'tag'])


In [0]:
# Shuffle all rows with a fixed seed, then renumber the index from 0.
df = (
    df.sample(frac=1, random_state=0)
      .reset_index(drop=True)
      .copy()
)
df.head()

In [0]:
# Number of reviews per label.
df['tag'].value_counts()

In [0]:
# IPython help lookup: shows the documentation of train_test_split (development aid).
train_test_split?


In [0]:
# Split into train and test sets: 20% held out for testing, stratified by label.
X = df['review']
y = df['tag']
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=1234,
)

In [0]:
# First ten training reviews as a plain Python list.
X_train_text.iloc[:10].tolist()

In [0]:
# Text cleaning
def clean_text(text):
  """Return `text` without leading or trailing whitespace.

  For this particular corpus no further cleaning is needed.
  """
  stripped = text.strip()
  return stripped

In [0]:
def tokenizer(text):
    """Split `text` into tokens with NLTK's `word_tokenize`.

    All tokens are kept. If only words are of interest, the tokens could be
    filtered with `str.isalpha()` instead.
    """
    tokens = word_tokenize(text)
    return tokens

# Preview: first ten tokens of the second training review.
tokenizer(clean_text(X_train_text.iloc[1]))[:10]

## Extracción de features


In [0]:
# English stop-word list from the NLTK stopwords corpus.
stoplist = stopwords.words("english")

In [0]:
# Preview the first 15 stop words.
stoplist[:15]

In [0]:
# Tokenize the stop list with the same tokenizer that will be applied to the corpus.
# A stop word may yield more than one token; all of them are kept, in order.
stoplist_tokenized = [
    token
    for stop_word in stoplist
    for token in tokenizer(stop_word)
]

In [0]:
# Preview the first 15 tokenized stop words.
stoplist_tokenized[:15]

In [0]:
# Build a list without repetitions.
# dict.fromkeys keeps the first occurrence of each token in its original position,
# so the resulting order is the same on every kernel start (a plain set() of
# strings can come back in a different order on each run).
stoplist_tokenized = list(dict.fromkeys(stoplist_tokenized))

In [0]:
# IPython help lookup: shows the documentation of CountVectorizer (development aid).
?CountVectorizer

In [0]:
# Bag-of-words vectorizer: custom cleaning and tokenization, tokens must appear
# in at least 5 documents, tokenized stop list removed.
count_vect = CountVectorizer(
    preprocessor=clean_text,
    tokenizer=tokenizer,
    min_df=5,
    stop_words=stoplist_tokenized,
)
# Fit on the training text: counts token frequencies and defines the vocabulary.
X_train = count_vect.fit_transform(X_train_text)
# Test text is only transformed: counts tokens already present in the vocabulary.
X_test = count_vect.transform(X_test_text)
X_train

In [0]:
# First 20 vocabulary entries learned by the vectorizer.
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is
# its replacement and returns an array, so the slice is converted back to a list.
list(count_vect.get_feature_names_out()[:20])

# Naive Bayes

In [0]:
# Multinomial Naive Bayes; alpha is the smoothing parameter.
clf = MultinomialNB(alpha=1)
# 10-fold cross-validation on the training matrix, scored with macro-averaged F1.
scores_cv_nb = cross_val_score(
    clf,
    X_train,
    y_train,
    cv=10,
    scoring='f1_macro',
)
scores_cv_nb


In [0]:
# Mean and standard deviation of the cross-validated F-score, rounded to 4 decimals.
nb_cv_mean = round(scores_cv_nb.mean(), 4)
nb_cv_sd = round(scores_cv_nb.std(), 4)
print("f-score=", nb_cv_mean, " ( sd =", nb_cv_sd, ")")

### Dummy

In [0]:
# Baseline: a dummy classifier evaluated with the same 10-fold CV and metric
# as the Naive Bayes model, to give the F-scores a reference point.
clf_dummy = DummyClassifier()
scores_dummy_cv = cross_val_score(clf_dummy,X_train, y_train,cv = 10, scoring='f1_macro')
# Report the dummy's own standard deviation (the sd printed here used to be
# taken from the Naive Bayes scores by mistake).
print("f-score=",round(scores_dummy_cv.mean(),4)," ( sd =",round(scores_dummy_cv.std(),4),")\n")

## N-gramas

In [0]:
# Include bigrams (note: an occurrence of "best friend" also counts towards "best" and "friend").
bigram_params = dict(
    preprocessor=clean_text,
    tokenizer=tokenizer,
    min_df=5,
    stop_words=stoplist_tokenized,
    ngram_range=(1, 2),
)
count_vect = CountVectorizer(**bigram_params)
# Fit on the training text: counts token frequencies and defines the vocabulary.
X_train_ngrams = count_vect.fit_transform(X_train_text)
n_features = X_train_ngrams.shape[1]
print("numero de features=", n_features)

# Same classifier and 10-fold CV setup, now on the unigram + bigram features.
scores_cv_ngrams = cross_val_score(clf, X_train_ngrams, y_train, cv=10, scoring='f1_macro')
ngrams_mean = round(scores_cv_ngrams.mean(), 4)
ngrams_sd = round(scores_cv_ngrams.std(), 4)
print("f-score=", ngrams_mean, " ( sd =", ngrams_sd, ")")

In [0]:
# First 20 vocabulary entries learned by the vectorizer.
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is
# its replacement and returns an array, so the slice is converted back to a list.
list(count_vect.get_feature_names_out()[:20])

### Pruebo otros parámetros

In [0]:
# Unigrams + bigrams again, now requiring each term to appear in at least 30 documents
# (note: an occurrence of "best friend" also counts towards "best" and "friend").
bigram_params_v2 = dict(
    preprocessor=clean_text,
    tokenizer=tokenizer,
    min_df=30,
    stop_words=stoplist_tokenized,
    ngram_range=(1, 2),
)
count_vect = CountVectorizer(**bigram_params_v2)
# Fit on the training text: counts token frequencies and defines the vocabulary.
X_train_ngrams = count_vect.fit_transform(X_train_text)
n_features_v2 = X_train_ngrams.shape[1]
print("numero de features =", n_features_v2)

clf = MultinomialNB(alpha=1)
scores_cv_ngrams_v2 = cross_val_score(clf, X_train_ngrams, y_train, cv=10, scoring='f1_macro')
ngrams_v2_mean = round(scores_cv_ngrams_v2.mean(), 4)
ngrams_v2_sd = round(scores_cv_ngrams_v2.std(), 4)
print("f-score =", ngrams_v2_mean, " ( sd =", ngrams_v2_sd, ")")

## Selecciono el mejor modelo

In [0]:
# Mean cross-validated F-score of each configuration, rounded to 3 decimals.
cv_fscore_by_model = {
    'NB (mindf=5)': round(scores_cv_nb.mean(), 3),
    'NB with ngrams (mindf=5)': round(scores_cv_ngrams.mean(), 3),
    'NB with ngrams (mindf=30)': round(scores_cv_ngrams_v2.mean(), 3),
}
pd.Series(cv_fscore_by_model)

In [0]:
# Final evaluation on the held-out test set with unigrams and min_df=5.
# The stop list is `stoplist_tokenized`, the one used by every cross-validated
# configuration in this notebook, so the test score belongs to a model that was
# actually compared (sklearn's built-in "english" list is a different list and
# is not tokenized with `tokenizer`).
count_vect = CountVectorizer(preprocessor=clean_text,
                             tokenizer=tokenizer,
                             min_df=5,
                             stop_words=stoplist_tokenized,
                             ngram_range=(1,1))
X_train_ngrams = count_vect.fit_transform(X_train_text) # counts token frequencies and defines the vocabulary
X_test_ngrams = count_vect.transform(X_test_text) # counts tokens already present in the vocabulary
print("numero de features=",X_train_ngrams.shape[1])
# Train the classifier
clf = MultinomialNB(alpha=1).fit(X_train_ngrams, y_train)
# Predict on the test set
y_pred = clf.predict(X_test_ngrams)
# Keep precision, recall and F-score (macro-averaged); the fourth value returned is dropped
scores_ngrams = pd.Series(precision_recall_fscore_support(y_test, y_pred,average="macro")[:3],index=["precision","recall","fscore"])
print("matriz de confusión\n",confusion_matrix(y_test, y_pred))

print("\nNgrams\n")
print(scores_ngrams)

##  Normalización

In [0]:
# Stemming: reduces a word to its root (which may not itself be a word)
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")
# Example: stem of a single word
stemmer.stem("explanation")

In [0]:
# (token, stem) pairs for the alphabetic tokens of the first training review.
first_review_tokens = word_tokenize(clean_text(X_train_text.iloc[0]))
[(token, stemmer.stem(token)) for token in first_review_tokens if token.isalpha()]

#### Notar que "empathy" y "empathize" no se reducen a la misma raíz

In [0]:
# Print the stem of each word on its own line to compare them.
for related_word in ("empathy", "empathize"):
    print(stemmer.stem(related_word))

In [0]:
# Lemmatization: maps a word to its canonical form
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# NLTK resources downloaded for the lemmatizer and the POS tagger used in the next cells
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
wnl = WordNetLemmatizer()

In [0]:
# Lemmatizing requires the grammatical category (part of speech) of the word.
# The valid POS values are ("a","v","n","r") (adjective, verb, noun, adverb).
wnl.lemmatize(word="was",pos="v")

In [0]:
# POS-tag the tokens of the second training review.
second_review_tokens = word_tokenize(clean_text(X_train_text.iloc[1]))
postags = pos_tag(second_review_tokens)
postags

In [0]:
def word_pos_to_lemma(word,pos,wnl):
    """Lemmatize `word` using the WordNet category matching its POS tag.

    Args:
        word: token to lemmatize.
        pos: POS tag string; only its first letter is inspected.
        wnl: lemmatizer object exposing `lemmatize(word[, pos])`.

    Returns:
        Whatever `wnl.lemmatize` returns for the word and the selected category.
    """
    # First letter of the tag -> WordNet category, checked in this order.
    prefix_to_wordnet_pos = (
        ('J', wordnet.ADJ),   # adjective
        ('V', wordnet.VERB),  # verb
        ('N', wordnet.NOUN),  # noun
        ('R', wordnet.ADV),   # adverb
    )
    for prefix, wordnet_pos in prefix_to_wordnet_pos:
        if pos.startswith(prefix):
            return wnl.lemmatize(word, wordnet_pos)
    # Any other tag: let the lemmatizer use its default category.
    return wnl.lemmatize(word)

In [0]:
# (token, lemma) pairs for the alphabetic tokens of the tagged review; show the first 25.
lemma_pairs = [
    (token, word_pos_to_lemma(token, tag, wnl))
    for token, tag in postags
    if token.isalpha()
]
lemma_pairs[:25]

## Zipf's Law

In [0]:
# Series with every alphabetic token of the corpus, review by review.
words_series = pd.Series([
    token
    for review in df.review
    for token in word_tokenize(review)
    if token.isalpha()
])

In [0]:
# Frequency of each word in the corpus
word_freq = words_series.value_counts()
word_freq

In [0]:
# Frequency vs. rank plot (the most frequent word has rank=1, the second has rank=2, etc.)
# Ranks are generated 1-based so the x axis matches that definition
# (range(len(...)) would place the most frequent word at rank 0).
ranks = range(1, len(word_freq) + 1)
plt.plot(ranks, word_freq)
plt.xlabel('rank');plt.ylabel('freq');


In [0]:
# Same plot on logarithmic axes.
# Ranks must start at 1: a rank of 0 has no position on a log-scaled axis,
# so a 0-based range would leave the most frequent word out of the plot.
ranks = range(1, len(word_freq) + 1)
plt.plot(ranks, word_freq)
plt.xlabel('rank');plt.ylabel('freq')
plt.xscale('log'); plt.yscale('log');

In [0]:
# Fraction of distinct words that occur exactly once in the corpus.
occurs_once = word_freq == 1
occurs_once.mean()

# Ejercicio 1
## Armar un clasificador que identifique si una review de IMDB es positiva (1) o negativa (0)
## Comparar varios modelos y seleccionar el que produzca un mejor 10-fold CV o una mejor performance en el dev set
## Predecir en el test set y calcular la matriz de confusión, precision, recall y F-score

In [0]:
import tensorflow_datasets as tfds


In [0]:
# Split the training set into 60% and 40%, so we'll end up with 15,000 examples
# for training, 10,000 examples for validation and 25,000 examples for testing.
# The split is expressed with slicing strings: `tfds.Split.TRAIN.subsplit` is no
# longer available in TensorFlow Datasets.
train_data, validation_data, test_data = tfds.load(
    name="imdb_reviews",
    split=("train[:60%]", "train[60%:]", "test"),
    as_supervised=True)

# Materialize each split as a DataFrame with the text ('texto') and label ('clase').
# as_supervised=True makes every example a pair, which maps onto the two columns.
df_train = pd.DataFrame(list(tfds.as_numpy(train_data)),columns=['texto','clase'])
df_dev = pd.DataFrame(list(tfds.as_numpy(validation_data)),columns=['texto','clase'])
df_test = pd.DataFrame(list(tfds.as_numpy(test_data)),columns=['texto','clase'])

In [0]:
# Show 5 randomly drawn training rows (no seed is set, so the rows differ between runs).
df_train.sample(5)

In [0]:
def b2str(b):
  """Decode UTF-8 bytes into a str; a value that is already a str is returned as is.

  Accepting str makes the decoding cell safe to run more than once: on a
  second run the column already holds str values, which have no `.decode`.

  Args:
    b: bytes (or str) value to convert.

  Returns:
    The decoded text.
  """
  if isinstance(b, str):
    return b
  return b.decode('utf8')

# Convert the 'texto' column of each split with b2str, in the order dev, train, test.
for split_frame in (df_dev, df_train, df_test):
  split_frame['texto'] = split_frame['texto'].apply(b2str)

In [0]:
# Show 5 randomly drawn training rows again, after the 'texto' column was converted.
df_train.sample(5)