**Montamos la unidad de Google Drive**

In [None]:
from google.colab import drive

# Directory inside the Colab filesystem under which Google Drive is mounted.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

**Leemos el IMDB Dataset of 50K Movie Reviews**

In [13]:
import numpy as np
import pandas as pd

# Path of the preprocessed IMDB reviews CSV on the mounted Google Drive.
# For a run without Drive, a local copy ('IMDB_prepro.csv') can be used instead.
imdb_csv_path = '/content/drive/MyDrive/IMDB_prepro.csv'
imdb = pd.read_csv(imdb_csv_path)

**Dividimos las muestras en dos particiones: 80% train / 20% test**

In [14]:
# Positional cut of the 'review' column: the first 40,000 reviews form the
# training partition and every remaining review forms the test partition.
n_train_reviews = 40000
reviews = imdb['review']
imdb_train = reviews.iloc[:n_train_reviews]
imdb_test = reviews.iloc[n_train_reviews:]

**Transformamos las etiquetas de clase a valores numéricos y hacemos la misma partición train/test**

In [15]:
from sklearn.preprocessing import LabelBinarizer

# Encode the sentiment labels numerically: the binarizer is fitted on the
# whole 'sentiment' column and then applied to that same column.
lb = LabelBinarizer()
lb.fit(imdb['sentiment'])
sentiment_data = lb.transform(imdb['sentiment'])

# Cut the encoded labels at position 40,000, the same position used to
# split the reviews into train and test.
n_train_labels = 40000
train_labels = sentiment_data[:n_train_labels]
test_labels = sentiment_data[n_train_labels:]

**Representamos las muestras utilizando el modelo bag of words**

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words representation with raw term counts (binary=False) over
# unigrams, bigrams and trigrams, keeping the whole vocabulary
# (max_features=None).
#
# min_df=1 / max_df=1.0 disable document-frequency filtering, just as the
# previous min_df=0 / max_df=1000000 did on this 40,000-review training set,
# but they stay inside the ranges that recent scikit-learn releases validate
# (an integer min_df must be >= 1, so min_df=0 is rejected there).
#
# Alternative settings tried earlier: max_df=1 with ngram_range=(1, 3), and
# max_features=20 with ngram_range=(1, 1).
cv = CountVectorizer(
    min_df=1,
    max_df=1.0,
    binary=False,
    max_features=None,
    ngram_range=(1, 3),
)
# Learn the vocabulary from the training reviews only and vectorize them.
cv_imdb_train = cv.fit_transform(imdb_train)
# Vectorize the test reviews with the vocabulary learned on the training set.
cv_imdb_test = cv.transform(imdb_test)

**Entrenamos un clasificador Naive Bayes multinomial**

In [25]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the bag-of-words training matrix.
# The label array is flattened to one dimension before being passed to fit().
imdb_mnb = MultinomialNB()
imdb_mnb.fit(cv_imdb_train, train_labels.reshape(-1))
# fit() returns the estimator itself, so both names refer to the fitted model.
imdb_mnb_bow = imdb_mnb

**Evaluamos el modelo sobre el test**

**1. Matriz de confusión**

In [None]:
from sklearn.metrics import confusion_matrix

# Predicted class for every test review.
imdb_mnb_bow_predict = imdb_mnb_bow.predict(cv_imdb_test)

# labels=[1, 0] orders both axes of the matrix with class 1 first and
# class 0 second.
imdb_cm_bow = confusion_matrix(
    y_true=test_labels,
    y_pred=imdb_mnb_bow_predict,
    labels=[1, 0],
)
print(imdb_cm_bow)

**2. Exactitud de clasificación (accuracy = 1 − error)**

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the test predictions against the true test labels.
imdb_mnb_bow_score = accuracy_score(
    y_true=test_labels,
    y_pred=imdb_mnb_bow_predict,
)
print("mnb_bow_score :", imdb_mnb_bow_score)

**3. Curva ROC**

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt

# Column 1 of predict_proba is used as the score of the class treated as
# positive (pos_label=1) when building the ROC curve.
imdb_mnb_bow_probs = imdb_mnb_bow.predict_proba(cv_imdb_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(
    test_labels,
    imdb_mnb_bow_probs,
    pos_label=1,
)

# Draw the curve through the explicit Axes interface.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr)
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()

**4. Area Under the ROC Curve (AUC)**

In [None]:
# Area under the ROC curve, from the test labels and the class-1 probabilities.
auc = metrics.roc_auc_score(y_true=test_labels, y_score=imdb_mnb_bow_probs)

# NOTE(review): the printed label says "Dev" although the inputs are the test
# labels/probabilities -- confirm the intended wording.
auc_percent = auc * 100
print("Dev AUC: %.2f%%" % auc_percent)

**5. Medidas Precision, Recall y f1-score**

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1-score of the test predictions.
#
# labels=[1, 0] pins the row order explicitly so each display name is attached
# to the right class. Without it the report lists the classes in sorted order
# (0 first), which paired 'Positive' with class 0 and 'Negative' with class 1.
# Class 1 is taken to be the positive sentiment, consistent with the confusion
# matrix (labels=[1,0]) and the ROC curve (pos_label=1) in this notebook.
imdb_mnb_bow_report = classification_report(
    test_labels,
    imdb_mnb_bow_predict,
    labels=[1, 0],
    target_names=['Positive', 'Negative'],
)
print(imdb_mnb_bow_report)

**6. Curva Precision-Recall**

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (5,5)

# precision_recall_curve returns (precision, recall, thresholds), in that
# order. Dedicated names are used so the values are not mistaken for, and do
# not overwrite, the ROC curve's fpr/tpr.
precision, recall, _ = metrics.precision_recall_curve(test_labels, imdb_mnb_bow_probs, pos_label=1)

# Recall goes on the x axis and precision on the y axis, so the plotted data
# matches the axis labels below (previously the two were swapped).
plt.plot(recall, precision)
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.show()