# Clonar repositorio

In [1]:
!git clone https://github.com/joanby/machinelearning-az.git

Cloning into 'machinelearning-az'...
remote: Enumerating objects: 10505, done.[K
remote: Total 10505 (delta 0), reused 0 (delta 0), pack-reused 10505[K
Receiving objects: 100% (10505/10505), 311.57 MiB | 22.39 MiB/s, done.
Resolving deltas: 100% (220/220), done.
Checking out files: 100% (10250/10250), done.


# Instalar sklearn

In [2]:
!pip install sklearn



# Importar las librerías


In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importar el dataset


In [4]:
dataset = pd.read_csv("https://raw.githubusercontent.com/joanby/machinelearning-az/master/datasets/Part%207%20-%20Natural%20Language%20Processing/Section%2036%20-%20Natural%20Language%20Processing/Restaurant_Reviews.tsv", delimiter = "\t", quoting = 3)

# Limpieza de texto

In [5]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Crear el Bag of Words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

# Dividir el data set en conjunto de entrenamiento y conjunto de testing

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Predicción con distintos algoritmos de clasificación

## Predicción de los resultados con el Conjunto de Testing con Naive Bayes

In [8]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

y_pred  = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

TP = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TN = cm[1][1]

accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)

print('Accuracy = {:.2f}'.format(accuracy))
print('Precision = {:.2f}'.format(precision))
print('Recall = {:.2f}'.format(recall))
print('F1 Score = {:.2f}'.format(2*precision*recall/(precision+recall)))

Accuracy = 0.73
Precision = 0.57
Recall = 0.82
F1 Score = 0.67


## Predicción de los resultados con el Conjunto de Testing con Logistic Regression

In [9]:
# Ajustar el modelo de Regresion Logistica en el Conjunto de Entrenamiento
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

# Prediccion de los resultados con el Conjunto de Testing
y_pred = classifier.predict(X_test)

# Elaborar una matriz de confusion
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

TP = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TN = cm[1][1]

accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)

print('Accuracy = {:.2f}'.format(accuracy))
print('Precision = {:.2f}'.format(precision))
print('Recall = {:.2f}'.format(recall))
print('F1 Score = {:.2f}'.format(2*precision*recall/(precision+recall)))

Accuracy = 0.71
Precision = 0.78
Recall = 0.67
F1 Score = 0.72


## Predicción de los resultados con el Conjunto de Testing con K-NN

In [10]:
# Ajustar el K-NN en el Conjunto de Entrenamiento
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 3, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

# Prediccion de los resultados con el Conjunto de Testing
y_pred = classifier.predict(X_test)

# Elaborar una matriz de confusion
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

TP = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TN = cm[1][1]

accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)

print('Accuracy = {:.2f}'.format(accuracy))
print('Precision = {:.2f}'.format(precision))
print('Recall = {:.2f}'.format(recall))
print('F1 Score = {:.2f}'.format(2*precision*recall/(precision+recall)))

Accuracy = 0.67
Precision = 0.82
Recall = 0.62
F1 Score = 0.70


## Predicción de los resultados con el Conjunto de Testing con SVM

In [11]:
# Ajustar el SVM en el Conjunto de Entrenamiento
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

# Prediccion de los resultados con el Conjunto de Testing
y_pred = classifier.predict(X_test)

# Elaborar una matriz de confusion
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

TP = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TN = cm[1][1]

accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)

print('Accuracy = {:.2f}'.format(accuracy))
print('Precision = {:.2f}'.format(precision))
print('Recall = {:.2f}'.format(recall))
print('F1 Score = {:.2f}'.format(2*precision*recall/(precision+recall)))

Accuracy = 0.72
Precision = 0.76
Recall = 0.69
F1 Score = 0.73


## Predicción de los resultados con el Conjunto de Testing con Kernel SVM

In [12]:
# Ajustar el clasificador en el Conjunto de Entrenamiento
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

# Prediccion de los resultados con el Conjunto de Testing
y_pred = classifier.predict(X_test)

# Elaborar una matriz de confusion
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

TP = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TN = cm[1][1]

accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)

print('Accuracy = {:.2f}'.format(accuracy))
print('Precision = {:.2f}'.format(precision))
print('Recall = {:.2f}'.format(recall))
print('F1 Score = {:.2f}'.format(2*precision*recall/(precision+recall)))

Accuracy = 0.73
Precision = 0.93
Recall = 0.66
F1 Score = 0.77


## Predicción de los resultados con el Conjunto de Testing con Decission Tree

In [13]:
# Ajustar el clasificador de Arbol de Decision en el Conjunto de Entrenamiento
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Prediccion de los resultados con el Conjunto de Testing
y_pred = classifier.predict(X_test)

# Elaborar una matriz de confusion
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

TP = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TN = cm[1][1]

accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)

print('Accuracy = {:.2f}'.format(accuracy))
print('Precision = {:.2f}'.format(precision))
print('Recall = {:.2f}'.format(recall))
print('F1 Score = {:.2f}'.format(2*precision*recall/(precision+recall)))

Accuracy = 0.71
Precision = 0.76
Recall = 0.68
F1 Score = 0.72


## Predicción de los resultados con el Conjunto de Testing con Random Forest

In [14]:
# Ajustar el clasificador de Random Forest en el Conjunto de Entrenamiento
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Prediccion de los resultados con el Conjunto de Testing
y_pred = classifier.predict(X_test)

# Elaborar una matriz de confusion
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

TP = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TN = cm[1][1]

accuracy = (TP+TN)/(TP+TN+FP+FN)
precision = TP/(TP+FP)
recall = TP/(TP+FN)

print('Accuracy = {:.2f}'.format(accuracy))
print('Precision = {:.2f}'.format(precision))
print('Recall = {:.2f}'.format(recall))
print('F1 Score = {:.2f}'.format(2*precision*recall/(precision+recall)))

Accuracy = 0.72
Precision = 0.90
Recall = 0.65
F1 Score = 0.76


In [32]:
classifier = nltk.classify.MaxentClassifier.train(train_toks = X_train, labels = y_train)

ValueError: ignored

In [26]:
y_train

array([1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,

In [29]:
X_train[0]

array([0, 0, 0, ..., 0, 0, 0])

## Tabla de resultados

|                     | Accuracy | Precission | Recall | F1 Score |
|---------------------|:--------:|:----------:|:------:|:--------:|
| Naïve Bayes         |   0.73   |    0.57    |  0.82  |   0.67   |
| Logistic Regression |   0.71   |    0.78    |  0.67  |   0.72   |
| K-NN                |   0.67   |    0.82    |  0.62  |   0.70   |
| SVM                 |   0.72   |    0.76    |  0.69  |   0.73   |
| Kernel SVM          |   0.73   |    0.93    |  0.66  |   0.77   |
| Decission Tree      |   0.71   |    0.76    |  0.68  |   0.72   |
| Random Forest       |   0.72   |    0.90    |  0.65  |   0.76   |

Mejor resultado obtenido: **SVM con radial basis function kernel**