In [None]:
!pip install py3langid

Collecting py3langid
  Downloading py3langid-0.2.2-py3-none-any.whl (750 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m750.6/750.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m


In [None]:
import pandas as pd
import json
from nltk.corpus import PlaintextCorpusReader
import nltk
# Fetch every NLTK data package up front.
# NOTE(review): later cells call nltk.pos_tag, and the narrower download list
# further down does not request a tagger model, so this catch-all is presumably
# what makes it available - confirm before trimming this call.
nltk.download(
    ['all'])

In [None]:
!pip install nltk

In [None]:
# Load the winemag reviews CSV into a DataFrame.
csv_path = "/content/winemag-data-130k-v2.csv"
df = pd.read_csv(csv_path)

**PROCESAMIENTO DE LA DATA**

In [None]:
# Preview the loaded table.
df

In [None]:
# Histogram of the "points" column.
points = df["points"]
points.hist()

In [None]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()

In [None]:
def _quality_label(points):
    """Label a score: above 91 is "Positive", anything else is "Negative"."""
    return "Positive" if points > 91 else "Negative"


# Binary target column derived from the numeric score.
df["quality"] = df["points"].map(_quality_label)

In [None]:
# Rows per quality label.
quality_counts = df["quality"].value_counts()
quality_counts

In [None]:
# Bar chart of the class balance before resampling.
df["quality"].value_counts().plot(kind="bar")

In [None]:
# Look at a random draw of 15000 rows labelled "Positive".
df[df["quality"].eq("Positive")].sample(n=15000)

In [None]:
# Draw 15000 rows per class to balance the dataset. The fixed random_state makes
# the draw identical on every Restart & Run All.
Negative = df[df["quality"] == "Negative"].sample(15000, random_state=42)
Positive = df[df["quality"] == "Positive"].sample(15000, random_state=42)

In [None]:
# Stack both class samples, then shuffle the rows (seeded, so the order is
# reproducible) and rebuild a clean 0..n-1 index.
df = pd.concat([Negative, Positive])
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Bar chart of the class balance after resampling.
df['quality'].value_counts().plot(kind="bar")

In [None]:
# Language detection
import py3langid as langid


def _detect_language(text):
    """Return the first element of py3langid's classification (the language label)."""
    return langid.classify(text)[0]


# New DataFrame column holding the detected language label of each description.
df["Language"] = df['description'].apply(_detect_language)

In [None]:
# Show the descriptions that were not detected as English.
df.loc[df["Language"] != "en", ["Language", "description"]]

In [None]:
# Keep only English descriptions and the two columns used downstream. The
# explicit .copy() makes dff an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning or depend on view-vs-copy behaviour.
dff = df.loc[df["Language"] == "en", ["description", "quality"]].copy()

In [None]:
# Preview the English-only frame (description + quality).
dff

**PROCESAMIENTO DEL TEXTO**

In [None]:
# Libraries and NLTK resources used for text processing
import nltk
from nltk import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

required_resources = [
    "stopwords",      # stop-word lists
    "names",          # first-name lists
    "vader_lexicon",
    "punkt",
    "wordnet",
]
nltk.download(required_resources)

In [None]:
# English stop words and first names. Both are only used for `in` membership
# tests on every token, so they are stored as sets: lookups are O(1) instead of
# a linear scan over thousands of entries per token.
stopwords = set(nltk.corpus.stopwords.words("english"))
names = set(nltk.corpus.names.words())

In [None]:
# Inspect the English stop words.
stopwords

In [None]:
# Show only a small, sorted preview: dumping the whole name collection floods
# the notebook output.
sorted(names)[:20]

In [None]:

def get_tokens(series, reduce):
    """Tokenize every text in ``series`` and return the normalized tokens.

    Args:
        series: Iterable of strings (for example a pandas Series of
            descriptions).
        reduce: Callable applied to each kept token, such as a lemmatizer's
            ``lemmatize`` or a stemmer's ``stem``.

    Returns:
        A flat list with one reduced token per kept word, in reading order.
    """
    vocabulary = []
    for comment in series:
        for word in nltk.word_tokenize(comment):
            # Drop commas, periods, numbers and any other non-alphabetic token.
            if not word.isalpha():
                continue
            # NLTK's English stop-word list is lowercase, so compare
            # case-insensitively; otherwise capitalized stop words such as
            # "The" or "This" slip into the vocabulary.
            if word.lower() in stopwords:
                continue
            # Keep the capitalization of known first names, lowercase the rest.
            if word not in names:
                word = word.lower()
            vocabulary.append(reduce(word))

    return vocabulary

In [None]:
# Build the lemmatizer and try the tokenizer on the first description only.
lemmatizer = WordNetLemmatizer()
first_description = dff["description"].head(1)
get_tokens(first_description, lemmatizer.lemmatize)

In [None]:
# Build the vocabulary: lemmatized tokens of every description.
vocabulary = get_tokens(dff["description"], reduce=lemmatizer.lemmatize)

In [None]:
# Keep `vocabulary` as the full, ordered token stream: the frequency and n-gram
# cells that follow count repetitions and adjacent tokens, and a set would
# reduce every count to 1 and scramble the order. The distinct lemmas are stored
# separately, sorted so the result is stable between runs.
unique_vocabulary = sorted(set(vocabulary))

In [None]:
# Unigrams: frequency distribution over `vocabulary`, tabulating 10 entries.
fd = nltk.FreqDist(vocabulary)
fd.tabulate(10)

In [None]:
# Bigrams: pairs of consecutive entries of `vocabulary`, tabulating 10 entries.

finder = nltk.collocations.BigramCollocationFinder.from_words(vocabulary)
finder.ngram_fd.tabulate(10)

In [None]:
# Trigrams: triples of consecutive entries of `vocabulary`, tabulating 10 entries.

finder = nltk.collocations.TrigramCollocationFinder.from_words(vocabulary)
finder.ngram_fd.tabulate(10)

In [None]:
# Quadgrams: runs of four consecutive entries of `vocabulary`, tabulating 10 entries.

finder = nltk.collocations.QuadgramCollocationFinder.from_words(vocabulary)
finder.ngram_fd.tabulate(10)

In [None]:
# Words that must never become features: English stop words plus every first
# name, lowercased. Stored as a set because it is probed once per token and a
# list would be scanned linearly each time.
unwanted = set(nltk.corpus.stopwords.words("english"))
unwanted.update(w.lower() for w in nltk.corpus.names.words())

def skip_unwanted(pos_tuple):
    """Decide whether a POS-tagged token should be kept as a feature word.

    Args:
        pos_tuple: A ``(word, tag)`` pair as produced by ``nltk.pos_tag``.

    Returns:
        False for non-alphabetic tokens, for words listed in ``unwanted`` and
        for tokens whose tag starts with "NN"; True otherwise.
    """
    word, tag = pos_tuple
    # `unwanted` stores names in lowercase while get_tokens keeps first names
    # capitalized, so the lookup has to be case-insensitive to catch them.
    if not word.isalpha() or word.lower() in unwanted:
        return False
    # Tags starting with "NN" are rejected as well.
    if tag.startswith("NN"):
        return False
    return True

# Split the descriptions by label.
positive_description = dff.loc[dff["quality"] == "Positive", "description"]
negative_description = dff.loc[dff["quality"] == "Negative", "description"]

# Lemmatized tokens of each class.
positive_tokens = get_tokens(positive_description, lemmatizer.lemmatize)
negative_tokens = get_tokens(negative_description, lemmatizer.lemmatize)

# POS-tag the tokens and keep only the words that pass skip_unwanted.
positive_words = [
    tagged[0]
    for tagged in nltk.pos_tag(positive_tokens)
    if skip_unwanted(tagged)
]
negative_words = [
    tagged[0]
    for tagged in nltk.pos_tag(negative_tokens)
    if skip_unwanted(tagged)
]

In [None]:
# Preview only the first entries; the full list holds one item per kept token
# and would flood the notebook output.
positive_words[:20]

In [None]:
# Preview only the first entries; the full list holds one item per kept token
# and would flood the notebook output.
negative_words[:20]

In [None]:
from pandas.core import common
# Word frequencies per class.
positive_fd = nltk.FreqDist(positive_words)
negative_fd = nltk.FreqDist(negative_words)

# Words that occur in both classes do not discriminate, so drop them from both.
common_set = positive_fd.keys() & negative_fd.keys()
for shared_word in common_set:
    for distribution in (positive_fd, negative_fd):
        del distribution[shared_word]

# The 200 most frequent class-exclusive words of each class.
top_200_positive = {entry[0] for entry in positive_fd.most_common(200)}
top_200_negative = {entry[0] for entry in negative_fd.most_common(200)}

In [None]:
import pickle

# Pickling converts a Python object into a byte stream so it can be stored in a
# file or database. The `with` blocks close each file even if dump() raises.
with open('top_200_positive.pickle', 'wb') as f:
    pickle.dump(top_200_positive, f)

with open('top_200_negative.pickle', 'wb') as f:
    pickle.dump(top_200_negative, f)

In [None]:
# Inspect the most frequent words found only in positive reviews.
top_200_positive

In [None]:
# Inspect the most frequent words found only in negative reviews.
top_200_negative

**NLTK Pretrained Sentiment Analyzer**

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Sentiment analysis: computationally deciding whether a comment is positive or negative.
sia = SentimentIntensityAnalyzer() # Initialize the Sentiment Intensity Analyzer

In [None]:
# Rows per quality label in the English-only frame.
dff["quality"].value_counts()

In [None]:
def _dominant_sentiment(text):
    """Return "neg", "neu" or "pos": whichever VADER proportion is largest.

    The analyzer is run once per text. The "compound" entry is excluded from
    the comparison because it is an aggregate score on a different scale, not
    one of the class proportions, so it is not a meaningful label.
    """
    scores = sia.polarity_scores(text)
    return max(("neg", "neu", "pos"), key=scores.get)


# Pretrained VADER estimate for every description.
dff["quality_Estimate"] = dff["description"].map(_dominant_sentiment)

In [None]:
# Compare the true label with the pretrained analyzer's estimate.
dff[["description", "quality", "quality_Estimate"]]

**NLTK Naive Bayes Classifier**

In [None]:
from statistics import mean

def extract_features(text):
    """Turn a review into classifier features.

    The text is tokenized, non-alphabetic tokens and stop words are dropped, and
    the remaining words are lowercased and lemmatized. Only words belonging to
    ``top_200_negative`` or ``top_200_positive`` are counted.

    Args:
        text: The review as a string.

    Returns:
        An ``nltk.FreqDist`` mapping each selected word to its count in ``text``.
    """
    vocabulary = []
    for word in nltk.word_tokenize(text):
        if not word.isalpha():
            continue
        # Lowercase before the stop-word test so capitalized stop words
        # ("The", "This") are filtered as well.
        word = word.lower()
        if word in stopwords:
            continue
        word = lemmatizer.lemmatize(word)
        # Both membership tests must be spelled out: `x in a or b` evaluates the
        # truthiness of `b` and would accept every word.
        if word in top_200_negative or word in top_200_positive:
            vocabulary.append(word)

    fd = nltk.FreqDist(vocabulary)

    return fd

In [None]:
# Draw 200 descriptions per class. The fixed random_state keeps the training
# material (and therefore the reported accuracies) reproducible between runs.
positive_comments = dff[dff["quality"] == "Positive"]["description"].sample(200, random_state=42)
negative_comments = dff[dff["quality"] == "Negative"]["description"].sample(200, random_state=42)

# One (features, label) pair per review, positives first.
features = [
    (extract_features(review), "La review es positiva")
    for review in positive_comments
]
features.extend([
    (extract_features(review), "La review es negativa")
    for review in negative_comments
])

In [None]:
# Preview a few (features, label) pairs instead of dumping the whole list.
features[:5]

In [None]:
from random import Random, shuffle

train_count = len(features)//2
# Shuffle with a dedicated, seeded generator so the train/test split and the
# resulting accuracy are the same on every Restart & Run All.
Random(42).shuffle(features)
# Train NLTK's Naive Bayes classifier on the first half of the shuffled pairs.
# It estimates the probability of each label from the words found in a review.
classifier = nltk.NaiveBayesClassifier.train(features[:train_count])
classifier.show_most_informative_features()

In [None]:
# Check how effective the classifier is on the held-out second half.

nltk.classify.accuracy(classifier, features[train_count:])

In [None]:
# Try the classifier on text it has not seen.

review = "i love it"
classifier.classify(extract_features(review))

**Scikit-Learn Naive Bayes Classifier**

In [None]:
from sklearn.naive_bayes import (#Aqu1 se importaron los classifier
    BernoulliNB,
    ComplementNB,
    MultinomialNB
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
# Candidate scikit-learn models, keyed by class name. Models built with default
# settings come first; the two remaining ones are appended so the dictionary
# keeps its original order.
_default_models = (
    BernoulliNB,
    ComplementNB,
    MultinomialNB,
    KNeighborsClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier,
    LogisticRegression,
)
classifiers = {model.__name__: model() for model in _default_models}
classifiers["MLPClassifier"] = MLPClassifier(max_iter=30000)
classifiers["AdaBoostClassifier"] = AdaBoostClassifier()

In [None]:
# Use the first quarter of the reshuffled pairs for training and the remainder
# for scoring.
train_count = len(features) // 4
shuffle(features)
train_set = features[:train_count]
test_set = features[train_count:]

trained_classifiers = {}

# Wrap each scikit-learn model for NLTK, fit it, score it and keep it by name.
for name in classifiers:
    classifier = nltk.classify.SklearnClassifier(classifiers[name])
    classifier.train(train_set)
    accuracy = nltk.classify.accuracy(classifier, test_set)
    trained_classifiers[name] = classifier
    print(f"{accuracy:.2%} - {name}")

In [None]:
# Persist the model chosen from the scores printed above.
# NOTE(review): "LogisticRegression" is hard-coded; confirm it is still the
# best-scoring entry whenever the training cell is re-run.

import pickle
# `with` closes the file even if dump() raises.
with open('quality_classifier.pickle', 'wb') as f:
    pickle.dump(trained_classifiers["LogisticRegression"], f)

In [None]:
# Reload the persisted classifier to test it the way it would be deployed.
# Only unpickle files you created yourself: pickle.load can execute arbitrary
# code from an untrusted file.

with open('quality_classifier.pickle', 'rb') as f:
    deployed_classifier = pickle.load(f)

# Logistic regression estimates the probability that an event occurs (such as
# voting or not voting) from a given set of independent variables.

In [None]:
# Classify a hand-written review with the reloaded model.
deployed_classifier.classify(extract_features("it was so hoorrible i did not like it"))