In [1]:
# My imports

%matplotlib inline

import nltk
import pickle
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk import tokenize, FreqDist
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Read the reviews CSV into a DataFrame; later cells use its `text_pt` and `sentiment` columns.
# NOTE(review): this is an absolute path, so the notebook only runs where /data is available —
# consider a configurable data directory.
reviews = pd.read_csv('/data/imdb-reviews-pt-br.csv')

In [None]:
# Preview the first rows to check that the CSV was parsed as expected.
reviews.head()

In [None]:
# Print the `text_pt` value stored under index label 0 as a sample review.
print('This is a negative review:\n\n', reviews['text_pt'][0])

In [None]:
# List the distinct labels present in the `sentiment` column.
reviews['sentiment'].unique()

In [None]:
# Count how many reviews carry each sentiment label (class balance check).
reviews['sentiment'].value_counts()

In [None]:
# Encode the string labels as integers for the classifier: 'neg' -> 0, 'pos' -> 1.
# An explicit mapping produces the integer column directly instead of relying on
# `Series.replace` to downcast an object column, which pandas 2.2 deprecated.
sentiment_codes = {'neg': 0, 'pos': 1}
encoded_sentiment = reviews['sentiment'].map(sentiment_codes)

# `map` turns any label missing from the mapping into NaN; fail loudly here rather
# than letting unexpected labels reach model training.
if encoded_sentiment.isna().any():
    unexpected = sorted(set(reviews['sentiment'].dropna().unique()) - set(sentiment_codes))
    raise ValueError(f"Unexpected or missing sentiment labels: {unexpected}")

reviews['binary_sentiment_repres'] = encoded_sentiment.astype(int)
reviews.head()

In [None]:
# Bag-of-words vectorizer; max_features=100 caps the vocabulary at 100 terms.
vectorizer = CountVectorizer(max_features=100)

In [None]:
# You could put the whole bag of words into a sparse DataFrame, but that is not a good idea!
# bow = vectorizer.fit_transform(reviews.text_pt)
# df_bow = pd.DataFrame.sparse.from_spmatrix(bow, columns=vectorizer.get_feature_names())

In [None]:
# Shared tokenizer that splits text on whitespace only.
whiteSpaceTokenizer = tokenize.WhitespaceTokenizer()


def get_occurs_df(column):
    """Count how often each whitespace-separated token occurs in a text column.

    Every value of ``reviews[column]`` is joined into one space-separated
    string, which is then tokenized and counted.

    Args:
        column: Name of the text column of the global ``reviews`` frame.

    Returns:
        A DataFrame with one row per distinct token: ``words`` holds the
        token and ``occur`` holds its number of occurrences.
    """
    corpus = ' '.join(reviews[column])
    token_counts = FreqDist(whiteSpaceTokenizer.tokenize(corpus))
    return pd.DataFrame({
        'words': list(token_counts),
        'occur': list(token_counts.values()),
    })

In [None]:
def plot_samples(df, n=10):
    """Show a bar chart of the rows of ``df`` with the largest ``occur`` values.

    Args:
        df: DataFrame with a ``words`` column (x axis) and an ``occur``
            column (bar height).
        n: How many of the highest-``occur`` rows to plot.
    """
    top_rows = df.nlargest(n, 'occur')
    plt.figure(figsize=(12, 8))
    axes = sns.barplot(x="words", y="occur", data=top_rows, color='green')
    axes.set(ylabel='Count')
    plt.show()

In [None]:
# Token frequencies of the raw `text_pt` column; plot the 15 most frequent tokens.
df_occur = get_occurs_df('text_pt')
plot_samples(df_occur, n=15)

In [None]:
# Make sure the NLTK stop-word corpus is available locally, then load the Portuguese list.
nltk.download('stopwords')
irrelevants = stopwords.words("portuguese")

# Membership tests on a list scan it linearly for every token of every review;
# a frozenset gives constant-time lookups with identical filtering results.
irrelevant_lookup = frozenset(irrelevants)

# Rebuild each review keeping only the tokens that are not stop words.
# The comparison is exact (case-sensitive): tokens are not lower-cased first.
processed_sentece = [
    ' '.join(
        token
        for token in whiteSpaceTokenizer.tokenize(review_text)
        if token not in irrelevant_lookup
    )
    for review_text in reviews.text_pt
]

reviews['preprocess_1'] = processed_sentece

In [None]:
# Inspect the frame again, now including the `preprocess_1` column.
reviews.head()

In [None]:
# Token frequencies of the stop-word-filtered `preprocess_1` column; plot the 15 most frequent.
df_occur = get_occurs_df('preprocess_1')
plot_samples(df_occur, n=15)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Turn the stop-word-filtered reviews into a bag-of-words count matrix.
# NOTE(review): the vectorizer is fitted on every review before the train/test
# split, so its vocabulary is selected using test rows too — confirm this is acceptable.
bow = vectorizer.fit_transform(reviews['preprocess_1'])
print("Bag of words shape", bow.shape)

# Split features and labels with train_test_split's default proportions;
# random_state pins the shuffle so the split is reproducible.
trX, teX, trY, teY = train_test_split(
    bow,
    reviews['binary_sentiment_repres'].values.reshape(-1, 1),
    random_state=9,
)
print("Train and Test X shapes:", trX.shape, teX.shape)
print("Train and Test Y shapes:", trY.shape, teY.shape)

# The labels were split as a column vector, so flatten them to 1-D for fit/score.
regressor = LogisticRegression(solver="lbfgs")
regressor.fit(trX, trY.ravel())
print("acc:", regressor.score(teX, teY.ravel()))

In [None]:
def make_a_prediction(phrase):
    """Classify one phrase with the fitted vectorizer and model.

    Args:
        phrase: Free-text review to classify.

    Returns:
        ``'Positive'`` when the model's prediction for the phrase is truthy,
        otherwise ``'Negative'``.
    """
    features = vectorizer.transform([phrase])
    predicted = regressor.predict(features)
    if predicted:
        return 'Positive'
    return 'Negative'

In [None]:
# Interactive demo: classify phrases typed by the user until they stop.
# The loop ends on a blank line, on end of input, or on a kernel interrupt, so the
# cell can finish instead of blocking the rest of the notebook forever.
print('(Deixe em branco e pressione Enter para sair.)')
while True:
    try:
        phrase = input('Digite o que achou do Filme "De volta para o Futuro": ')
    except (EOFError, KeyboardInterrupt):
        # No stdin available (non-interactive run) or the user interrupted the cell.
        break
    if not phrase.strip():
        break
    print(make_a_prediction(phrase))

https://scikit-learn.org/stable/modules/model_persistence.html#persistence-example

In [None]:
# Save the fitted logistic-regression model to disk with joblib.
from joblib import dump, load
dump(regressor, 'sklearn-logistc-regressor-model.joblib') 

In [None]:
# Save the fitted vectorizer next to the model. The `with` block closes (and
# flushes) the file handle instead of leaving that to garbage collection.
with open("vectorizer.pickle", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)