In [1]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!pip install -U nltk
!python -m spacy download en_core_web_sm
!pip install scikit-learn

Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')




In [2]:
import numpy as np
import pandas as pd
import re
import spacy
import nltk

from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Análise do DataSet

In [3]:
imdb_df = pd.read_csv('..\data\imdb.csv', sep=';')
imdb_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
imdb_df.sentiment.unique()

array(['positive', 'negative'], dtype=object)

In [6]:
imdb_df['sentiment'].value_counts()

negative    25000
positive    24999
Name: sentiment, dtype: int64

In [None]:
imdb_df['processed_sentiment'] = imdb_df['sentiment'].apply(lambda sentiment: 1 if sentiment == 'positive' else 0)

### Sanitização dos dados de entrada

In [7]:
imdb_df['review'] = imdb_df['review'].apply(lambda x: re.sub("<[^>]*>", "",  x)) #Remove HTML tags
imdb_df['review'] = imdb_df['review'].apply(lambda x: re.sub("[^0-9a-zA-Z ]+", "",  x)) #Remove any none alphanumeric charather, less space
imdb_df['review'] = imdb_df['review'].apply(lambda x: x.lower()) #Lower case

### Tokenização e lemetização com Spacy

In [9]:
nlp = spacy.load("en_core_web_sm")
pos_filter = ['ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB'] #adjective, adverb, interjection, noun, pronoun, verd

In [10]:
def process_review(review: str) -> str:
    processed_words = nlp(review)
    return [word.lemma_.strip().lower() for word in processed_words if not word.is_stop and word.pos_ in pos_filter]

### Criação do modelo

In [11]:
tfidf = TfidfVectorizer(tokenizer = process_review)
classifier = LinearSVC()

In [12]:
X = imdb_df['review']
y = imdb_df['processed_sentiment']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_train.shape, X_test.shape

((39999,), (10000,))

In [14]:
clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])
clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function process_review at 0x00000106A18F5790>)),
                ('clf', LinearSVC())])

In [15]:
y_pred = clf.predict(X_test)

In [16]:
print(accuracy_score(y_test, y_pred))

0.8895


### Comparação com SentimentIntensityAnalyzer do NLTK

In [17]:
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()
sia.polarity_scores(imdb_df['review'][0])

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Johnn\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


{'neg': 0.199, 'neu': 0.744, 'pos': 0.057, 'compound': -0.9947}

In [18]:
sia_y_pred = np.array([1 if sia.polarity_scores(x_test)['compound'] >= 0 else 0 for x_test in X_test])

In [19]:
print(accuracy_score(y_test, sia_y_pred))

0.692


Acurácia do modelo criado utilizando Spacy para tokenização e SVM para o modelo preditivo: 0.8895
Acurácia do modelo criado utilizando SentimentIntensityAnalyzer do nltk: 0.692