In [138]:
import pandas as pd

# Task 1: Cleaning the dataframe
##### - Create column 'avaliacoes'
##### - Drop NA values

In [133]:
def get_sentiment(rating):
    if rating < 3:
        return 'negativo'
    elif rating == 3:
        return 'neutro'
    elif rating > 3:
        return 'positivo'

df = pd.read_csv('./dataset/avaliacoes.csv')
df['sentiment'] = df.rating.apply(lambda rating : get_sentiment(rating))
df.head()

Unnamed: 0,review_text,rating,sentiment
0,"Estou muito satisfeito, o visor é melhor do qu...",4,positivo
1,"""muito boa\n\nO que gostei: preco\n\nO que não...",5,positivo
2,"Rápida, ótima qualidade de impressão e fácil d...",5,positivo
3,Produto de ótima qualidade em todos os quesito!,5,positivo
4,Precisava comprar uma tv compatível com meu dv...,5,positivo


In [140]:
print('Before drop NA:')
print(df.count())
#there is 1 missing value at 'review_text'
df.dropna(inplace=True)
print('-'*10)
print('After drop NA:')
print(df.count())

Before drop NA:
review_text    84990
rating         84991
sentiment      84991
dtype: int64
----------
After drop NA:
review_text    84990
rating         84990
sentiment      84990
dtype: int64


### Subtask 2: Evenly distributed dataframe
##### - Same quantity of positive, negative and neutral reviews

In [144]:
evenly_distrubute_quantity = df[df.sentiment == 'neutro'].count()[0]

evenly_distrubute_quantity
df_1 = df.groupby('sentiment', as_index=False, group_keys=False).apply(lambda x: x.sample(n=evenly_distrubute_quantity, random_state=1, replace=True))
df_1.head()

Unnamed: 0,review_text,rating,sentiment
63854,"Pense num arrependimento, samsung nunca mais. ...",2,negativo
2889,"NÃO SEI O QUE MAIS GOSTO, POIS TIREI-O DA CAIX...",2,negativo
49371,TENHO UM REFRIGERADOR SEXTO SENTIDO PAGUEI r$ ...,1,negativo
64356,"Aparentemente um produto muito bom, abrange as...",1,negativo
11507,"Um produto com pouca Qualidade, mas ideal pra ...",2,negativo


### Subtask 3: Review text to lower case

In [147]:
df_1['review_text'] = df_1['review_text'].str.lower()
df_1.head()

Unnamed: 0,review_text,rating,sentiment
63854,"pense num arrependimento, samsung nunca mais. ...",2,negativo
2889,"não sei o que mais gosto, pois tirei-o da caix...",2,negativo
49371,tenho um refrigerador sexto sentido paguei r$ ...,1,negativo
64356,"aparentemente um produto muito bom, abrange as...",1,negativo
11507,"um produto com pouca qualidade, mas ideal pra ...",2,negativo


# Task 2: Stopwords removal and stemming

In [176]:
import nltk

# nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('rslp')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iago1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\iago1\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping stemmers\rslp.zip.


True

In [173]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer

def get_tokenized(text):
    return word_tokenize(text)

### Stopwords
##### stopwords are words that we want to ignore because they dont altere the meaning of the phrase.

In [191]:
stop_words = stopwords.words('portuguese')

stop_words.remove('não')
# the word 'não' is very important to us

def get_text_stopworded(text):
    text_tokenized = get_tokenized(text)
    
    text_stopworded = []
    
    for word in text_tokenized:
        if word not in stop_words:
            text_stopworded.append(word)
    
    text_stopworded = (" ").join(text_stopworded)
    
    return text_stopworded

### Stemming
##### Stemming is the process of reducing inflected words. For example, the words "likes" and "liked" will be transformed into 'like" and our program will consider as the same meaning.

In [192]:
def get_text_stemmed(text):
    text_tokenized = get_tokenized(text)
    
    text_stemmed = []
    stemmer = RSLPStemmer()

    for word in text_tokenized:
        word_stemmed = stemmer.stem(word)
        text_stemmed.append(word_stemmed)
    
    text_stemmed = (" ").join(text_stemmed)
    
    return text_stemmed

### Getting the stopworded, stemmed and stopworded/stemmed columns to compare the accuracy of our model

In [193]:
df_2 = df_1.copy()

df_2['review_text_stopworded'] = df_2['review_text'].apply(lambda rev: get_text_stopworded(rev))
df_2['review_text_stemmed'] = df_2['review_text'].apply(lambda rev: get_text_stemmed(rev))
df_2['review_text_stopworded_stemmed'] = df_2['review_text_stopworded'].apply(lambda rev: get_text_stemmed(rev))
df_2.head()

Unnamed: 0,review_text,rating,sentiment,review_text_stopworded,review_text_stemmed,review_text_stopworded_stemmed
63854,"pense num arrependimento, samsung nunca mais. ...",2,negativo,"pense arrependimento , samsung nunca . querem ...","pens num arrepend , samsung nunc mais . quer c...","pens arrepend , samsung nunc . quer celul bom ..."
2889,"não sei o que mais gosto, pois tirei-o da caix...",2,negativo,"não sei gosto , pois tirei-o caixa não funcion...","não sei o que mais gost , poi tirei- da caix e...","não sei gost , poi tirei- caix não func aquec ..."
49371,tenho um refrigerador sexto sentido paguei r$ ...,1,negativo,"refrigerador sexto sentido paguei r $ 3.500,00...","tenh um refriger sext sent pag r $ 3.500,00 e ...","refriger sext sent pag r $ 3.500,00 deu defeit..."
64356,"aparentemente um produto muito bom, abrange as...",1,negativo,"aparentemente produto bom , abrange principais...","aparent um produt muit bom , abrang as princip...","aparent produt bom , abrang princip falh máqui..."
11507,"um produto com pouca qualidade, mas ideal pra ...",2,negativo,"produto pouca qualidade , ideal pra poucos rec...","um produt com pouc qual , mas ideal pra qu tem...","produt pouc qual , ideal pra pouc recurs porqu..."


#### Exporting the data

In [135]:
df_2.to_csv('./dataset/avaliacao_limpa.csv')

# Task 3: Creating the model

In [134]:
import pandas as pd
df_2 = pd.read_csv('./dataset/avaliacao_limpa.csv')

### Separating train and test data

In [95]:
from sklearn.model_selection import train_test_split
y = df_2['sentiment']

X = df_2['review_text']
X_stopworded = df_2['review_text_stopworded']
X_stemmed = df_2['review_text_stemmed']
X_stopworded_stemmed = df_2['review_text_stopworded_stemmed']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_stopworded, X_test_stopworded, y_train, y_test = train_test_split(X_stopworded, y, test_size=0.2, random_state=42)
X_train_stemmed, X_test_stemmed, y_train, y_test = train_test_split(X_stemmed, y, test_size=0.2, random_state=42)
X_train_stopworded_stemmed, X_test_stopworded_stemmed, y_train, y_test = train_test_split(X_stopworded_stemmed, y, test_size=0.2, random_state=42)

### Vectorizing the model

### Bag of words

In [81]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True)

X_train_vectors = vectorizer.fit_transform(X_train)
# Quantity of words we have
len(vectorizer.get_feature_names())

32236

##### Without data treatment

In [82]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True)

X_train_vectors = vectorizer.fit_transform(X_train)
# Quantity of words we have
len(vectorizer.get_feature_names())

clf_log = LogisticRegression(max_iter=300)
clf_log.fit(X_train_vectors, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=300)

In [83]:
X_test_vectors = vectorizer.transform(X_test)
clf_log.score(X_test_vectors, y_test)

0.7724006452559026

##### Stopworded

In [100]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_sw = CountVectorizer(binary=True)

X_train_stopworded_vectors = vectorizer_sw.fit_transform(X_train_stopworded.values.astype('U'))

clf_log_sw = LogisticRegression(max_iter=300)
clf_log_sw.fit(X_train_stopworded_vectors, y_train_stopworded)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=300)

In [101]:
X_test_stopworded_vectors = vectorizer_sw.transform(X_test_stopworded.values.astype('U'))
clf_log_sw.score(X_test_stopworded_vectors, y_test)

0.7652148408857604

In [52]:
X_test_stopworded_vectors = vectorizer.transform(X_test_stopworded.values.astype('U'))
clf_log.score(X_test_stopworded_vectors, y_test)x

0.7542161607273794

##### Stemmed

In [86]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_st = CountVectorizer(binary=True)

X_train_stemmed_vectors = vectorizer_st.fit_transform(X_train_stemmed)

clf_log_st = LogisticRegression(max_iter=300)
clf_log_st.fit(X_train_stemmed_vectors, y_train_stemmed)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=300)

In [87]:
X_test_stemmed_vectors = vectorizer_st.transform(X_test_stemmed)
clf_log_st.score(X_test_stemmed_vectors, y_test_stemmed)

0.738084763161754

##### Stopworded & Stemmed

In [108]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_sw_st = CountVectorizer(binary=True)

X_train_stopworded_stemmed_vectors = vectorizer_sw_st.fit_transform(X_train_stopworded_stemmed.values.astype('U'))

clf_log_sw_st = LogisticRegression(max_iter=300)
clf_log_sw_st.fit(X_train_stopworded_stemmed_vectors, y_train_stopworded_stemmed)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=300)

In [118]:
X_test_stopworded_stemmed_vectors = vectorizer_sw_st.transform(X_test_stopworded_stemmed.values.astype('U'))

clf_log_sw_st.score(X_test_stopworded_stemmed_vectors, y_test_stopworded_stemmed)

0.7344185364422936