In [6]:
import os
import pandas as pd
import numpy as np

path = '../data/trainset/'

dataset = {'polarity':[], 'bin_polarity': [], 'review':[], 'set':[]}

probability = 0.8

for product in os.listdir(path):
    for score in os.listdir(path + product):
        for file in os.listdir(path + product + "/" + score + "/"):
            if file.endswith('.txt'):
                with open(path + product + "/" + score + "/" + file) as text_file:
                    for line in text_file.readlines():
                        dataset['polarity'].append(float(score))
                        dataset['bin_polarity'].append(0 if float(score) < 3.0 else 1)
                        dataset['review'].append(line)
                        dataset['set'].append('train' if np.random.rand() < probability else 'test')

In [7]:
# create dataframe

dataframe = pd.DataFrame(data=dataset)
dataframe.groupby('set').count()

Unnamed: 0_level_0,bin_polarity,polarity,review
set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
test,58119,58119,58119
train,232229,232229,232229


In [9]:
import re
words = []

with open('../models/LIWC2007_Portugues_win.dic.txt', 'r', encoding='latin') as liwc_file:
    in_header = True
    for line in liwc_file.readlines():
        if not re.match('^\d+', line):
            parts = line.split()
            word = parts.pop(0)
            if '126' in parts or '127' in parts:
                words.append(word)

In [12]:
from nlputils.lexical import Preprocessing
import spacy
spacy_nlp = spacy.load('../models/pt_core_news_sm-2.1.0')
stopwords = spacy.lang.pt.stop_words.STOP_WORDS
# print(stopwords)

cont = 0
for word in words:
    if word in stopwords:
        stopwords.remove(word)
        cont += 1

normalizer = Preprocessing()

def preprocessing(text):
    text = normalizer.lowercase(text)
    text = normalizer.remove_punctuation(text)
    tokens = normalizer.tokenize_words(text)
    tokens = [token for token in tokens if token not in stopwords]
    return ' '.join(tokens)

In [13]:
dataframe['normalized_review'] = dataframe['review'].apply(preprocessing)
dataframe.head()

Unnamed: 0,bin_polarity,polarity,review,set,normalized_review
0,0,1.0,"Produto de Ótimo acabamento, e melhor custo be...",train,produto ótimo acabamento e melhor custo benefí...
1,0,1.0,\n,train,
2,0,1.0,"O que gostei: Compacta, design moderno\n",train,o gostei compacta design moderno
3,0,1.0,\n,train,
4,0,1.0,O que não gostei: não possue controle de tempe...,train,o gostei possue controle temperatura automático


## Feature Extraction

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [13]:
train_reviews = dataframe[dataframe['set'] == 'train']['normalized_review'].values.tolist()
train_classes = dataframe[dataframe['set'] == 'train']['polarity'].values.tolist()
test_reviews = dataframe[dataframe['set'] == 'test']['normalized_review'].values.tolist()
test_classes = dataframe[dataframe['set'] == 'test']['polarity'].values.tolist()

transformer = TfidfVectorizer()
transformer.fit(train_reviews)
X = transformer.transform(train_reviews)
X_test = transformer.transform(test_reviews)

# cv = CountVectorizer(binary=True)
# cv.fit(train_reviews)
# X = cv.transform(train_reviews)
# X_test = cv.transform(test_reviews)

## Import sklearn

In [14]:
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

## Logistic Regression

In [15]:
classifier = LogisticRegression(n_jobs=4)
classifier.fit(X, train_classes)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=4, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
accuracy_score(test_classes, classifier.predict(X_test))

0.45648275862068965

In [None]:
# svr = SVR()
# svr.fit(X, train_classes)

## MLP

In [None]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(solver='sgd', hidden_layer_sizes=(25, 2), n_jobs=4)
# clf = MLPClassifier(solver='sgd'), n_jobs=4))

mlp.learning_rate_initial = 1

In [None]:
clf.fit(X, train_classes)

In [None]:
accuracy_score(test_classes, knn.predict(X_test))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(n_estimators=100,n_jobs=3)
rf.fit(X, train_classes)

In [None]:
accuracy_score(test_classes, rf.predict(X_test))

## KNN

In [10]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(15, weights='distance')
knn.fit(X, train_classes)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='distance')

In [11]:
accuracy_score(test_classes, knn.predict(X_test))

0.42149511864523104