# Sentiment analysis
Read https://web.stanford.edu/~jurafsky/slp3/19.pdf

### Using Lexicon

In [None]:
import nltk
# Fetch the NLTK resources this notebook needs, by name.
# 'stopwords' is read through nltk.corpus.stopwords further down; 'punkt' and
# 'rslp' are presumably needed by the Preprocessing helper - TODO confirm.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('rslp')

In [None]:
from nlputils.lexical import Preprocessing
from nltk.corpus import stopwords
import re

# NOTE: this rebinds the name `stopwords` from the imported corpus reader to
# the value returned for 'portuguese'. The reader is imported again later in
# the file, before the English list is built.
stopwords = stopwords.words('portuguese')
# Shared text-normalization helper used by the functions defined below.
normalizer = Preprocessing()

In [None]:
# Download the LIWC resource at http://143.107.183.175:21380/portlex/images/arquivos/liwc/LIWC2007_Portugues_win.dic.txt
# Category ids used here:
#   posemo = 126
#   negemo = 127
# TODO: which other categories are worth using?
positives = []
negatives = []

with open('../models/LIWC2007_Portugues_win.dic.txt', 'r', encoding='latin') as liwc_file:
    # Iterate the file lazily instead of loading every line up front.
    for line in liwc_file:
        # Lines that start with a number are skipped; only the remaining
        # lines are read as "<word> <label> <label> ...". The pattern is a
        # raw string so that `\d` is not an invalid string escape.
        if re.match(r'^\d+', line):
            continue
        parts = line.split()
        if not parts:
            # Blank line: there is no word to read.
            continue
        word, labels = parts[0], parts[1:]
        # Labels of positive words. A word carrying both ids is recorded as
        # positive only, because the negative check is an `elif`.
        if '126' in labels:
            positives.append(word)
        # Labels of negative words.
        elif '127' in labels:
            negatives.append(word)

In [None]:
# TODO: use other labels besides the ones above that may cover positive and negative words
# TODO: handle words containing '*'

In [None]:
# Spot check: is 'otimamente' among the collected positive words?
'otimamente' in positives

In [None]:
# Spot check: is 'triste' among the collected negative words?
'triste' in negatives

In [None]:
def lexical_sentment_analysis(text, binary=False):
    """Score the polarity of *text* by counting lexicon hits.

    The text goes through three helpers of the shared ``normalizer``
    (``remove_punctuation1``, ``tokenize_words``, ``remove_stopwords``), whose
    exact behavior is defined outside this file. Each resulting token found
    in ``positives`` contributes +1 and each token found in ``negatives``
    contributes -1. ``positives`` is consulted first, so a token present in
    both lists counts as positive.

    Args:
        text: Input string to score.
        binary: When true, reduce the score to its sign.

    Returns:
        The summed score, or -1 / 0 / 1 when ``binary`` is true.
    """
    cleaned = normalizer.remove_punctuation1(text)
    words = normalizer.remove_stopwords(normalizer.tokenize_words(cleaned))

    score = sum(
        1 if word in positives else -1 if word in negatives else 0
        for word in words
    )

    if binary:
        # Sign of the score: the bool subtraction yields -1, 0 or 1.
        return (score > 0) - (score < 0)
    return score

In [None]:
# Example call with the default binary=False, i.e. the summed score is returned.
lexical_sentment_analysis('Eu estou muito triste e triste')

### Training a classifier

In [None]:
# using dataset of IMDb, available at: http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
import wget
import os
import tarfile
import pandas as pd

filename = "../dataset/aclImdb_v1.tar.gz"
dataset_path = '../dataset/aclImdb'


# Download data

dataset_link = "http://ai.stanford.edu/~amaas/data/sentiment/{}".format("aclImdb_v1.tar.gz")

# exist_ok tolerates an existing folder without hiding unrelated OSErrors
# (e.g. permission problems) the way a blanket `except OSError: pass` does.
os.makedirs("../dataset", exist_ok=True)

# The check is on the extracted folder rather than on the archive, so an
# archive that is already on disk but was never unpacked still gets extracted.
if not os.path.isdir(dataset_path):
    if not os.path.isfile(filename):
        wget.download(dataset_link, out=filename)
    # NOTE(review): extractall trusts the member paths stored in the archive;
    # on Python versions that support it, consider passing filter='data'.
    with tarfile.open(filename, "r:gz") as tar:
        tar.extractall("../dataset")


# read data

def _list_review_files(subdir):
    """Return 'subdir/<name>' for every regular file in dataset_path/subdir, sorted."""
    folder = os.path.join(dataset_path, subdir)
    return sorted(
        subdir + '/' + name
        for name in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, name))
    )


train_positive_files = _list_review_files('train/pos')
train_negative_files = _list_review_files('train/neg')
test_positive_files = _list_review_files('test/pos')
test_negative_files = _list_review_files('test/neg')

# Plain concatenation: the four folders have distinct prefixes, so there is
# nothing to de-duplicate, and the row order is the same on every run (a set
# of strings can iterate in a different order in each interpreter session).
all_files = (train_positive_files + train_negative_files
             + test_positive_files + test_negative_files)

dataset = {'trainset':[], 'polarity':[], 'bin_polarity': [], 'review':[]}

for review_file in all_files:
    # The second '_'-separated field of the base name (extension dropped) is
    # parsed as the rating - presumably files are named "<id>_<rating>.txt".
    base_name = os.path.splitext(os.path.basename(review_file))[0]
    polarity = base_name.split('_')[1]
    # Explicit encoding so the result does not depend on the platform default
    # (assumes the corpus is UTF-8 - TODO confirm).
    with open(os.path.join(dataset_path, review_file), 'r', encoding='utf-8') as text_file:
        # Only the first line is kept; readline() gives '' for an empty file
        # instead of raising.
        review = text_file.readline()

    # First path component is the split name: 'train' or 'test'.
    dataset['trainset'].append(review_file.split('/')[0])
    bin_polarity = 1 if int(polarity) > 5 else 0  # transform into binary polarity
    dataset['bin_polarity'].append(bin_polarity)
    # The rating is stored as the string taken from the file name.
    dataset['polarity'].append(polarity)
    dataset['review'].append(review)


# create dataframe

dataframe = pd.DataFrame(data=dataset)
dataframe.head()

#### Preprocessing

In [None]:
from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')
# Set copy used for constant-time membership tests; the list above is kept
# unchanged for any code that relies on it.
_english_stopword_set = frozenset(english_stopwords)


def preprocessing(text):
    """Normalize *text* and return the kept tokens joined by single spaces.

    The text is passed through the shared ``normalizer`` helpers
    ``lowercase``, ``remove_punctuation`` and ``tokenize_words`` (defined
    outside this file), then every token listed in ``english_stopwords`` is
    dropped.

    Args:
        text: Input string.

    Returns:
        A string with the remaining tokens separated by one space.
    """
    text = normalizer.lowercase(text)
    text = normalizer.remove_punctuation(text)
    tokens = normalizer.tokenize_words(text)
    return ' '.join(
        token for token in tokens if token not in _english_stopword_set
    )

# Add a column holding the result of preprocessing() for every review.
dataframe['normalized_review'] = dataframe['review'].apply(preprocessing)
dataframe.head()

### feature extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Boolean row masks, computed once and reused for both columns of each split.
is_train = dataframe['trainset'] == 'train'
is_test = dataframe['trainset'] == 'test'

train_reviews = dataframe.loc[is_train, 'normalized_review'].values.tolist()
train_classes = dataframe.loc[is_train, 'bin_polarity'].values.tolist()
test_reviews = dataframe.loc[is_test, 'normalized_review'].values.tolist()
test_classes = dataframe.loc[is_test, 'bin_polarity'].values.tolist()

# The vectorizer is fitted on the training reviews only. fit_transform fits
# and transforms the training set in one pass instead of processing that
# corpus twice; the test reviews are transformed with the fitted vectorizer.
transformer = TfidfVectorizer()
X = transformer.fit_transform(train_reviews)
X_test = transformer.transform(test_reviews)

In [None]:
# TODO: try a learning approach that feeds the positive and negative reviews to the algorithm
# TODO: learn based on w2v

### training classifier

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Alternative configuration left disabled: SVC(kernel='linear')
# Fit an SVC with its default settings on the TF-IDF training matrix.
classifier = SVC()
classifier.fit(X, train_classes)

In [None]:
# Accuracy of the SVC predictions on the test split.
accuracy_score(test_classes, classifier.predict(X_test))

In [None]:
# Fit a logistic-regression classifier on the same TF-IDF training matrix.
classifier_lr = LogisticRegression()
# Option left disabled: n_jobs=-1
classifier_lr.fit(X, train_classes)

In [None]:
# Accuracy of the logistic-regression predictions on the test split.
accuracy_score(test_classes, classifier_lr.predict(X_test))

# using the classifier

In [None]:
sentence = "This film was really bad!"
preprocessed_sentence = preprocessing(sentence)
print(preprocessed_sentence)
# Vectorize the already-normalized sentence (as a one-element batch) instead
# of running preprocessing() on it a second time.
instance = transformer.transform([preprocessed_sentence])
print(instance)
# A predicted value of 0 means negative polarity.
classifier.predict(instance)

In [None]:
sentence = "Good film!"
preprocessed_sentence = preprocessing(sentence)
print(preprocessed_sentence)
# Vectorize the already-normalized sentence (as a one-element batch) instead
# of running preprocessing() on it a second time.
instance = transformer.transform([preprocessed_sentence])
print(instance)
classifier.predict(instance)