In [13]:
import spacy
from spacy.lang.pt.examples import sentences
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#-----------------------------------------------Normalização----------------------------------------------------#

def setup_abbr():
    file = open("dic_portuguese.txt", encoding='utf-8')
    abbr_dict = {}

    for line in file:
        w = line.split(";")
        abbr_dict[w[0]] = w[1].replace("\n", "")
    file.close()

    return abbr_dict

def lemmatizer(doc_corrected):
    lemma_sentence = []
    for token in doc_corrected: 
        if token.pos_ == 'VERB':
            lemma = token.lemma_
            lemma_sentence.append(lemma)
        else:
            lemma_sentence.append(token.text)
    lemmatized_sentence = ' '.join(lemma_sentence)
    return lemmatized_sentence

def remove_stopword(lemmatized_sentence): 
    stop_words = set(stopwords.words('portuguese')+ \
    ["{user}", "{url}", "<br/>", "myfitnesspal", "sigaa", "neste"]) 
    stop_words.remove("não")
    stop_words.remove("sem")
    word_tokens = word_tokenize(lemmatized_sentence)
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    filtered_sentence = [] 
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w)
    result = ' '.join(filtered_sentence)
    return result

data = pd.read_csv('tst.csv')
nlp = spacy.load('pt_core_news_sm')
cln = []
abbr_dict = setup_abbr()

for i in range(len(data)):
    doc = nlp(data['comments'][i])
    doc_lower = doc.text.lower()
    doc_punctuation = re.sub('[^a-zãàáâéêíõóôúç \n]', ' ', doc_lower)
    doc_corrected = nlp(" ".join([abbr_dict.get(w, w) for w in doc_punctuation.split()])) 
    lemmatized_sentence = lemmatizer(doc_corrected)
    result = remove_stopword(lemmatized_sentence)
    cln.append(result)
clean = pd.DataFrame(data=np.array(cln), columns= ['CommentClean'])

#-----------------------------------------------Classificação---------------------------------------------------#

def pattern1(pru, tags1, tags2, h3):
    functionality = ''
    j = 0
    while (j < (len(pru)-3)):
        if pru[j].pos_ in tags1 and pru[j+1].pos_ in tags2 and pru[j+2].pos_ == 'VERB' and pru[j+3].pos_ == 'NOUN':
            func = [pru[j+2].text, pru[j+3].text]
            functionality = ' '.join(func)
            h3 = h3 + 1
            return functionality, h3
        else:
            j = j + 1
    return functionality, h3

def pattern2(pru, tags3, tags4, h1):
    functionality = ''
    j = 0
    while (j < (len(pru)-3)):
        if pru[j].pos_ in tags3 and pru[j+1].pos_ in tags4 and pru[j+2].pos_ == 'VERB' and pru[j+3].pos_ == 'NOUN':
            func = [pru[j+2].text, pru[j+3].text]
            functionality = ' '.join(func)
            h1 = h1+ 1
            return functionality, h1
        else:
            j = j + 1
    return functionality, h1

def pattern3(pru, tags5, h1):
    functionality = ''
    j = 0
    while (j < (len(pru)-3)):
        if pru[j].pos_ in tags5 and pru[j+1].pos_ == 'VERB' and pru[j+2].pos_ == 'NOUN':
            func = [pru[j+2].text, pru[j+3].text]
            functionality = ' '.join(func)
            h1 = h1 + 1
            return functionality, h1
        else:
            j = j + 1
    return functionality, h1

def hypothesis2(pru):
    j = 0
    while (j < (len(pru)-3)):
        if pru[j].text == 'não' and pru[j+1].text == 'conseguir':
            return 1
        else:
            j = j + 1
    return 0

functionalities = [] 

tags1 = ['ADV']
tags2 = ['VERB']
tags3 = ['VERB', 'DET', 'PROPN', 'NOUN', 'PRON', 'ADP', 'ADV'] 
tags4 = ['VERB', 'PROPN', 'NOUN', 'ADV', 'AUX', 'ADJ']
tags5 = ['ADJ', 'ADV', 'PROPN', 'NOUN', 'VERB']

h1 = 0
h2 = 0
h3 = 0

for i in range(len(cln)):
    pru = nlp(clean['CommentClean'][i])
    functionality = '' 
    if len(pru) > 3:
        functionality, h3 = pattern1(pru, tags1, tags2, h3) 
        if len(functionality) == 0:
            functionality, h1 = pattern2(pru, tags3, tags4, h1) 
        if len(functionality) == 0:
            functionality, h1 = pattern3(pru, tags5, h1) 
        if len(functionality) == 0:
            functionality = '-'
    else:
        functionality = '-'
    h2 = h2 + (hypothesis2(pru))
    functionalities.append(functionality)
h1 = h1 + h3

ser = pd.DataFrame(data=np.array(functionalities), index= range(len(data)), columns= ['Funcionalidade'])
df = pd.concat([data, ser], axis=1)
df.to_excel('Resultado.xlsx', index=False)

print("Quantidades de PRUS pertencentes a hipótese 1: " + str(h1))
print("Quantidades de PRUS pertencentes a hipótese 2: " + str(h2))
print("Quantidades de PRUS pertencentes a hipótese 3: " + str(h3))

Quantidades de PRUS pertencentes a hipótese 1: 17
Quantidades de PRUS pertencentes a hipótese 2: 9
Quantidades de PRUS pertencentes a hipótese 3: 8
