<a href="https://colab.research.google.com/github/jrhumberto/cd/blob/main/002_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Fontes:
- https://www.linkedin.com/pulse/classifica%C3%A7%C3%A3o-de-textos-em-python-luiz-felipe-araujo-nunes/?originalSubdomain=pt
- https://github.com/luizfan/nlp

In [4]:
import nltk 
from nltk.stem import RSLPStemmer
nltk.download('stopwords')
nltk.download('rslp') #Stemming
nltk.download('punkt') #Tokenizacao

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
def Tokenize(sentence):
    sentence = sentence.lower()
    sentence = nltk.word_tokenize(sentence)
    return sentence

In [6]:
frase = Tokenize("Eu gosto de correr")
print(frase)

['eu', 'gosto', 'de', 'correr']


In [7]:
def Stemming(sentence):
    stemmer = RSLPStemmer()
    phrase = []
    for word in sentence:
        phrase.append(stemmer.stem(word.lower()))
    return phrase

In [8]:
frase1 = Tokenize("Eu gosto de correr")
frase2 = Tokenize("Eu gosto de corrida")
frase1 = Stemming(frase1)
frase2 = Stemming(frase2)
print(frase1)
print(frase2)

['eu', 'gost', 'de', 'corr']
['eu', 'gost', 'de', 'corr']


In [9]:
def RemoveStopWords(sentence):
    stopwords = nltk.corpus.stopwords.words('portuguese')
    phrase = []
    for word in sentence:
        if word not in stopwords:
            phrase.append(word)
    return phrase

In [10]:
frase1 = RemoveStopWords(frase1)
frase2 = RemoveStopWords(frase2)
print(frase1)
print(frase2)

['gost', 'corr']
['gost', 'corr']


In [11]:
#Base de exemplos
def Train():
    training_data = []
    training_data.append({"classe":"amor", "frase":"Eu te amo"})
    training_data.append({"classe":"amor", "frase":"Você é o amor da minha vida"})
    training_data.append({"classe":"medo", "frase":"estou com medo"})
    training_data.append({"classe":"medo", "frase":"tenho medo de fantasma"})
    print("%s frases incluidas" % len(training_data))
    return training_data

In [12]:
dados = Train()

4 frases incluidas


In [13]:
def Learning(training_data):
    corpus_words = {}
    for data in training_data:
        frase = data['frase']
        frase = Tokenize(frase)
        frase = Stemming(frase)
        frase = RemoveStopWords(frase)
        class_name = data['classe']
        if class_name not in list(corpus_words.keys()):
            corpus_words[class_name] = {}
        for word in frase:
            if word not in list(corpus_words[class_name].keys()):
                corpus_words[class_name][word] = 1
            else:
                corpus_words[class_name][word] += 1
    return corpus_words

In [14]:
dados = Learning(dados)
print(dados)

{'amor': {'amo': 1, 'voc': 1, 'am': 1, 'minh': 1, 'vid': 1}, 'medo': {'est': 1, 'med': 2, 'tenh': 1, 'fantasm': 1}}


In [15]:
def calculate_class_score(sentence,class_name):
    score = 0 
    sentence = Tokenize(sentence)
    sentence = Stemming(sentence)
    for word in sentence:
        if word in dados[class_name]:
            score += dados[class_name][word]
    return score

In [16]:
score = calculate_class_score("tenho medo de baratas","medo")
print(score)

3


In [17]:
def calculate_score(sentence):
    high_score = 0
    classname = 'default'
    for classe in dados.keys():
        pontos = 0
        pontos = calculate_class_score(sentence,classe)
        if pontos > high_score:
            high_score = pontos
            classname = classe
    return classname,high_score

In [18]:
print(calculate_score("eu amo aquela casa"))

('amor', 1)


In [22]:
# train.py
import nltk 
import re
import yaml
from nltk.stem import RSLPStemmer

stemmer = RSLPStemmer()
expression = '[!-@[-`{-¿ÆÐÑ×ØÝ-ßä-æëðñö-øý-ÿ]'

#Carrega o Corpus Words
def LoadMemory():
    fileW = open("words.nlp", 'r')
    words = fileW.read()
    fileW.close()
    words = yaml.load(words)
    return words

#Carrega as frases que foram treinadas
def LoadExamples():
    fileE = open("examples.nlp", 'r')
    examples = fileE.read()
    fileE.close()
    return examples

#Salva a corpus words
def SaveMemory(w):
    fileW = open("words.nlp", 'w')
    print("Salvando...")
    fileW.write(str(w))
    fileW.close()

#Salva as novas frases treinadas
def SaveExample(example):
    fileE = open("examples.nlp", 'a')
    fileE.write(example + "\n")
    fileE.close()

#Massa de dados para exemplo
def Examples():
    training_data = []
    training_data.append({"class":"saudade", "sentence":"sinto sua falta"})
    training_data.append({"class":"saudade", "sentence":"estou com saudades"})

    training_data.append({"class":"fome", "sentence":"estou com fome"})
    training_data.append({"class":"fome", "sentence":"to faminto"})

    training_data.append({"class":"medo", "sentence":"to com medo"})
    training_data.append({"class":"medo", "sentence":"tomei um susto"})

    Learning(training_data)

#Função responsavel por treinar a frase
def Learning(training_data):
    corpus_words = LoadMemory()

    for data in training_data:
        examples = LoadExamples()
        sentence = data['sentence']
        sentence = re.sub(expression, '', sentence)
        sentence = stemmer.stem(sentence.lower())


        if sentence in examples:
            continue
        
        SaveExample(sentence)
        sentence = nltk.word_tokenize(sentence)
        class_name = data['class']
        if class_name not in list(corpus_words.keys()):
            corpus_words[class_name] = {}
        for word in sentence:
            if word not in list(corpus_words[class_name].keys()):
                corpus_words[class_name][word] = 1
            else:
                corpus_words[class_name][word] += 1

    
SaveMemory(corpus_words)

In [20]:
# classification.py



import nltk 
import re
import yaml
from nltk.stem import RSLPStemmer

expression = '[!-@[-`{-¿ÆÐÑ×ØÝ-ßä-æëðñö-øý-ÿ]'
stemmer = RSLPStemmer()


#Carrega o Corpus Words
def LoadMemory():
    fileW = open("words.nlp", 'r')
    words = fileW.read()
    words = yaml.load(words)
    return words

#Função responsavel por calcular a pontuação por classe
def calculate_class_score(sentence,class_name):
    score = 0
    sentence = re.sub(expression, '', sentence)
    sentence = nltk.word_tokenize(sentence)
    for word in sentence:
        if stemmer.stem(word.lower()) in corpus_words[class_name]:
            score += corpus_words[class_name][stemmer.stem(word.lower())]
    return score

#Função responsavel por classificar a frase
def classifique(sentence):
    high_class = None
    high_score = 0
    for c in list(corpus_words.keys()):
        score = calculate_class_score(sentence, c)
        if score > high_score:
            high_class = c
            high_score = score

    print(str(high_class))
    return high_class

memory = LoadMemory()
corpus_words = memory


FileNotFoundError: ignored

# Arquivos Finais: utils.py, train.py, classify.py, server.py, calculator.py, answer.py

Files de textos:
- https://raw.githubusercontent.com/luizfan/nlp/master/text/answer.txt

- https://raw.githubusercontent.com/luizfan/nlp/master/text/corpus.txt

- https://raw.githubusercontent.com/luizfan/nlp/master/text/stopwords.txt

In [None]:
# answer.py
from utils import save_answer,load_answer

def return_answer(class_name):
    answers = load_answer()
    try:
        return answers[class_name]
    except:
        return 'Não entendi :('

def include_answer(class_name,answer):
    answers = load_answer()
    answers[class_name] = answer
    save_answer(answers)

In [None]:
# calculator.py
from utils import normalize,stemming,remove_stopwords,load_corpus

def calculate_class_score(sentence,class_name):
    score = 0 
    sentence = normalize(sentence)
    sentence = remove_stopwords(sentence)
    sentence = stemming(sentence)
    dados = load_corpus()
    for word in sentence:
        if word in dados[class_name]:
            score += dados[class_name][word]
    return score

def calculate_score(sentence):
    high_score = 0
    classname = 'default'
    dados = load_corpus()
    for classe in dados.keys():
        pontos = 0
        pontos = calculate_class_score(sentence,classe)
        if pontos > high_score:
            high_score = pontos
            classname = classe
    return {'classname':classname,'high_score':high_score}

In [None]:
# train.py
from utils import normalize,stemming,remove_stopwords,load_corpus

def learning(training_data):
    corpus_words = load_corpus()
    for data in training_data:
        phrase = data['phrase']
        phrase = normalize(phrase)
        phrase = remove_stopwords(phrase)
        phrase = stemming(phrase)

        class_name = data['class']
        if class_name not in list(corpus_words.keys()):
            corpus_words[class_name] = {}
        for word in phrase:
            if word not in list(corpus_words[class_name].keys()):
                corpus_words[class_name][word] = 1
            else:
                corpus_words[class_name][word] += 1
    return corpus_words


def sample():
    training_data = []
    training_data.append({"class":"amor", "phrase":"Eu te amo"})
    training_data.append({"class":"amor", "phrase":"Você é o amor da minha vida"})
    training_data.append({"class":"medo", "phrase":"estou com medo"})
    training_data.append({"class":"medo", "phrase":"tenho medo de fantasma"})
    training_data.append({"class":"fome", "phrase":"eu estou com fome"})
    training_data.append({"class":"fome", "phrase":"estou faminto"})
    print("%s phrases included" % len(training_data))
    return training_data

In [None]:
# utils.py
import yaml
import nltk
import unicode
import unicodedata
import codecs
from nltk.stem import RSLPStemmer

def normalize(sentence):
    sentence = sentence.lower()
    sentence = strip_accents(sentence)
    sentence = nltk.word_tokenize(sentence)
    return sentence

def strip_accents(sentence):
    try:
        sentence = unicode(sentence, 'utf-8')
    except NameError: # unicode is a default on python 3 
        pass

    sentence = unicodedata.normalize('NFD', sentence)\
           .encode('ascii', 'ignore')\
           .decode("utf-8")
    return str(sentence)

def remove_stopwords(sentence):
    stopwords = load_stopword()
    phrase = []
    for word in sentence:
        if word not in stopwords:
            phrase.append(word)
    return phrase

def stemming(sentence):
    stemmer = RSLPStemmer()
    phrase = []
    for word in sentence:
        phrase.append(stemmer.stem(word.lower()))
    return phrase

def save_corpus(w):
    fileW = codecs.open("text/corpus.txt", "w", "utf-8")
    fileW.write(str(w))
    fileW.close()

def load_corpus():
    with open("text/corpus.txt", "rb") as f:
        words = f.read().decode("UTF-8")
    words = yaml.load(words,Loader=yaml.FullLoader)
    if words is None:
        return {}
    return words

def load_stopword():
    with open('text/stopwords.txt', 'r') as f:
        stopwords = [strip_accents(line.strip()) for line in f] 
    return stopwords

def save_answer(w):
    fileW = codecs.open("text/answer.txt", "w", "utf-8")
    fileW.write(str(w))
    fileW.close()

def load_answer():
    with open("text/answer.txt", "rb") as f:
        answer = f.read().decode("UTF-8")
    answer = yaml.load(answer,Loader=yaml.FullLoader)
    if answer is None:
        return {}
    return answer

In [24]:
# server.py
'''
from flask import Flask,json,request
from train import learning,sample
from utils import save_corpus,normalize,remove_stopwords,stemming
from answer import return_answer,include_answer
from calculator import calculate_score

app = Flask(__name__)

@app.route("/", methods = ['GET'])
def health_check():
    return create_response(200,{"status":"UP"})

@app.route("/train", methods = ['GET'])
def train_with_examples():
    save_corpus(learning(sample()))
    return create_response(200,{"status":"sample phrases included"})

@app.route("/train", methods = ['POST'])
def train():
    phrase = request.form.get('phrase')
    class_name = request.form.get('class')
    save_corpus(learning([{'class':class_name,'phrase':phrase}]))
    return create_response(200,{"status":"phrase included"})

@app.route("/classify", methods = ['GET'])
def classify():
    phrase = request.form.get('phrase')
    return create_response(200,calculate_score(phrase))

@app.route("/chat", methods = ['GET'])
def chat():
    phrase = request.form.get('phrase')
    return create_response(200,{'answer':return_answer(calculate_score(phrase)['classname'])})

@app.route("/answer", methods = ['POST'])
def save_answer():
    answer = request.form.get('answer')
    classname = request.form.get('class')
    include_answer(classname,answer)
    return create_response(200,{"status":"answer included"})

@app.route("/normalize", methods = ['GET'])
def return_normalize():
    phrase = request.form.get('phrase')
    return create_response(200,{"phrase":normalize(phrase)})

@app.route("/stopwords", methods = ['GET'])
def return_remove_stopwords():
    phrase = request.form.get('phrase')
    return create_response(200,{"phrase":remove_stopwords(normalize(phrase))})

@app.route("/stemming", methods = ['GET'])
def return_stemming():
    phrase = request.form.get('phrase')
    return create_response(200,{"phrase":stemming(remove_stopwords(normalize(phrase)))})

def create_response(statusCode, data):
    response = app.response_class(
        response=json.dumps(data),
        status=statusCode,
        mimetype='application/json'
    )
    return response

app.run(host='127.0.0.1', port=8081)
'''

'\nfrom flask import Flask,json,request\nfrom train import learning,sample\nfrom utils import save_corpus,normalize,remove_stopwords,stemming\nfrom answer import return_answer,include_answer\nfrom calculator import calculate_score\n\napp = Flask(__name__)\n\n@app.route("/", methods = [\'GET\'])\ndef health_check():\n    return create_response(200,{"status":"UP"})\n\n@app.route("/train", methods = [\'GET\'])\ndef train_with_examples():\n    save_corpus(learning(sample()))\n    return create_response(200,{"status":"sample phrases included"})\n\n@app.route("/train", methods = [\'POST\'])\ndef train():\n    phrase = request.form.get(\'phrase\')\n    class_name = request.form.get(\'class\')\n    save_corpus(learning([{\'class\':class_name,\'phrase\':phrase}]))\n    return create_response(200,{"status":"phrase included"})\n\n@app.route("/classify", methods = [\'GET\'])\ndef classify():\n    phrase = request.form.get(\'phrase\')\n    return create_response(200,calculate_score(phrase))\n\n@app