# Aula 11 - Processamento de Linguagem Natural

Na aula de hoje, vamos explorar os seguintes tópicos em Python:

- 1) Dados Estruturados e Não Estruturados
- 2) Introdução a NLP
- 3) Processamento de Textos
- 4) Exercícios
- 5) Curva ROC-AUC

<img src="https://i1.wp.com/thedatascientist.com/wp-content/uploads/2018/09/data_science_wordcloud.png?fit=1584%2C1008&ssl=1" width=800>

##   

## Exercícios

**1)** Usando a base *spamham.csv*, faça o processamento dos textos aplicando as limpezas necessárias. Tente levantar o vocabulário dos e-mails e imprima as 10 palavras mais frequentes deste dataset.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('rslp')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
from nltk.stem import SnowballStemmer
import re

[nltk_data] Downloading package punkt to C:\Users\ITX
[nltk_data]     Gamer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\ITX
[nltk_data]     Gamer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to C:\Users\ITX
[nltk_data]     Gamer\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [None]:
# Load the spam/ham CSV from the local ./datasets folder into a DataFrame
spamham = pd.read_csv('./datasets/spamham.csv')

In [None]:
# Preview the first rows of the loaded DataFrame
spamham.head()

In [4]:
# Count how many rows carry each distinct value of the 'target' column
spamham['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [5]:
# Import the NLTK corpus reader under an alias so that binding the name
# `stopwords` to a set below never shadows it. Without the alias, re-running
# this cell (or the identical cell further down) fails with
# "'set' object has no attribute 'words'".
from nltk.corpus import stopwords as nltk_stopwords

# English stop words as a set; `preprocessing` reads this global by name.
stopwords = set(nltk_stopwords.words('english'))

# Porter stemmer instance used by `preprocessing`.
stemmer = PorterStemmer()

# Pipeline - Text Preprocessing
def preprocessing(string):
    """Clean a raw text and return its list of stemmed tokens.

    Steps, in order: drop digits, replace every run of characters other
    than ASCII letters and digits with a single space, lowercase, tokenize
    with ``word_tokenize``, discard tokens present in the global
    ``stopwords`` and stem the remaining ones with the global ``stemmer``.

    Parameters
    ----------
    string : str
        Text to be processed.

    Returns
    -------
    list
        Stemmed tokens, in the order they appear in the text.
    """
    # Optional cleaning steps, currently disabled:
    #   Remove links with http/https
    #     string = re.sub(r'http\S+', '', string, flags=re.MULTILINE)
    #   Remove hashtags
    #     string = re.sub(r'#(\w+)', '', string, flags=re.MULTILINE)
    #   Remove mentions
    #     string = re.sub(r'@(\w+)', '', string, flags=re.MULTILINE)

    # Strip digits first, then collapse anything that is not a letter/digit
    without_digits = re.sub(r'\d', '', string)
    only_words = re.sub(r"[^a-zA-Z0-9]+", ' ', without_digits)

    # Lowercase before tokenizing so tokens can be compared to the stop words
    tokens = word_tokenize(only_words.lower())

    # Drop stop words, then reduce every remaining token to its stem
    kept = [token for token in tokens if token not in stopwords]
    return [stemmer.stem(token) for token in kept]

In [6]:
# Run the cleaning pipeline on every text; each row receives its token list
spamham["filtered_words"] = spamham['text'].apply(preprocessing)

# Join each row's tokens back into one space-separated string
spamham['join_words'] = spamham['filtered_words'].apply(' '.join)

In [7]:
# Preview the first rows, now including the 'filtered_words' and 'join_words' columns
spamham.head()

Unnamed: 0,id,keyword,location,text,target,filtered_words,join_words
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deed, reason, earthquak, may, allah, forgiv, us]",deed reason earthquak may allah forgiv us
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, la, rong, sask, canada]",forest fire near la rong sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,"[resid, ask, shelter, place, notifi, offic, ev...",resid ask shelter place notifi offic evacu she...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[peopl, receiv, wildfir, evacu, order, califor...",peopl receiv wildfir evacu order california
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, rubi, alaska, smoke, wildfi...",got sent photo rubi alaska smoke wildfir pour ...


In [None]:
from collections import Counter

# Tally every token of every row in a single pass. Counter updates a hash
# table per token instead of scanning a list of already-seen words.
# Single-character tokens are skipped (they can show up after stemming).
contagem = Counter(
    palavra
    for frase in spamham['join_words']
    for palavra in frase.split()
    if len(palavra) > 1
)

# Vocabulary as [word, count] pairs, in the order words were first seen
vocabulario = [[palavra, total] for palavra, total in contagem.items()]

print("\nO vocabulário é formado por N =", len(vocabulario), "palavras!")

# Build a DataFrame from the vocabulary, most frequent words first
vocab_count = pd.DataFrame(vocabulario, columns=["palavra", "count"])
vocab_count = vocab_count.sort_values("count", ascending=False)

print("\nTemos a seguir as 10 mais comuns, com as respectivas contagens:")
display(vocab_count.head(10))

## 

**2)** Utilizando os dados de tweets, vamos avaliar se os tweets são de desastres ou não. Essa base é um dataset conhecido do Kaggle, sobre o qual vocês podem ter mais detalhes [clicando aqui](https://www.kaggle.com/c/nlp-getting-started/overview). Faça o processamento dos textos aplicando as limpezas necessárias. Tente levantar o vocabulário dos tweets e imprima as 10 palavras mais frequentes deste dataset.

In [2]:
# Load the tweets CSV from the local ./datasets folder into a DataFrame
tweets = pd.read_csv('./datasets/tweets.csv')

In [3]:
# Preview the first rows of the loaded DataFrame
tweets.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Count how many rows carry each distinct value of the 'target' column
tweets['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [5]:
# Import the NLTK corpus reader under an alias so that binding the name
# `stopwords` to a set below never shadows it. An earlier cell already
# rebinds `stopwords` to a set, so calling `stopwords.words(...)` here would
# fail with "'set' object has no attribute 'words'".
from nltk.corpus import stopwords as nltk_stopwords

# English stop words as a set; `preprocessing` reads this global by name.
stopwords = set(nltk_stopwords.words('english'))

# Porter stemmer instance used by `preprocessing`.
stemmer = PorterStemmer()

# Pipeline - Text Preprocessing
def preprocessing(string):
    """Clean a raw text and return its list of stemmed tokens.

    Steps, in order: drop digits, replace every run of characters other
    than ASCII letters and digits with a single space, lowercase, tokenize
    with ``word_tokenize``, discard tokens present in the global
    ``stopwords`` and stem the remaining ones with the global ``stemmer``.

    Parameters
    ----------
    string : str
        Text to be processed.

    Returns
    -------
    list
        Stemmed tokens, in the order they appear in the text.
    """
    # Optional cleaning steps, currently disabled:
    #   Remove links with http/https
    #     string = re.sub(r'http\S+', '', string, flags=re.MULTILINE)
    #   Remove hashtags
    #     string = re.sub(r'#(\w+)', '', string, flags=re.MULTILINE)
    #   Remove mentions
    #     string = re.sub(r'@(\w+)', '', string, flags=re.MULTILINE)

    # Strip digits first, then collapse anything that is not a letter/digit
    without_digits = re.sub(r'\d', '', string)
    only_words = re.sub(r"[^a-zA-Z0-9]+", ' ', without_digits)

    # Lowercase before tokenizing so tokens can be compared to the stop words
    tokens = word_tokenize(only_words.lower())

    # Drop stop words, then reduce every remaining token to its stem
    kept = [token for token in tokens if token not in stopwords]
    return [stemmer.stem(token) for token in kept]

In [6]:
# Run the cleaning pipeline on every tweet; each row receives its token list
tweets["filtered_words"] = tweets['text'].apply(preprocessing)

# Join each row's tokens back into one space-separated string
tweets['join_words'] = tweets['filtered_words'].apply(' '.join)

In [7]:
# Preview the first rows, now including the 'filtered_words' and 'join_words' columns
tweets.head()

Unnamed: 0,id,keyword,location,text,target,filtered_words,join_words
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deed, reason, earthquak, may, allah, forgiv, us]",deed reason earthquak may allah forgiv us
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, la, rong, sask, canada]",forest fire near la rong sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,"[resid, ask, shelter, place, notifi, offic, ev...",resid ask shelter place notifi offic evacu she...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[peopl, receiv, wildfir, evacu, order, califor...",peopl receiv wildfir evacu order california
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, rubi, alaska, smoke, wildfi...",got sent photo rubi alaska smoke wildfir pour ...


In [None]:
from collections import Counter

# Tally every token of every row in a single pass. Counter updates a hash
# table per token instead of scanning a list of already-seen words.
# Single-character tokens are skipped (they can show up after stemming).
contagem = Counter(
    palavra
    for frase in tweets['join_words']
    for palavra in frase.split()
    if len(palavra) > 1
)

# Vocabulary as [word, count] pairs, in the order words were first seen
vocabulario = [[palavra, total] for palavra, total in contagem.items()]

print("\nO vocabulário é formado por N =", len(vocabulario), "palavras!")

# Build a DataFrame from the vocabulary, most frequent words first
vocab_count = pd.DataFrame(vocabulario, columns=["palavra", "count"])
vocab_count = vocab_count.sort_values("count", ascending=False)

print("\nTemos a seguir as 10 mais comuns, com as respectivas contagens:")
display(vocab_count.head(10))

##   