## Experimentos de classificação com a planilha de oportunidades em texto

https://github.com/jairoalves/mcti

In [1]:
import pandas as pd
import re

In [2]:
arq = r"C:\Users\u495\GitHub\MCTI\NLP\dados\entrada\oportunidades_classificacao.xlsx"

In [3]:
df = pd.read_excel(arq)

In [4]:
df.head(14)

Unnamed: 0,link,opo_brazil,clas,opo_titulo,opo_deadline,codigo,opo_texto,opo_tipo,atualizacao
0,https://royalsociety.org/grants-schemes-awards...,N,N,APEX Award,28 October 2021,royal_210223_01_014,Skip to contentGo\n Sea...,other,210223
1,https://royalsociety.org/grants-schemes-awards...,N,N,Dorothy Hodgkin Fellowship,10 November 2021,royal_210223_01_015,Skip to contentGo\n Sea...,fellowship,210223
2,https://royalsociety.org/grants-schemes-awards...,N,N,International Exchanges Scheme Standard round one,10 March 2021,royal_210223_01_001,Skip to contentGo\n Sea...,other,210223
3,https://royalsociety.org/grants-schemes-awards...,N,N,JSPS Postdoctoral Fellowship,10 March 2021,royal_210223_01_002,Skip to contentGo\n Sea...,fellowship,210223
4,https://royalsociety.org/grants-schemes-awards...,N,N,Lisa Jardine Grant scheme round one,17 March 2021,royal_210223_01_000,Skip to contentGo\n Sea...,grant,210223
5,https://royalsociety.org/grants-schemes-awards...,N,Y,Royal Society Wolfson Fellowship round one,17 March 2021,royal_210223_01_003,Skip to contentGo\n Sea...,fellowship,210223
6,https://royalsociety.org/grants-schemes-awards...,N,Y,Royal Society Wolfson Visiting Fellowship roun...,17 March 2021,royal_210223_01_004,Skip to contentGo\n Sea...,fellowship,210223
7,https://royalsociety.org/grants-schemes-awards...,N,N,Short Industry Fellowship round one,27 May 2021,royal_210223_01_006,Skip to contentGo\n Sea...,fellowship,210223
8,https://royalsociety.org/grants-schemes-awards...,N,Y,University Research Fellowship,7 September 2021,royal_210223_01_011,Skip to contentGo\n Sea...,fellowship,210223
9,https://wellcome.org/grant-funding/schemes/bio...,Y,Y,Biomedical Resource Grants,Preliminary application deadline12 January 202...,wellcome_210304_01_028,Grant Funding/Schemes/ ...,grant,210304


### Variáveis de interesse

In [5]:
# Coleta das variáveis de interesse
X = df[['opo_texto']].copy()
y = df['clas'].copy()

### Pré-processamento

In [6]:
class Sentenca():

    def __init__(self, sentenca):
        self.sent_bruta = sentenca
        self.preproc()
    
    def remove_caracteres_nao_alfanumericos(self):
        # padroes para trechos nao alfanumericos
        ptn_nao_alfanum = r"[\W+]"
        self.sent_preproc = re.sub(ptn_nao_alfanum, ' ', self.sent_bruta)
    
    def remove_espacos_multiplos(self):
        ptn_espacos_mult = r"\s+"  
        self.sent_preproc = re.sub(ptn_espacos_mult, ' ', self.sent_preproc)
        self.sent_preproc = self.sent_preproc.strip()
    
    def remove_b_inicial(self):
        if self.sent_preproc.startswith('b '):
            self.sent_preproc = self.sent_preproc[2:]
    
    def separa_palavras_coladas(self):
        """Separa com espaço palavras coladas, aqui definido quando uma letra 
        minúscula está colada com uma maiúscula imediatalmente posterior"""
        ptn_ltr_minusc_colada_maiuscula = r'([a-z])([A-Z])'
        ptn_algarismo_colado_maiuscula = r'([0-9])([A-Z])'
        
        self.sent_preproc = re.sub(ptn_ltr_minusc_colada_maiuscula, r'\1 \2', self.sent_preproc)
        self.sent_preproc = re.sub(ptn_algarismo_colado_maiuscula, r'\1 \2', self.sent_preproc)
    
    def preproc(self):
        self.sent_preproc = ''
        self.remove_caracteres_nao_alfanumericos()
        self.remove_espacos_multiplos()
        self.remove_b_inicial()
        self.separa_palavras_coladas()
        self.sent_preproc = self.sent_preproc.lower()
        
        return self.sent_preproc
    
    def __getitem__(self, indices):
        return ''.join(self.sent_preproc[indices])
    
    def __str__(self):
        return str(self.sent_preproc)
    
    def __repr__(self):
        return self.sent_preproc

In [7]:
X['opo_texto_preproc'] = X['opo_texto'].apply(Sentenca)

In [8]:
X['opo_texto_preproc'].iloc[0][:306]

'skip to content go search royal society org the royal society venue hire contact us fellow login search show navigation home fellows events grants schemes awards topics policy journals collections about us what s new search the fellows directory search for past fellows about elections biographical memoirs'

In [9]:
X.head(20)

Unnamed: 0,opo_texto,opo_texto_preproc
0,Skip to contentGo\n Sea...,skip to content go search royal society org th...
1,Skip to contentGo\n Sea...,skip to content go search royal society org th...
2,Skip to contentGo\n Sea...,skip to content go search royal society org th...
3,Skip to contentGo\n Sea...,skip to content go search royal society org th...
4,Skip to contentGo\n Sea...,skip to content go search royal society org th...
5,Skip to contentGo\n Sea...,skip to content go search royal society org th...
6,Skip to contentGo\n Sea...,skip to content go search royal society org th...
7,Skip to contentGo\n Sea...,skip to content go search royal society org th...
8,Skip to contentGo\n Sea...,skip to content go search royal society org th...
9,Grant Funding/Schemes/ ...,grant funding schemes current page biomedical ...


### Stopwords

In [10]:
#import nltk
#nltk.download('stopwords')

In [11]:
from nltk.corpus import stopwords

In [12]:
stop_ingles = stopwords.words('english')

In [13]:
def remove_stopwords(sentenca):
    tokens = str(sentenca).split(' ')
    tokens_sem_stops = [token for token in tokens if token not in stop_ingles]
    return ' '.join(tokens_sem_stops)

In [14]:
X['opo_texto_sem_stop'] = X['opo_texto_preproc'].apply(remove_stopwords)

In [15]:
X.head(20)

Unnamed: 0,opo_texto,opo_texto_preproc,opo_texto_sem_stop
0,Skip to contentGo\n Sea...,skip to content go search royal society org th...,skip content go search royal society org royal...
1,Skip to contentGo\n Sea...,skip to content go search royal society org th...,skip content go search royal society org royal...
2,Skip to contentGo\n Sea...,skip to content go search royal society org th...,skip content go search royal society org royal...
3,Skip to contentGo\n Sea...,skip to content go search royal society org th...,skip content go search royal society org royal...
4,Skip to contentGo\n Sea...,skip to content go search royal society org th...,skip content go search royal society org royal...
5,Skip to contentGo\n Sea...,skip to content go search royal society org th...,skip content go search royal society org royal...
6,Skip to contentGo\n Sea...,skip to content go search royal society org th...,skip content go search royal society org royal...
7,Skip to contentGo\n Sea...,skip to content go search royal society org th...,skip content go search royal society org royal...
8,Skip to contentGo\n Sea...,skip to content go search royal society org th...,skip content go search royal society org royal...
9,Grant Funding/Schemes/ ...,grant funding schemes current page biomedical ...,grant funding schemes current page biomedical ...


In [16]:
print(X['opo_texto_sem_stop'][0][:302], '\n...')

skip content go search royal society org royal society venue hire contact us fellow login search show navigation home fellows events grants schemes awards topics policy journals collections us new search fellows directory search past fellows elections biographical memoirs public events scientific meet 
...


### Tokenização

In [17]:
#import nltk
#nltk.download('punkt')

In [18]:
from nltk.tokenize import word_tokenize

In [19]:
X['opo_texto_tokens'] = X['opo_texto_sem_stop'].apply(word_tokenize)

In [20]:
print(X['opo_texto_tokens'].iloc[0][:50], '\n...')

['skip', 'content', 'go', 'search', 'royal', 'society', 'org', 'royal', 'society', 'venue', 'hire', 'contact', 'us', 'fellow', 'login', 'search', 'show', 'navigation', 'home', 'fellows', 'events', 'grants', 'schemes', 'awards', 'topics', 'policy', 'journals', 'collections', 'us', 'new', 'search', 'fellows', 'directory', 'search', 'past', 'fellows', 'elections', 'biographical', 'memoirs', 'public', 'events', 'scientific', 'meetings', 'summer', 'science', 'online', 'planet', 'grants', 'awards', 'training'] 
...


### Lematização

In [21]:
#import nltk
#nltk.download('wordnet')

In [22]:
from nltk.stem import WordNetLemmatizer

In [23]:
wordnet = WordNetLemmatizer()

In [24]:
def lematiza_tokens(tokens):
    return [wordnet.lemmatize(token) for token in tokens]

In [25]:
# lematização dos tokens
X['opo_texto_tokens_lem'] = X['opo_texto_tokens'].apply(lematiza_tokens)
X['opo_texto_sem_stop_lem'] = X['opo_texto_tokens_lem'].apply(lambda l: ' '.join(l))

Tokenização e Lematização:

In [26]:
X[['opo_texto_tokens', 'opo_texto_tokens_lem', 'opo_texto_sem_stop_lem']].head(15)

Unnamed: 0,opo_texto_tokens,opo_texto_tokens_lem,opo_texto_sem_stop_lem
0,"[skip, content, go, search, royal, society, or...","[skip, content, go, search, royal, society, or...",skip content go search royal society org royal...
1,"[skip, content, go, search, royal, society, or...","[skip, content, go, search, royal, society, or...",skip content go search royal society org royal...
2,"[skip, content, go, search, royal, society, or...","[skip, content, go, search, royal, society, or...",skip content go search royal society org royal...
3,"[skip, content, go, search, royal, society, or...","[skip, content, go, search, royal, society, or...",skip content go search royal society org royal...
4,"[skip, content, go, search, royal, society, or...","[skip, content, go, search, royal, society, or...",skip content go search royal society org royal...
5,"[skip, content, go, search, royal, society, or...","[skip, content, go, search, royal, society, or...",skip content go search royal society org royal...
6,"[skip, content, go, search, royal, society, or...","[skip, content, go, search, royal, society, or...",skip content go search royal society org royal...
7,"[skip, content, go, search, royal, society, or...","[skip, content, go, search, royal, society, or...",skip content go search royal society org royal...
8,"[skip, content, go, search, royal, society, or...","[skip, content, go, search, royal, society, or...",skip content go search royal society org royal...
9,"[grant, funding, schemes, current, page, biome...","[grant, funding, scheme, current, page, biomed...",grant funding scheme current page biomedical r...


### Bag of Words

In [27]:
from collections import Counter

In [28]:
X['opo_texto_bow'] = X['opo_texto_tokens'].apply(Counter)
X['opo_texto_bow_lem'] = X['opo_texto_tokens_lem'].apply(Counter)

Bag of words com e sem lematização

In [29]:
X[['opo_texto_tokens', 'opo_texto_bow', 'opo_texto_bow_lem']].head(15)

Unnamed: 0,opo_texto_tokens,opo_texto_bow,opo_texto_bow_lem
0,"[skip, content, go, search, royal, society, or...","{'skip': 1, 'content': 3, 'go': 2, 'search': 1...","{'skip': 1, 'content': 3, 'go': 2, 'search': 1..."
1,"[skip, content, go, search, royal, society, or...","{'skip': 1, 'content': 3, 'go': 2, 'search': 1...","{'skip': 1, 'content': 3, 'go': 2, 'search': 1..."
2,"[skip, content, go, search, royal, society, or...","{'skip': 1, 'content': 3, 'go': 2, 'search': 1...","{'skip': 1, 'content': 3, 'go': 2, 'search': 1..."
3,"[skip, content, go, search, royal, society, or...","{'skip': 1, 'content': 3, 'go': 1, 'search': 1...","{'skip': 1, 'content': 3, 'go': 1, 'search': 1..."
4,"[skip, content, go, search, royal, society, or...","{'skip': 1, 'content': 4, 'go': 1, 'search': 1...","{'skip': 1, 'content': 4, 'go': 1, 'search': 1..."
5,"[skip, content, go, search, royal, society, or...","{'skip': 1, 'content': 3, 'go': 2, 'search': 1...","{'skip': 1, 'content': 3, 'go': 2, 'search': 1..."
6,"[skip, content, go, search, royal, society, or...","{'skip': 1, 'content': 3, 'go': 2, 'search': 1...","{'skip': 1, 'content': 3, 'go': 2, 'search': 1..."
7,"[skip, content, go, search, royal, society, or...","{'skip': 1, 'content': 3, 'go': 2, 'search': 1...","{'skip': 1, 'content': 3, 'go': 2, 'search': 1..."
8,"[skip, content, go, search, royal, society, or...","{'skip': 1, 'content': 3, 'go': 2, 'search': 1...","{'skip': 1, 'content': 3, 'go': 2, 'search': 1..."
9,"[grant, funding, schemes, current, page, biome...","{'grant': 34, 'funding': 15, 'schemes': 4, 'cu...","{'grant': 47, 'funding': 15, 'scheme': 9, 'cur..."


### Mapeamento do Corpus em Dicionário

Vamos passar a usar números para representar cada token, por meio da criação de um `dicionario_corpus`.

In [30]:
from gensim.corpora.dictionary import Dictionary

In [31]:
dicionario_corpus = Dictionary(X['opo_texto_tokens'].tolist() + X['opo_texto_tokens_lem'].tolist())

Resultado do mapeamento:

In [32]:
print('Dicionario do corpus:\n\n',
      {k: v for i, (k, v) in enumerate(dicionario_corpus.token2id.items()) if i < 80}, '\n...', sep='')

Dicionario do corpus:

{'00': 0, '000': 1, '000â': 2, '1': 3, '10': 4, '100': 5, '17': 6, '173': 7, '19': 8, '1kb': 9, '20': 10, '2017': 11, '2018': 12, '2019â': 13, '2020': 14, '2020â': 15, '2021': 16, '207': 17, '207043': 18, '24': 19, '25': 20, '2500': 21, '2666': 22, '28': 23, '295': 24, '3pm': 25, '44': 26, '451': 27, '5': 28, '6': 29, '7': 30, '7451': 31, '7kb': 32, '8kb': 33, '9': 34, '92': 35, 'academies': 36, 'academiesâ': 37, 'academy': 38, 'access': 39, 'across': 40, 'activities': 41, 'addition': 42, 'adjustment': 43, 'administrative': 44, 'advancing': 45, 'ag': 46, 'also': 47, 'apex': 48, 'applicant': 49, 'applicants': 50, 'application': 51, 'applications': 52, 'apply': 53, 'applying': 54, 'area': 55, 'around': 56, 'articles': 57, 'authors': 58, 'available': 59, 'award': 60, 'awards': 61, 'bank': 62, 'based': 63, 'benefit': 64, 'biographical': 65, 'blog': 66, 'book': 67, 'boundary': 68, 'brexit': 69, 'british': 70, 'browser': 71, 'call': 72, 'carlton': 73, 'catalogues': 74,

Exemplo de consulta ao dicionário:

In [33]:
dicionario_corpus.token2id['grant']

156

In [34]:
dicionario_corpus.get(156)

'grant'

### Bag of Words com Dicionário

Vamos criar duas novas colunas fazendo `bag of words` de pares de inteiros para o texto normal e para o lematizado.
O primeiro elemento deste par é o `id` do token no `dicionario_corpus` e o segundo elemento é a contagem de ocorrências deste token no documento.

Estamos convencionando chamar as colunas inteiras de `'opo_int_...'`

In [35]:
# Criação dos bag of words para o texto normal e lematizado
X['opo_int_bow'] = X['opo_texto_tokens'].apply(dicionario_corpus.doc2bow)
X['opo_int_bow_lem'] = X['opo_texto_tokens_lem'].apply(dicionario_corpus.doc2bow)

Resultado dos bag of words após mapeamento em dicionário

In [36]:
X[['opo_texto_tokens', 'opo_int_bow', 'opo_int_bow_lem']].head(15)

Unnamed: 0,opo_texto_tokens,opo_int_bow,opo_int_bow_lem
0,"[skip, content, go, search, royal, society, or...","[(0, 2), (1, 1), (2, 1), (3, 3), (4, 1), (5, 1...","[(0, 2), (1, 1), (2, 1), (3, 3), (4, 1), (5, 1..."
1,"[skip, content, go, search, royal, society, or...","[(0, 2), (1, 3), (4, 1), (6, 1), (8, 2), (10, ...","[(0, 2), (1, 3), (4, 1), (6, 1), (8, 2), (10, ..."
2,"[skip, content, go, search, royal, society, or...","[(0, 2), (1, 5), (4, 2), (6, 1), (8, 2), (10, ...","[(0, 2), (1, 5), (4, 2), (6, 1), (8, 2), (10, ..."
3,"[skip, content, go, search, royal, society, or...","[(0, 2), (1, 2), (3, 1), (4, 1), (6, 1), (8, 2...","[(0, 2), (1, 2), (3, 1), (4, 1), (6, 1), (8, 2..."
4,"[skip, content, go, search, royal, society, or...","[(0, 2), (1, 2), (3, 3), (6, 2), (8, 3), (14, ...","[(0, 2), (1, 2), (3, 3), (6, 2), (8, 3), (14, ..."
5,"[skip, content, go, search, royal, society, or...","[(0, 2), (1, 1), (4, 1), (6, 2), (8, 3), (10, ...","[(0, 2), (1, 1), (4, 1), (6, 2), (8, 3), (10, ..."
6,"[skip, content, go, search, royal, society, or...","[(0, 2), (1, 3), (4, 1), (6, 2), (8, 3), (10, ...","[(0, 2), (1, 3), (4, 1), (6, 2), (8, 3), (10, ..."
7,"[skip, content, go, search, royal, society, or...","[(0, 2), (1, 1), (3, 2), (6, 1), (8, 2), (10, ...","[(0, 2), (1, 1), (3, 2), (6, 1), (8, 2), (10, ..."
8,"[skip, content, go, search, royal, society, or...","[(0, 2), (1, 3), (6, 1), (8, 2), (9, 1), (10, ...","[(0, 2), (1, 3), (6, 1), (8, 2), (9, 1), (10, ..."
9,"[grant, funding, schemes, current, page, biome...","[(0, 3), (1, 5), (3, 11), (4, 3), (5, 3), (6, ...","[(0, 3), (1, 5), (3, 11), (4, 3), (5, 3), (6, ..."


### TF-IDF

Term Frequency - Inverse Document Frequency

In [37]:
from gensim.models.tfidfmodel import TfidfModel

In [38]:
def tfdif_palavras_mais_representativas(col_tfidf, dicionario, top=5):
    palavras_mais_repr = []
    for idx, tfidf_doc in enumerate(col_tfidf):
        palavras =[]
        # lista ordenada pelo peso tfidf do termo
        tfidf_desc = sorted(tfidf_doc, key=lambda termo: termo[1], reverse=True)

        # lista no tamanho especificado
        tfidf_desc_tam = tfidf_desc[:top]

        # conversão dos tokenids para palavras
        palavras = [(dicionario.get(tokenid), peso) for tokenid, peso in tfidf_desc_tam]
        palavras_mais_repr.append({f'Palavra_Rank_{rank + 1}': palavras[rank] for rank in range(len(palavras))})

    return pd.DataFrame(palavras_mais_repr)

In [39]:
def gera_tfidf_mais_representativos(serie_int_bow, tam=10):
    """Gera um dataframe com os dados de tfidf
    para os `tam` tokens mais representativos de cada documento"""
    
    corpus = serie_int_bow.to_list()
    tfidf = TfidfModel(corpus=corpus)
    
    tfidf_docs = []
    for idx, doc in enumerate(corpus):
        tfidf_doc = tfidf[doc]
    
        # lista ordenada pelo peso tfidf do termo
        tfidf_desc = sorted(tfidf_doc, key=lambda termo: termo[1], reverse=True)

        # lista no tamanho especificado
        tfidf_desc_tam = tfidf_desc[:tam]

        tfidf_docs.append({f'tdidf_desc_tam_{tam}': tfidf_desc_tam})
        
    return pd.DataFrame(tfidf_docs)

In [40]:
# Gera colunas com os tfidfs para cada documento
X['opo_int_tfidf'] = gera_tfidf_mais_representativos(X['opo_int_bow'], tam=30)
X['opo_int_tfidf_lem'] = gera_tfidf_mais_representativos(X['opo_int_bow_lem'], tam=30)

Resultado do TF-IDF para o corpus normal e o lematizado

In [41]:
X[['opo_int_tfidf', 'opo_int_tfidf_lem']].head(15)

Unnamed: 0,opo_int_tfidf,opo_int_tfidf_lem
0,"[(48, 0.5852419808628487), (305, 0.25482896958...","[(48, 0.5912011244922791), (305, 0.25742372949..."
1,"[(111, 0.30992399496115636), (451, 0.290495776...","[(111, 0.31345516543738), (451, 0.293805588120..."
2,"[(672, 0.44624227693180224), (741, 0.313404622...","[(3542, 0.45343844147211687), (741, 0.31845862..."
3,"[(707, 0.49824169968051996), (305, 0.263958904...","[(707, 0.5070887386855577), (305, 0.2686458962..."
4,"[(967, 0.39799465030110376), (978, 0.364828429...","[(967, 0.40518569321509584), (978, 0.371420218..."
5,"[(1151, 0.39683650979336466), (293, 0.32923078...","[(1151, 0.3987789857858959), (293, 0.330842339..."
6,"[(1148, 0.4264028136991138), (1151, 0.31011113...","[(1148, 0.43081297567455795), (1151, 0.3133185..."
7,"[(176, 0.3434791064044698), (305, 0.2685837163...","[(176, 0.3529012393206013), (305, 0.2759513594..."
8,"[(111, 0.4291669460596476), (1351, 0.209297274...","[(111, 0.43415835184452745), (1351, 0.21173149..."
9,"[(91, 0.42298451835661083), (1785, 0.330111397...","[(658, 0.474333701515959), (1415, 0.2432449240..."


Checando as palavras mais importantes por documento, segundo seu TF-IDF

In [42]:
tfdif_palavras_mais_representativas(X['opo_int_tfidf_lem'], dicionario_corpus, top=8).head(15)

Unnamed: 0,Palavra_Rank_1,Palavra_Rank_2,Palavra_Rank_3,Palavra_Rank_4,Palavra_Rank_5,Palavra_Rank_6,Palavra_Rank_7,Palavra_Rank_8
0,"(apex, 0.5912011244922791)","(search, 0.2574237294995361)","(academy, 0.15362999079765494)","(journal, 0.15362999079765494)","(fellow, 0.1521140219769986)","(royal, 0.14659138810582206)","(society, 0.12110351111147696)","(topic, 0.10973570771261065)"
1,"(dr, 0.31345516543738)","(dorothy, 0.29380558812023994)","(hodgkin, 0.29380558812023994)","(search, 0.21402228419340266)","(fellow, 0.18483742725793864)","(society, 0.15822015173492798)","(royal, 0.15234523924078536)","(journal, 0.1277280910157305)"
2,"(exchange, 0.45343844147211687)","(professor, 0.31845862877311487)","(dr, 0.28916657997233086)","(search, 0.19743841793679445)","(society, 0.17249842391877052)","(royal, 0.16864862346047407)","(case, 0.12564262959614192)","(journal, 0.11783087126312472)"
3,"(jsps, 0.5070887386855577)","(search, 0.26864589629822416)","(royal, 0.19122739249724244)","(society, 0.18054702122369418)","(japanese, 0.16826564473063313)","(journal, 0.16032735853979738)","(fellow, 0.15874530235804155)","(japan, 0.13829692873242483)"
4,"(jardine, 0.40518569321509584)","(lisa, 0.3714202187805045)","(royal, 0.20721503591322246)","(history, 0.1930351023683044)","(search, 0.1617256361143132)","(society, 0.14129680347170512)","(scholar, 0.13875861049867255)","(incorporate, 0.13506189773836527)"
5,"(wolfson, 0.3987789857858959)","(royal, 0.33084233901711774)","(society, 0.29816586385539845)","(professor, 0.28396748938609373)","(search, 0.21126546477653016)","(institution, 0.16432690244618936)","(nomination, 0.1450105402857803)","(fellow, 0.14404463507490692)"
6,"(visiting, 0.43081297567455795)","(wolfson, 0.31331852776331487)","(royal, 0.3086800156338849)","(society, 0.27610078542765837)","(search, 0.2282364587448744)","(nomination, 0.15665926388165743)","(fellow, 0.1556157673260507)","(journal, 0.13621108327827092)"
7,"(industry, 0.3529012393206013)","(search, 0.2759513594512431)","(royal, 0.2749985892637328)","(society, 0.22254810501736386)","(fellow, 0.175605410559882)","(academia, 0.16540057591457386)","(journal, 0.16468724501628063)","(company, 0.12405043193593038)"
8,"(dr, 0.43415835184452745)","(urf, 0.21173149681947764)","(search, 0.2028249955432925)","(fellow, 0.17516704160557078)","(society, 0.14994233750676442)","(royal, 0.1443747906275575)","(professor, 0.13631121591524037)","(university, 0.13179969958907617)"
9,"(cost, 0.474333701515959)","(ask, 0.24324492405523657)","(allowance, 0.17249939047293733)","(pay, 0.1628463457382818)","(youâ, 0.14594695443314196)","(cover, 0.1434372194865877)","(equipment, 0.13986583133176103)","(away, 0.1359361711188955)"


### Conjuntos de Treinamento e de Teste

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X['opo_texto_sem_stop'], y, test_size=0.33)

In [45]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize a TfidfVectorizer object: tfidf_vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)

# Transform the training data: tfidf_train 
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data: tfidf_test 
tfidf_test = tfidf_vectorizer.transform(X_test)

# Print the first 10 features
print(tfidf_vectorizer.get_feature_names()[:50])

# Print the first 5 vectors of the tfidf training data
print(tfidf_train.A[:15])

# Create the CountVectorizer DataFrame: count_df
#count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())

# Create the TfidfVectorizer DataFrame: tfidf_df
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())
tfidf_df

['00', '000', '000â', '01', '03', '06', '09', '10', '100', '102', '11', '115', '12', '120', '123', '125', '127', '128', '13', '131', '137', '139', '14', '140', '148', '15', '150', '153', '16', '165', '17', '173', '17th', '18', '180', '1860', '19', '1kb', '20', '200', '2011', '2014', '2017', '2018', '2019', '2019â', '2020', '2020â', '2021', '20211']
[[0.         0.03022061 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.02949415 0.01382312 0.03862879 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.00935863 0.01754457 0.         ... 0.         0.         0.        ]
 [0.00461674 0.02163742 0.         ... 0.         0.         0.        ]]


Unnamed: 0,00,000,000â,01,03,06,09,10,100,102,...,youwork,youyour,youâ,yusuf,zhang,zinke,zita,zoology,œgrant,œsupport
0,0.0,0.030221,0.0,0.0,0.0,0.0,0.0,0.010937,0.019143,0.0,...,0.0,0.0,0.153191,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.029494,0.013823,0.038629,0.0,0.0,0.0,0.0,0.017509,0.020431,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.02948,0.027633,0.0,0.0,0.0,0.0,0.0,0.0175,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03861,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.022019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.028168,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.022786,0.032038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.029843,0.029843,0.0,0.0


### Métricas

In [46]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

In [47]:
def avalia_resultado(y_test, y_pred):
    print(f' Acurácia:\t{100 * accuracy_score(y_test, y_pred):.2f} %')
    print(" Matriz de Confusão:\n", confusion_matrix(y_test, y_pred, labels=['N', 'Y']))
    print(" Relatório de classificação:\n", classification_report(y_test, y_pred, labels=['N', 'Y']))

### Classificação com Naive Bayes

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [49]:
from sklearn.naive_bayes import MultinomialNB

In [50]:
#X_train, X_test, y_train, y_test = train_test_split(X['opo_texto_sem_stop'], y, test_size=0.33)
X_train, X_test, y_train, y_test = train_test_split(
    X['opo_texto_sem_stop_lem'], y, test_size=0.25, stratify=y)

In [51]:
print('Distribuição de classes')
dist_classes = pd.DataFrame({'Treino': dict(Counter(y_train)), 'Teste': dict(Counter(y_test))})
dist_classes['Treino%'] = dist_classes['Treino'].div(dist_classes['Treino'].sum()).mul(100)
dist_classes['Teste%'] = dist_classes['Teste'].div(dist_classes['Teste'].sum()).mul(100)
dist_classes

Distribuição de classes


Unnamed: 0,Treino,Teste,Treino%,Teste%
Y,21,7,58.333333,53.846154
N,15,6,41.666667,46.153846


#### Naive Bayes com Bag of Words

In [52]:
count_vectorizer = CountVectorizer(stop_words="english")
X_bow_train = count_vectorizer.fit_transform(X_train)
X_bow_test = count_vectorizer.transform(X_test)

In [53]:
def classifica_NB_bow_alpha(alpha=1):
    """Classifica TF-DF para diferentes valores de alpha"""
    print(f'\n{"-"*45}')
    print(f'Naive Bayes - BoW')
    print(f'{"-"*45}')
    classificador_bow = MultinomialNB(alpha=alpha)
    classificador_bow.fit(X_bow_train, y_train)
    y_pred_bow = classificador_bow.predict(X_bow_test)
    avalia_resultado(y_test, y_pred_bow)

In [54]:
classifica_NB_bow_alpha()


---------------------------------------------
Naive Bayes - BoW
---------------------------------------------
 Acurácia:	76.92 %
 Matriz de Confusão:
 [[5 1]
 [2 5]]
 Relatório de classificação:
              precision    recall  f1-score   support

          N       0.71      0.83      0.77         6
          Y       0.83      0.71      0.77         7

avg / total       0.78      0.77      0.77        13



#### Naive Bayes com TF-IDF

In [55]:
import numpy as np

In [56]:
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
X_tfidf_train = tfidf_vectorizer.fit_transform(X_train)
X_tfidf_test = tfidf_vectorizer.transform(X_test)

In [57]:
def classifica_NB_tfidf_alpha(alpha):
    """Classifica TF-DF para diferentes valores de alpha"""
    print(f'\n{"-"*55}')
    print(f'Naive Bayes - TF-IDF')
    print(f'{"-"*55}\nAlpha = {alpha:.2f}:\n{"-"*55}')
    classificador_tfidf = MultinomialNB(alpha=alpha)
    classificador_tfidf.fit(X_tfidf_train, y_train)
    y_pred_tfidf = classificador_tfidf.predict(X_tfidf_test)
    avalia_resultado(y_test, y_pred_tfidf)

In [58]:
# Varia o parâmetro alpha para checar qual o melhor
alphas = np.arange(0.01, 1, 0.2)
alphas

array([0.01, 0.21, 0.41, 0.61, 0.81])

In [59]:
for alpha in alphas:
    classifica_NB_tfidf_alpha(alpha)


-------------------------------------------------------
Naive Bayes - TF-IDF
-------------------------------------------------------
Alpha = 0.01:
-------------------------------------------------------
 Acurácia:	76.92 %
 Matriz de Confusão:
 [[5 1]
 [2 5]]
 Relatório de classificação:
              precision    recall  f1-score   support

          N       0.71      0.83      0.77         6
          Y       0.83      0.71      0.77         7

avg / total       0.78      0.77      0.77        13


-------------------------------------------------------
Naive Bayes - TF-IDF
-------------------------------------------------------
Alpha = 0.21:
-------------------------------------------------------
 Acurácia:	69.23 %
 Matriz de Confusão:
 [[4 2]
 [2 5]]
 Relatório de classificação:
              precision    recall  f1-score   support

          N       0.67      0.67      0.67         6
          Y       0.71      0.71      0.71         7

avg / total       0.69      0.69      0.69  

### SVM

In [60]:
from sklearn.svm import SVC

In [61]:
tipos_svn = ['linear', 'rbf', 'sigmoid']
C = [0.1, 0.3, 0.6, 0.8]

In [62]:
def avalia_svm(descricao, tipo, X_train, y_train, X_test, C):
    print(f'\n{"-"*55}\nSVM - {descricao}\n{"-"*55}')
    print(f'Kernel = {tipo}, C = {C}\n')
    clf = SVC(kernel=tipo, C=C)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    avalia_resultado(y_test, y_pred)

#### SVM - Bag of Words

In [63]:
for tipo in tipos_svn:
    for c in C:
        avalia_svm("Bag of Words", tipo, X_bow_train, y_train, X_bow_test, C=c)


-------------------------------------------------------
SVM - Bag of Words
-------------------------------------------------------
Kernel = linear, C = 0.1

 Acurácia:	76.92 %
 Matriz de Confusão:
 [[6 0]
 [3 4]]
 Relatório de classificação:
              precision    recall  f1-score   support

          N       0.67      1.00      0.80         6
          Y       1.00      0.57      0.73         7

avg / total       0.85      0.77      0.76        13


-------------------------------------------------------
SVM - Bag of Words
-------------------------------------------------------
Kernel = linear, C = 0.3

 Acurácia:	76.92 %
 Matriz de Confusão:
 [[6 0]
 [3 4]]
 Relatório de classificação:
              precision    recall  f1-score   support

          N       0.67      1.00      0.80         6
          Y       1.00      0.57      0.73         7

avg / total       0.85      0.77      0.76        13


-------------------------------------------------------
SVM - Bag of Words
------

  'precision', 'predicted', average, warn_for)


#### SVM - TFIDF

In [64]:
for tipo in tipos_svn:
    for c in C:
        avalia_svm("TF-IDF", tipo, X_tfidf_train, y_train, X_tfidf_test, C=c)


-------------------------------------------------------
SVM - TF-IDF
-------------------------------------------------------
Kernel = linear, C = 0.1

 Acurácia:	53.85 %
 Matriz de Confusão:
 [[0 6]
 [0 7]]


  'precision', 'predicted', average, warn_for)


 Relatório de classificação:
              precision    recall  f1-score   support

          N       0.00      0.00      0.00         6
          Y       0.54      1.00      0.70         7

avg / total       0.29      0.54      0.38        13


-------------------------------------------------------
SVM - TF-IDF
-------------------------------------------------------
Kernel = linear, C = 0.3

 Acurácia:	53.85 %
 Matriz de Confusão:
 [[0 6]
 [0 7]]
 Relatório de classificação:
              precision    recall  f1-score   support

          N       0.00      0.00      0.00         6
          Y       0.54      1.00      0.70         7

avg / total       0.29      0.54      0.38        13


-------------------------------------------------------
SVM - TF-IDF
-------------------------------------------------------
Kernel = linear, C = 0.6

 Acurácia:	76.92 %
 Matriz de Confusão:
 [[5 1]
 [2 5]]
 Relatório de classificação:
              precision    recall  f1-score   support

          

### Random Forest

In [65]:
from numpy.core.umath_tests import inner1d
from sklearn.ensemble import RandomForestClassifier

  """Entry point for launching an IPython kernel.


In [66]:
n_estimadores = [5, 10, 100, 500, 1000]

In [67]:
def avalia_random_forest(descricao, X_train, y_train, X_test, n_est):
    print(f'\n{"-"*60}\nRandom Forest - {descricao}\n{"-"*60}')
    print(f'No estimadores = {n_est}\n')
    classifier = RandomForestClassifier(n_estimators=n_est)
    classifier.fit(X_train, y_train) 
    y_pred = classifier.predict(X_test)
    avalia_resultado(y_test, y_pred)

#### Random Forest - Bag of Words

In [68]:
for n_est in n_estimadores:
    avalia_random_forest('BoW', X_bow_train, y_train, X_bow_test, n_est=n_est)


------------------------------------------------------------
Random Forest - BoW
------------------------------------------------------------
No estimadores = 5

 Acurácia:	69.23 %
 Matriz de Confusão:
 [[5 1]
 [3 4]]
 Relatório de classificação:
              precision    recall  f1-score   support

          N       0.62      0.83      0.71         6
          Y       0.80      0.57      0.67         7

avg / total       0.72      0.69      0.69        13


------------------------------------------------------------
Random Forest - BoW
------------------------------------------------------------
No estimadores = 10

 Acurácia:	69.23 %
 Matriz de Confusão:
 [[5 1]
 [3 4]]
 Relatório de classificação:
              precision    recall  f1-score   support

          N       0.62      0.83      0.71         6
          Y       0.80      0.57      0.67         7

avg / total       0.72      0.69      0.69        13


------------------------------------------------------------
Random Fo

#### Random Forest -TF-IDF

In [69]:
for n_est in n_estimadores:
    avalia_random_forest('BoW', X_tfidf_train, y_train, X_tfidf_test, n_est=n_est)


------------------------------------------------------------
Random Forest - BoW
------------------------------------------------------------
No estimadores = 5

 Acurácia:	61.54 %
 Matriz de Confusão:
 [[5 1]
 [4 3]]
 Relatório de classificação:
              precision    recall  f1-score   support

          N       0.56      0.83      0.67         6
          Y       0.75      0.43      0.55         7

avg / total       0.66      0.62      0.60        13


------------------------------------------------------------
Random Forest - BoW
------------------------------------------------------------
No estimadores = 10

 Acurácia:	61.54 %
 Matriz de Confusão:
 [[5 1]
 [4 3]]
 Relatório de classificação:
              precision    recall  f1-score   support

          N       0.56      0.83      0.67         6
          Y       0.75      0.43      0.55         7

avg / total       0.66      0.62      0.60        13


------------------------------------------------------------
Random Fo

### Cross-Validation

### Resultados