In [1]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
import nltk
from string import punctuation
from sklearn.decomposition import NMF
import json
from sklearn.metrics.pairwise import cosine_similarity


# Vetorização
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.cluster import KMeans

In [2]:
stopwords = nltk.corpus.stopwords.words('portuguese')
numbers = '0123456789'

In [3]:
df = pd.read_csv('noticias_globo.csv')
df.tail()

Unnamed: 0,link,img_url,titulo,subtitulo,descricao,doc
17816,https://g1.globo.com/politica/noticia/senado-a...,https://s.glbimg.com/jo/g1/static/live/imagens...,Senado aprova incluir 12 cidades na região int...,"Criada em 1998, região tem o objetivo de artic...",Foram 64 votos a favor da proposta e um único...,senado aprova incluir cidades região integrada...
17817,https://globoesporte.globo.com/futebol/times/c...,https://s2.glbimg.com/Nww_eNqyeLPYpZi1NaIX4nDu...,Carille troca o Corinthians por time da Arábia...,"Anunciado como substituto, Osmar Loss comanda ...",– Ele aceitou a proposta – escreveu Andrés. ...,carille troca corinthians time arábia saudita ...
17818,https://sportv.globo.com/site/programas/copa-2...,https://s2.glbimg.com/DZrhAjOV0BGcouOHP7dDLaKP...,Goleiro titular da Argentina sofre lesão e est...,Romero teve papel decisivo no time vice-campeã...,"Segundo a imprensa argentina, Romero deve pas...",goleiro titular argentina sofre lesão copa rom...
17819,https://g1.globo.com/economia/noticia/operacao...,https://s2.glbimg.com/wzA7nThsl0H7H7D1miRUz1TP...,Anatel apreende mais de 10 mil produtos em açã...,15 empresas foram alvo da fiscalização realiza...,A operação ocorreu em 14 cidades de 7 estados...,anatel apreende 10 mil produtos ação contra pi...
17820,https://g1.globo.com/economia/noticia/desonera...,https://s.glbimg.com/jo/g1/static/live/imagens...,Governo anuncia acordo para zerar um dos tribu...,,"No mesmo anúncio, Guardia também informou que...",governo anuncia acordo zerar tributos sobre di...


In [4]:
df = df.iloc[0:3000]
df.tail()

Unnamed: 0,link,img_url,titulo,subtitulo,descricao,doc
2995,https://g1.globo.com/rj/regiao-serrana/noticia...,https://s2.glbimg.com/bzCmvYDdW8Jo_2NThgafm4sb...,Polícia recolhe novas imagens da pichação de s...,Investigação também ouve testemunhas para tent...,A polícia afirmou que eles negam e se contrad...,polícia recolhe novas imagens pichação suástic...
2996,https://g1.globo.com/pop-arte/noticia/2018/10/...,https://s2.glbimg.com/Ek1UK8e7LH_mE_HmhjmWP4HW...,Artistas tentam salvar complexo projetado por ...,Construção foi interrompida por guerra civil q...,Os artistas libaneses e de outros países ocup...,artistas tentam salvar complexo projetado niem...
2997,https://g1.globo.com/ciencia-e-saude/noticia/2...,https://s2.glbimg.com/6lWviPvAZaNTJyBl9ZFcHkpv...,Missão para Mercúrio tem tecnologia inédita pa...,"Além da sensação de 'entrar em forno', radiaçã...","O objetivo é chegar a Mercúrio, o planeta men...",missão mercúrio tecnologia inédita suportar 42...
2998,https://g1.globo.com/sp/ribeirao-preto-franca/...,https://s2.glbimg.com/deN9EvXu8SKYWgQIsp2F8cL3...,Dupla sertaneja morre em acidente no interior ...,Carro de Fábio e Guilherme invadiu pista contr...,O enterro está previsto para acontecer às 20h...,dupla sertaneja morre acidente interior sp car...
2999,https://g1.globo.com/carros/motos/noticia/2018...,https://s2.glbimg.com/BrnTJ2ti-I_5oujxe5MONb2G...,Scooter 'mais acessível' da Honda chega às loj...,Elite 125 vai concorrer com o Yamaha Neo 125. ...,Honda Elite 125: conheça detalhes do scooter ...,scooter 'mais acessível honda chega lojas deze...


In [5]:
def processa(row):
    txt = row['titulo'] + ' ' + ("" if pd.isnull(row['subtitulo']) else row['subtitulo']) + ' ' + ("" if pd.isnull(row['descricao']) else row['descricao'])
    
    return ' '.join([t for t in word_tokenize(txt.lower()) if (t not in stopwords) and (t not in punctuation + numbers)])

df['doc'] = df.apply(processa, axis=1)

df['doc'].head()

0    médicos divulga lista cidades receberam substi...
1    temer assina reajuste 16,38 ministros stf pres...
2    onu libera r 35,5 milhões ajuda humanitária ve...
3    sonda nasa pousa marte veja 1ª imagem planeta ...
4    cristiana lôbo bolsonaro pretende centralizar ...
Name: doc, dtype: object

In [6]:
vectorizer = TfidfVectorizer(max_features=10000)

tfidf_matrix = vectorizer.fit_transform(df['doc'])
tfidf_matrix

<3000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 412067 stored elements in Compressed Sparse Row format>

In [7]:
df_matrix = pd.DataFrame(tfidf_matrix.todense())

In [8]:
df_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.039125,0.000000,0.000000,0.0,0.000000,0.045466,0.000000,0.000000,0.000000,0.000000
6,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.036510,...,0.000000,0.053066,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [9]:
km = KMeans(
    n_clusters=25,
    random_state=1,
    n_jobs=-1
)
df['cluster'] = km.fit_predict(df_matrix)

In [13]:
df[df['cluster'] == 10]['descricao']

27       Outros seis suspeitos permanecem detidos. Ent...
69      A sequência dos fatos e o que ainda não está c...
250      O advogado de Edison, da esposa Cristiana e d...
330      Segundo o IML, Daniel foi morto por causa dos...
465      Nas mensagens, Allana, filha de Edison Britte...
553      A defesa da família Brittes informou que só v...
599     A sequência dos fatos e o que ainda não está c...
633      Nesta segunda-feira (19), o advogado esteve n...
711      Jogador Daniel foi encontrado morto no dia 27...
748      A polícia já prendeu Edison, a mulher e a fil...
934      Segundo Amadeu Trevisan, Eduardo Purkote, de ...
973      O advogado de Eduardo Purkote, Ricardo Dewes,...
1054    A sequência dos fatos e o que ainda não está c...
1171     "[...] Relatando que inclusive o colchão do c...
1217     Segundo o advogado, Eduardo Henrique da Silva...
1357     De acordo com o promotor, o dono do celular u...
1442     "O Edison, quando ele mata o Daniel, ele fica...
1486     Políc

In [11]:
df['cluster']

0        9
1       18
2        2
3        2
4       13
5       23
6       18
7       18
8        5
9        3
10      14
11      15
12       2
13      15
14      18
15       6
16       2
17      18
18      14
19       3
20       1
21       1
22      11
23      11
24      15
25       2
26      14
27      10
28       7
29       2
        ..
2970     1
2971     3
2972     2
2973    24
2974    21
2975     2
2976    22
2977    18
2978     6
2979    22
2980     2
2981    15
2982     2
2983    13
2984    21
2985    22
2986    21
2987     5
2988     2
2989     1
2990     2
2991    22
2992     2
2993    17
2994     2
2995    15
2996     2
2997     2
2998    14
2999     2
Name: cluster, Length: 3000, dtype: int32