In [2]:
import pandas as pd
import psycopg2
from sklearn.externals import joblib
import numpy as np
import os

In [4]:
# this will be used when converted to real script to maintain ID ordering when we cluster and label 
# just need to change target table 

# conn = psycopg2.connect("dbname='cap' user='postgres' host='ec2-34-215-56-46.us-west-2.compute.amazonaws.com' port=9000 password ='secret'")
# data = pd.read_sql_query("SELECT * FROM nlp_dim_hpc ORDER BY id ASC", conn)


# going to try on a bunch of article bodies without NLP for performance
# data = pd.read_sql_query("SELECT * FROM articles ORDER BY id ASC", conn)

# data = pd.read_csv('nlp_dim_1000.csv')
data = pd.read_pickle('nlp_data.pkl')

In [5]:
# data.head()
data.info()
# data.to_pickle('nlp_data.pkl')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164541 entries, 0 to 164540
Data columns (total 21 columns):
index                 164541 non-null int64
site                  164541 non-null object
title                 164538 non-null object
author                126494 non-null object
published_on          130775 non-null object
accessed_on           164541 non-null datetime64[ns]
url                   164541 non-null object
body                  164541 non-null object
newspaper_keywords    164541 non-null object
newspaper_summary     164541 non-null object
id                    164541 non-null int64
tokenized_body        164541 non-null object
word_count            164541 non-null int64
stopworded_body       164541 non-null object
lemmatized_body       164541 non-null object
word_bag              164541 non-null object
named_entities        164541 non-null object
lexical_diversity     164541 non-null float64
sentiment_score       164541 non-null object
binary_sentiment      164541

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# transforms data into tfidf matrix representation
vectorizer = TfidfVectorizer(max_df=0.5, max_features=num_features,
                                 min_df=2, use_idf=True)
if not os.path.exists('model'):
    os.makedirs('model')

joblib.dump(vectorizer, 'model/tf_vectorizer_obj.pkl')

['model/tf_vectorizer_obj.pkl']

In [19]:
# define num features
num_features = 250

In [7]:
# fit our data (list of article bodies) to a tfidf representation
X = vectorizer.fit_transform(data.lemmatized_body)

# verify we have a sparse matrix of 100 tfidf features for each article 
# should be 5*100 sparse matrix
X

<164541x250 sparse matrix of type '<class 'numpy.float64'>'
	with 9477063 stored elements in Compressed Sparse Row format>

In [8]:
# Store the data that we have of TFIDF vectors into a file
from scipy import sparse

In [9]:
sparse.save_npz('model/tf_idf.npz', X)

In [10]:
y = sparse.load_npz('model/tf_idf.npz')
y

<164541x250 sparse matrix of type '<class 'numpy.float64'>'
	with 9477063 stored elements in Compressed Sparse Row format>

In [11]:
from sklearn.cluster import KMeans

# How many clusters we want
true_k = 15

# create the KMeans object with initial settings
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=True)

In [12]:
# fit our tfidf data to the kmeans model
km.fit(X)

Initialization complete
Iteration  0, inertia 207612.747
Iteration  1, inertia 122592.830
Iteration  2, inertia 119837.364
Iteration  3, inertia 119053.751
Iteration  4, inertia 118729.593
Iteration  5, inertia 118592.674
Iteration  6, inertia 118531.077
Iteration  7, inertia 118501.895
Iteration  8, inertia 118485.052
Iteration  9, inertia 118470.576
Iteration 10, inertia 118457.299
Iteration 11, inertia 118444.748
Iteration 12, inertia 118430.881
Iteration 13, inertia 118414.126
Iteration 14, inertia 118389.942
Iteration 15, inertia 118353.273
Iteration 16, inertia 118293.852
Iteration 17, inertia 118211.742
Iteration 18, inertia 118146.088
Iteration 19, inertia 118110.996
Iteration 20, inertia 118087.749
Iteration 21, inertia 118064.853
Iteration 22, inertia 118029.723
Iteration 23, inertia 117960.304
Iteration 24, inertia 117925.319
Iteration 25, inertia 117917.725
Iteration 26, inertia 117915.409
Iteration 27, inertia 117914.602
Iteration 28, inertia 117914.209
Iteration 29, inert

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=15, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)

In [13]:
# Save kmeans model 
joblib.dump(km, 'model/kmeans_model.pkl')

terms = vectorizer.get_feature_names()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
labels = km.labels_
print(terms)

# order_centroids

for i in range(3):
    for ind in order_centroids[i, :5]:
            print(' %s' % terms[ind], end='')
    print('\n')

['000', '10', '2016', '2017', 'accord', 'add', 'administration', 'advertisement', 'allow', 'already', 'american', 'and', 'another', 'apple', 'around', 'as', 'ask', 'at', 'attack', 'away', 'back', 'base', 'become', 'begin', 'believe', 'best', 'big', 'bill', 'bring', 'build', 'business', 'call', 'campaign', 'car', 'case', 'change', 'child', 'city', 'claim', 'close', 'com', 'come', 'company', 'continue', 'could', 'country', 'court', 'create', 'data', 'day', 'de', 'deal', 'department', 'different', 'director', 'do', 'election', 'end', 'even', 'every', 'expect', 'face', 'facebook', 'family', 'far', 'federal', 'feel', 'find', 'fire', 'first', 'five', 'follow', 'for', 'force', 'former', 'four', 'game', 'give', 'good', 'google', 'government', 'great', 'group', 'happen', 'he', 'head', 'health', 'help', 'high', 'hold', 'home', 'house', 'if', 'image', 'include', 'information', 'issue', 'job', 'keep', 'kill', 'know', 'la', 'last', 'later', 'law', 'le', 'lead', 'leader', 'least', 'leave', 'less', '

In [48]:
# compare saved and loaded kmeans
kmeans_loaded = joblib.load('model/kmeans_model.pkl')

terms = vectorizer.get_feature_names()
order_centroids = kmeans_loaded.cluster_centers_.argsort()[:, ::-1]
labels = kmeans_loaded.labels_

# order_centroids

for i in range(15):
    print(i)
    for ind in order_centroids[i, :]:
            #print(i)
            print(' %s' % terms[ind], end='')
    print('\n')

0
 2017 show star photo twitter image first may video season com share post day two news last see 10 include come look story team week this he follow world tell today report back us we play home people million release night use family know find old best could medium accord work city 000 well month live three read write next second call watch and even around top car close start give end name still fire life on open york way since set for war high want man leave she run many you part right add statement place much four long big another help thing that child good if 2016 think expect as service become facebook house change line move begin to offer need plan there lead system school game deal phone continue point five head great former american national little white later turn they company view number woman sign though bring up keep away never member put face do pay every try create support country group ask build mean claim talk happen really already base might hold state far when what se

 main advertisement story continue read mr york up sign you photo try offer view service product people later state trump he american work many could two city even know we and president company day percent united use last find world government first come include think write way she show school country they much group call house see want need life woman may become well 000 right still around every back family change that child long start official million help health week there this give play part law what look live another three home tell month leave seem program if place white market public open good news case begin political might add lead move pay run high end never thing when court us national member party attack though ask today report force point business plan set job since republican team police old less war turn feel without name as something policy for on car system money former game little power keep at issue administration best office department really accord leader far numbe

In [15]:
# Small test for how we can eventually persist the cluster labels for individual articles
# Since the labels attribute is in the order that the sparse matrix was in when it was passed in
# We should be able just insert the label value as a dataframe column

t = pd.Series(labels)
data['cluster_label'] = t
# data

In [22]:
tfidf = TfidfVectorizer(max_features=num_features)
X_test = tfidf.fit_transform([data.lemmatized_body[200]])

In [23]:
z = km.predict(X_test)
print(z)

ValueError: Incorrect number of features. Got 126 features, expected 250

In [26]:
data.sort_values('cluster_label')

Unnamed: 0,index,site,title,author,published_on,accessed_on,url,body,newspaper_keywords,newspaper_summary,...,tokenized_body,word_count,stopworded_body,lemmatized_body,word_bag,named_entities,lexical_diversity,sentiment_score,binary_sentiment,cluster_label
62246,250,FOX,Tourists keep returning 'cursed' lava rocks to...,,2017-05-16,2017-05-17 10:48:36.532297,http://www.foxnews.com/travel/2017/05/16/touri...,"It may seem like a harmless souvenir, tucking ...","{visitors,national,tourists,hawaii,legend,volc...","It may seem like a harmless souvenir, tucking ...",...,"{It,may,seem,like,a,harmless,souvenir,"","",tuck...",345,"{it,may,seem,like,harmless,souvenir,tucking,la...","{it,may,seem,like,harmless,souvenir,tuck,lava,...","{""(``,7)"",""('',7)"",""(rock,6)"",""(park,5)"",""(ret...","{""(\""Big Island\"",LOCATION)"",""(Pele,PERSON)"",""...",79.000000,"{""neg"": 0.10356249999999999, ""neu"": 0.8581875,...",0,0
81906,67,WashingtonPost,Ariana Grande announces ‘One Love Manchester’ ...,Caitlin Gibson Is A Feature Writer At The Wash...,2017-05-30,2017-05-31 09:42:00.840924,https://www.washingtonpost.com/news/arts-and-e...,Ariana Grande performs on NBC’s “Today” show...,"{grande,katy,benefit,announces,love,bieber,liv...",Ariana Grande performs on NBC’s “Today” show i...,...,"{Ariana,Grande,performs,on,NBC,’,s,“,Today,”,s...",337,"{ariana,grande,performs,nbc,’,“,today,”,show,2...","{ariana,grande,performs,nbc,’,“,today,”,show,2...","{""(“,7)"",""(”,7)"",""(grande,7)"",""(manchester,6)""...","{""(NBC,ORGANIZATION)"",""(\""New York\"",LOCATION)...",68.393782,"{""neg"": 0.05311111111111111, ""neu"": 0.80666666...",1,0
81911,72,WashingtonPost,D.C.-area forecast: More showers possible toda...,Dan Stillman Is A Meteorologist,2017-05-31,2017-05-31 09:42:29.051869,https://www.washingtonpost.com/news/capital-we...,TODAY’S DAILY DIGIT A somewhat subjective rat...,"{chance,possible,low,forecast,storms,tomorrow,...",5/10: Same old story with more showers possibl...,...,"{TODAY,’,S,DAILY,DIGIT,A,somewhat,subjective,r...",532,"{today,’,s,daily,digit,a,somewhat,subjective,r...","{today,’,s,daily,digit,a,somewhat,subjective,r...","{""(:,17)"",""(low,11)"",""(shower,11)"",""(confidenc...","{""(\""The Washington Post\"",ORGANIZATION)"",""(Fa...",58.997050,"{""neg"": 0.07734374999999999, ""neu"": 0.78309374...",1,0
126119,92,BusinessInsider,"'It's not you, it's us': Little Rock, Arkansas...",Andrew Demillo,2017-10-19,2017-10-20 13:40:35.001360,http://www.businessinsider.com/ap-little-rock-...,Amazon is accepting bids from cities to host i...,"{ad,rock,city,arkansas,little,amazons,chamber,...","Little Rock, Arkansas, announced in an ad Thur...",...,"{Amazon,is,accepting,bids,from,cities,to,host,...",478,"{amazon,accepting,bids,cities,host,second,head...","{amazon,accept,bid,city,host,second,headquarte...","{""(amazon,12)"",""(city,11)"",""(rock,7)"",""(little...","{""(Amazon,ORGANIZATION)"",""(Arkansas,LOCATION)""...",63.909774,"{""neg"": 0.008473684210526316, ""neu"": 0.9254736...",1,0
29425,93,Arstechnica,The Expanse season two finale: “It’s part of t...,Jonathan M. Gitlin,,2017-04-25 13:08:52.496126,https://arstechnica.co.uk/gaming/2017/04/the-e...,"It's been a fun 13 weeks, but the second seaso...","{ship,season,crew,protomolecule,expanse,scene,...","It's been a fun 13 weeks, but the second seaso...",...,"{It,'s,been,a,fun,13,weeks,"","",but,the,second,...",532,"{it,fun,13,weeks,second,season,the,expanse,fin...","{it,fun,13,week,second,season,the,expanse,fina...","{""(i,8)"",""(subscribe,5)"",""(season,5)"",""(us,5)""...","{""(Caliban,PERSON)"",""(Babylon,LOCATION)"",""(\""A...",76.515152,"{""neg"": 0.040952380952380955, ""neu"": 0.8953333...",1,0
139377,328,Guardian,"Where can an owner move their football club 1,...",Jakub Frankowicz,2017-10-26,2017-10-27 09:43:51.228820,https://www.theguardian.com/football/blog/2017...,Coming off the United States’ failure to quali...,"{america,club,precourt,city,fans,whim,1000,sta...",Coming off the United States’ failure to quali...,...,"{Coming,off,the,United,States,’,failure,to,qua...",1564,"{coming,united,states,’,failure,qualify,2018,w...","{come,united,state,’,failure,qualify,2018,worl...","{""(’,33)"",""(move,16)"",""(the,15)"",""(ml,14)"",""(t...","{""(\""United States\"",LOCATION)"",""(\""Major Leag...",63.720930,"{""neg"": 0.040315789473684215, ""neu"": 0.8589122...",1,0
119878,364,USAToday,Selena Quintanilla: Google Doodle honors singe...,Brett Molina,2017-10-17,2017-10-18 11:39:25.928392,http://www.usatoday.com/story/tech/talkingtech...,Skip in Skip x Embed x Share CLOSE Selena Quin...,"{star,google,music,quintanilla,debut,x,wrote,s...",Skip in Skip x Embed x Share CLOSE Selena Quin...,...,"{Skip,in,Skip,x,Embed,x,Share,CLOSE,Selena,Qui...",356,"{skip,skip,x,embed,x,share,close,selena,quinta...","{skip,skip,x,embed,x,share,close,selena,quinta...","{""(selena,10)"",""(first,4)"",""(music,4)"",""(campo...","{""(\""Selena Quintanilla-Pérez\"",PERSON)"",""(Goo...",76.699029,"{""neg"": 0.012374999999999999, ""neu"": 0.8358750...",1,0
81923,84,WashingtonPost,‘King Lear’ with Rick Foucheux is a family mel...,First Post Byline,2017-05-30,2017-05-31 09:43:56.622581,https://www.washingtonpost.com/entertainment/t...,The cracking kingdom is not a particularly big...,"{rick,foucheux,henley,design,king,meltdown,reg...",The cracking kingdom is not a particularly big...,...,"{The,cracking,kingdom,is,not,a,particularly,bi...",842,"{the,cracking,kingdom,particularly,big,deal,“,...","{the,cracking,kingdom,particularly,big,deal,“,...","{""(’,22)"",""(foucheux,10)"",""(lear,10)"",""(play,6...","{""(\""Rick Foucheux\"",PERSON)"",""(Shakespeare,PE...",77.400000,"{""neg"": 0.07176666666666667, ""neu"": 0.85643333...",1,0
139373,324,Guardian,Megyn Kelly ditched politics for morning telev...,Jake Nevins,2017-10-26,2017-10-27 09:43:25.886982,https://www.theguardian.com/media/2017/oct/26/...,"One year ago, Megyn Kelly was on top of the wo...","{ditched,fox,host,television,trumps,nbc,mistak...","One year ago, Megyn Kelly was on top of the wo...",...,"{One,year,ago,"","",Megyn,Kelly,was,on,top,of,th...",1375,"{one,year,ago,megyn,kelly,top,world,her,nightl...","{one,year,ago,megyn,kelly,top,world,her,nightl...","{""(’,37)"",""(kelly,30)"",""(one,11)"",""(morning,10...","{""(\""Megyn Kelly\"",PERSON)"",""(\""Fox News\"",ORG...",66.310160,"{""neg"": 0.06487234042553192, ""neu"": 0.85585106...",1,0
29416,84,Arstechnica,Silicon Valley season 4 starts by teetering on...,Joe Mullin,,2017-04-25 13:08:11.093454,https://arstechnica.com/gaming/2017/04/silicon...,Stop me if you've heard this one before: on Si...,"{silicon,richard,repetition,season,seasons,com...",Further Reading Silicon Valley turns dark as i...,...,"{Stop,me,if,you,'ve,heard,this,one,before,:,on...",798,"{stop,heard,one,:,silicon,valley,show,main,cha...","{stop,heard,one,:,silicon,valley,show,main,cha...","{""(``,11)"",""('',11)"",""(richard,9)"",""(the,8)"",""...","{""(\""Silicon Valley\"",LOCATION)"",""(\""Richard H...",70.319635,"{""neg"": 0.07412820512820516, ""neu"": 0.81594871...",1,0


In [35]:
clusters = {}
for i, row in data.iterrows():
    if row['cluster_label'] not in clusters:
        #print(row['body'])
        #print(row['url'])
        #print(row['id'])
        #print(row['title'])
        # add empty list for IDs for a preview 
        clusters[row['cluster_label']] = []

In [37]:
for cluster_val in clusters:
    for i, row in data.iterrows():
        if row['cluster_label'] is cluster_val:
            clusters[cluster_val].append(row['title'])

In [49]:

for cluster in clusters:
    print('\n')
    print(cluster)
    print(clusters[cluster][:25])
    print('\n')



13
['London shows the challenge of preventing low-tech terror', "How To Help London Terrorist Attack Victims' Families", 'London attack: Raids across UK as Westminster attacker identified', 'July 7 2005 London Bombings Fast Facts', 'London unites against terror in defiant vigil', 'London attack: Khalid Masood', 'Tennessee Amber Alert: Ex-teacher accused of kidnapping reportedly spotted in Texas', 'Off-duty Indiana trooper saves couple from fire at home', 'Illegal Immigrant Accused of Murdering Teacher', 'Westminster Attack Victim Aysha Frade ‘Was Picking Up Her Children from School’ when Terrorist Took Her Life', 'Police Constable and Army Veteran Keith Palmer Was ‘Every Inch a Hero’', 'London Terror Killer Named as Convicted Criminal Khalid Masood', 'Italy: ‘North African’ Arrested After Driving at Police, Stabbing Officer', 'Reports: Israeli-American Teen Arrested for Antisemitic Hate Crime Hoax Spree', 'Westminster Attack Victim Aysha Frade ‘Was Picking Up Her Children from School