In [1]:
import pandas as pd
import psycopg2
from sklearn.externals import joblib

In [2]:
# this will be used when converted to real script to maintain ID ordering when we cluster and label 
# just need to change target table 

# conn = psycopg2.connect("dbname='cap' user='postgres' host='ec2-52-27-114-159.us-west-2.compute.amazonaws.com' port=9000 password ='secret'")
# data = pd.read_sql_query("SELECT * FROM articles ORDER BY id ASC LIMIT 100", conn)

data = pd.read_csv('nlp_dim_1000.csv')

In [3]:
# data.head()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
Unnamed: 0            1000 non-null int64
site                  1000 non-null object
title                 1000 non-null object
author                678 non-null object
published_on          781 non-null object
accessed_on           1000 non-null object
url                   1000 non-null object
body                  1000 non-null object
newspaper_keywords    1000 non-null object
newspaper_summary     1000 non-null object
id                    1000 non-null int64
tokenized_body        1000 non-null object
word_count            1000 non-null int64
stopworded_body       1000 non-null object
lemmatized_body       1000 non-null object
word_bag              1000 non-null object
named_entities        1000 non-null object
lexical_diversity     1000 non-null float64
sentiment_score       1000 non-null object
binary_sentiment      1000 non-null int64
dtypes: float64(1), int64(4), object(15

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# transforms data into tfidf matrix representation
vectorizer = TfidfVectorizer(max_df=0.5, max_features=100,
                                 min_df=2, use_idf=True)
joblib.dump(vectorizer, 'tf_vectorizer_obj.pkl')

['tf_vectorizer_obj.pkl']

In [5]:
# fit our data (list of article bodies) to a tfidf representation
X = vectorizer.fit_transform(data.lemmatized_body)

# verify we have a sparse matrix of 100 tfidf features for each article 
# should be 5*100 sparse matrix
X

<1000x100 sparse matrix of type '<class 'numpy.float64'>'
	with 27049 stored elements in Compressed Sparse Row format>

In [6]:
# Store the data that we have of TFIDF vectors into a file
from scipy import sparse

In [7]:
sparse.save_npz('tf_idf.npz', X)

In [8]:
y = sparse.load_npz('tf_idf.npz')
y

<1000x100 sparse matrix of type '<class 'numpy.float64'>'
	with 27049 stored elements in Compressed Sparse Row format>

In [9]:
from sklearn.cluster import KMeans

# How many clusters we want
true_k = 5

# create the KMeans object with initial settings
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=True)

In [10]:
# fit our tfidf data to the kmeans model
km.fit(X)

Initialization complete
Iteration  0, inertia 1029.941
Iteration  1, inertia 603.514
Iteration  2, inertia 589.279
Iteration  3, inertia 587.392
Iteration  4, inertia 586.830
Iteration  5, inertia 586.450
Iteration  6, inertia 586.277
Iteration  7, inertia 586.261
Converged at iteration 7: center shift 0.000000e+00 within tolerance 7.923267e-07


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=5, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)

In [11]:
# Save kmeans model 
joblib.dump(km, 'kmeans_model.pkl')

terms = vectorizer.get_feature_names()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
labels = km.labels_

# order_centroids

for i in range(4):
    for ind in order_centroids[i, :5]:
            print(' %s' % terms[ind], end='')
    print('\n')

 u2014 2017 march u201d twitter

 trump house president white u201d

 image get it people take

 de le la xe9 xe0



In [12]:
# compare saved and loaded kmeans
kmeans_loaded = joblib.load('kmeans_model.pkl')

terms = vectorizer.get_feature_names()
order_centroids = kmeans_loaded.cluster_centers_.argsort()[:, ::-1]
labels = kmeans_loaded.labels_

# order_centroids

for i in range(4):
    for ind in order_centroids[i, :5]:
            print(' %s' % terms[ind], end='')
    print('\n')

 u2014 2017 march u201d twitter

 trump house president white u201d

 image get it people take

 de le la xe9 xe0



In [13]:
# Small test for how we can eventually persist the cluster labels for individual articles
# Since the labels attribute is in the order that the sparse matrix was in when it was passed in
# We should be able just insert the label value as a dataframe column

t = pd.Series(labels)
data['cluster_label'] = t
data

Unnamed: 0.1,Unnamed: 0,site,title,author,published_on,accessed_on,url,body,newspaper_keywords,newspaper_summary,...,tokenized_body,word_count,stopworded_body,lemmatized_body,word_bag,named_entities,lexical_diversity,sentiment_score,binary_sentiment,cluster_label
0,0,Breitbart,"Trump: ‘We’re Doing Very Well in Iraq,’ U.S. T...",John Hayward,2017-03-29,2017-03-31 08:20:09.478760,http://www.breitbart.com/national-security/201...,"SIGN UP FOR OUR NEWSLETTER On Tuesday, Presid...","{war,told,secretary,soldiers,taking,iraq,presi...","SIGN UP FOR OUR NEWSLETTEROn Tuesday, Presiden...",...,"[u'SIGN', u'UP', u'FOR', u'OUR', u'NEWSLETTER'...",595,"[u'sign', u'up', u'for', u'our', u'newsletter'...","[u'sign', u'up', u'for', u'our', u'newsletter'...","[(u'iraq', 8), (u'troop', 6), (u'fight', 5), (...","[(u'Trump', u'PERSON'), (u'White House', u'ORG...",69.461078,"{'neg': 0.06307142857142858, 'neu': 0.83049999...",1,4
1,1,Breitbart,Top U.S. General: ‘We Have Not Relaxed the Rul...,Edwin Mora,2017-03-29,2017-03-31 08:20:14.741313,http://www.breitbart.com/national-security/201...,"SIGN UP FOR OUR NEWSLETTER WASHINGTON, D.C. —...","{gen,rules,responsibility,engagement,military,...",We have not relaxed the rules of engagement.\n...,...,"[u'SIGN', u'UP', u'FOR', u'OUR', u'NEWSLETTER'...",831,"[u'sign', u'up', u'for', u'our', u'newsletter'...","[u'sign', u'up', u'for', u'our', u'newsletter'...","[(u'mosul', 10), (u'civilian', 9), (u'gen.', 8...","[(u'WASHINGTON', u'LOCATION'), (u'D.C.', u'LOC...",69.722814,"{'neg': 0.07838095238095237, 'neu': 0.81557142...",0,4
2,2,Breitbart,Protests in Paris Continue for Third Night Aft...,Oliver Jj Lane,2017-03-30,2017-03-31 08:20:18.922523,http://www.breitbart.com/london/2017/03/30/pro...,SIGN UP FOR OUR NEWSLETTER Hundreds of “Asian...,"{night,protest,paris,killed,france,chinese,sub...",SIGN UP FOR OUR NEWSLETTERHundreds of “Asians”...,...,"[u'SIGN', u'UP', u'FOR', u'OUR', u'NEWSLETTER'...",528,"[u'sign', u'up', u'for', u'our', u'newsletter'...","[u'sign', u'up', u'for', u'our', u'newsletter'...","[(u'police', 13), (u'protest', 8), (u'chinese'...","[(u'Paris', u'LOCATION'), (u'Paris', u'LOCATIO...",76.816609,"{'neg': 0.1701875, 'neu': 0.7961875, 'pos': 0....",0,4
3,3,Breitbart,Rep. Jim Jordan: Working with Dems on Health C...,Dan Riehl,2017-03-30,2017-03-31 08:20:25.141342,http://www.breitbart.com/radio/2017/03/30/rep-...,SIGN UP FOR OUR NEWSLETTER Rep. Jim Jordan (R...,"{come,jim,healthcare,reform,werent,working,mis...",SIGN UP FOR OUR NEWSLETTERRep. Jim Jordan (R-O...,...,"[u'SIGN', u'UP', u'FOR', u'OUR', u'NEWSLETTER'...",451,"[u'sign', u'up', u'for', u'our', u'newsletter'...","[u'sign', u'up', u'for', u'our', u'newsletter'...","[(u'jordan', 8), (u'get', 4), (u'come', 4), (u...","[(u'Jim Jordan', u'PERSON'), (u'Breitbart News...",73.255814,"{'neg': 0.02517647058823529, 'neu': 0.93682352...",0,4
4,4,Breitbart,John McCain in Last-Minute Attempt to Avert Go...,Ian Mason,2017-03-30,2017-03-31 08:20:29.304708,http://www.breitbart.com/big-government/2017/0...,SIGN UP FOR OUR NEWSLETTER Sen. John McCain (...,"{constitutional,john,supreme,court,attempt,mcc...",SIGN UP FOR OUR NEWSLETTERSen. John McCain (R-...,...,"[u'SIGN', u'UP', u'FOR', u'OUR', u'NEWSLETTER'...",405,"[u'sign', u'up', u'for', u'our', u'newsletter'...","[u'sign', u'up', u'for', u'our', u'newsletter'...","[(u'sen.', 5), (u'senate', 5), (u'thursday', 4...","[(u'John McCain', u'PERSON'), (u'Judge Neil Go...",75.000000,"{'neg': 0.06441666666666666, 'neu': 0.81483333...",1,4
5,5,Breitbart,NFL Commissioner Goodell Affirms Gambling Ban ...,Warner Todd Huston,2017-03-30,2017-03-31 08:20:35.563060,http://www.breitbart.com/sports/2017/03/30/nfl...,SIGN UP FOR OUR NEWSLETTER Despite the recent...,"{nfl,ban,betting,despite,affirms,leagues,city,...",SIGN UP FOR OUR NEWSLETTERDespite the recent m...,...,"[u'SIGN', u'UP', u'FOR', u'OUR', u'NEWSLETTER'...",478,"[u'sign', u'up', u'for', u'our', u'newsletter'...","[u'sign', u'up', u'for', u'our', u'newsletter'...","[(u'sport', 8), (u'gamble', 6), (u'city', 5), ...","[(u'Raiders', u'ORGANIZATION'), (u'California'...",72.759857,"{'neg': 0.038086956521739136, 'neu': 0.8719999...",1,4
6,6,Breitbart,Suspect Charged for Carrying Pistol Without Pe...,Awr Hawkins,2017-03-30,2017-03-31 08:20:39.720498,http://www.breitbart.com/big-government/2017/0...,SIGN UP FOR OUR NEWSLETTER Officers were book...,"{permit,falls,officer,stolen,buttocks,gun,fell...",SIGN UP FOR OUR NEWSLETTEROfficers were bookin...,...,"[u'SIGN', u'UP', u'FOR', u'OUR', u'NEWSLETTER'...",240,"[u'sign', u'up', u'for', u'our', u'newsletter'...","[u'sign', u'up', u'for', u'our', u'newsletter'...","[(u'robert', 8), (u'pistol', 5), (u'officer', ...","[(u'Jesse O\u2019Neal Roberts', u'PERSON'), (u...",77.622378,"{'neg': 0.05, 'neu': 0.9159, 'pos': 0.0341, 'c...",0,4
7,7,Breitbart,Breitbart News Daily: Bipartisan Appeal,Breitbart News,2017-03-30,2017-03-31 08:20:42.994078,http://www.breitbart.com/radio/2017/03/30/brei...,SIGN UP FOR OUR NEWSLETTER On the Friday edit...,"{war,live,appeal,trumps,breitbart,scientific,p...",SIGN UP FOR OUR NEWSLETTEROn the Friday editio...,...,"[u'SIGN', u'UP', u'FOR', u'OUR', u'NEWSLETTER'...",258,"[u'sign', u'up', u'for', u'our', u'newsletter'...","[u'sign', u'up', u'for', u'our', u'newsletter'...","[(u'news', 6), (u'breitbart', 6), (u'discus', ...","[(u'Breitbart News Daily', u'ORGANIZATION'), (...",74.137931,"{'neg': 0.025333333333333333, 'neu': 0.9492222...",0,1
8,8,Breitbart,Roger Goodell Doesn’t Think Kaepernick Is Bein...,Daniel Leberfeld,2017-03-29,2017-03-31 08:20:52.387158,http://www.breitbart.com/sports/2017/03/29/rog...,SIGN UP FOR OUR NEWSLETTER PHOENIX – On Tuesd...,"{feels,nfl,doesnt,quarterback,kaepernick,teams...",SIGN UP FOR OUR NEWSLETTERPHOENIX – On Tuesday...,...,"[u'SIGN', u'UP', u'FOR', u'OUR', u'NEWSLETTER'...",469,"[u'sign', u'up', u'for', u'our', u'newsletter'...","[u'sign', u'up', u'for', u'our', u'newsletter'...","[(u'team', 6), (u'quarterback', 6), (u'kaepern...","[(u'PHOENIX', u'LOCATION'), (u'ESPN', u'ORGANI...",74.632353,"{'neg': 0.061439999999999995, 'neu': 0.80176, ...",1,4
9,9,Breitbart,Rep. Jim Jordan: Working with Dems on Health C...,Dan Riehl,2017-03-30,2017-03-31 08:20:57.552387,http://www.breitbart.com/radio/2017/03/30/rep-...,SIGN UP FOR OUR NEWSLETTER Rep. Jim Jordan (R...,"{come,jim,healthcare,reform,werent,working,mis...",SIGN UP FOR OUR NEWSLETTERRep. Jim Jordan (R-O...,...,"[u'SIGN', u'UP', u'FOR', u'OUR', u'NEWSLETTER'...",451,"[u'sign', u'up', u'for', u'our', u'newsletter'...","[u'sign', u'up', u'for', u'our', u'newsletter'...","[(u'jordan', 8), (u'get', 4), (u'come', 4), (u...","[(u'Jim Jordan', u'PERSON'), (u'Breitbart News...",73.255814,"{'neg': 0.02517647058823529, 'neu': 0.93682352...",0,4


In [14]:
tfidf = TfidfVectorizer(max_features=100)
X_test = tfidf.fit_transform([data.lemmatized_body[98]])

In [15]:
z = km.predict(X_test)
print(z)

[1]
