In [1]:
import pandas as pd
import psycopg2
from sklearn.externals import joblib
import numpy as np
import os

In [4]:
# this will be used when converted to real script to maintain ID ordering when we cluster and label 
# just need to change target table 

conn = psycopg2.connect("dbname='cap' user='postgres' host='ec2-34-215-56-46.us-west-2.compute.amazonaws.com' port=9000 password ='secret'")
data = pd.read_sql_query("SELECT * FROM nlp_dim_hpc ORDER BY id ASC", conn)


# going to try on a bunch of article bodies without NLP for performance
# data = pd.read_sql_query("SELECT * FROM articles ORDER BY id ASC", conn)

# data = pd.read_csv('nlp_dim_1000.csv')

In [45]:
# data.head()
data.info()
data.to_pickle('nlp_data.pkl')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164541 entries, 0 to 164540
Data columns (total 21 columns):
index                 164541 non-null int64
site                  164541 non-null object
title                 164538 non-null object
author                126494 non-null object
published_on          130775 non-null object
accessed_on           164541 non-null datetime64[ns]
url                   164541 non-null object
body                  164541 non-null object
newspaper_keywords    164541 non-null object
newspaper_summary     164541 non-null object
id                    164541 non-null int64
tokenized_body        164541 non-null object
word_count            164541 non-null int64
stopworded_body       164541 non-null object
lemmatized_body       164541 non-null object
word_bag              164541 non-null object
named_entities        164541 non-null object
lexical_diversity     164541 non-null float64
sentiment_score       164541 non-null object
binary_sentiment      164541

In [46]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# transforms data into tfidf matrix representation
vectorizer = TfidfVectorizer(max_df=0.5, max_features=200,
                                 min_df=2, use_idf=True)
if not os.path.exists('model'):
    os.makedirs('model')

joblib.dump(vectorizer, 'model/tf_vectorizer_obj.pkl')

['model/tf_vectorizer_obj.pkl']

In [33]:
# fit our data (list of article bodies) to a tfidf representation
X = vectorizer.fit_transform(data.lemmatized_body)

# verify we have a sparse matrix of 100 tfidf features for each article 
# should be 5*100 sparse matrix
X

<164541x200 sparse matrix of type '<class 'numpy.float64'>'
	with 8159976 stored elements in Compressed Sparse Row format>

In [34]:
# Store the data that we have of TFIDF vectors into a file
from scipy import sparse

In [35]:
sparse.save_npz('model/tf_idf.npz', X)

In [36]:
y = sparse.load_npz('model/tf_idf.npz')
y

<164541x200 sparse matrix of type '<class 'numpy.float64'>'
	with 8159976 stored elements in Compressed Sparse Row format>

In [37]:
from sklearn.cluster import KMeans

# How many clusters we want
true_k = 15

# create the KMeans object with initial settings
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=True)

In [38]:
# fit our tfidf data to the kmeans model
km.fit(X)

Initialization complete
Iteration  0, inertia 201702.624
Iteration  1, inertia 119662.967
Iteration  2, inertia 117647.798
Iteration  3, inertia 116885.227
Iteration  4, inertia 115951.690
Iteration  5, inertia 115136.843
Iteration  6, inertia 114799.534
Iteration  7, inertia 114619.729
Iteration  8, inertia 114456.061
Iteration  9, inertia 114368.486
Iteration 10, inertia 114318.238
Iteration 11, inertia 114281.141
Iteration 12, inertia 114261.242
Iteration 13, inertia 114249.458
Iteration 14, inertia 114241.869
Iteration 15, inertia 114235.702
Iteration 16, inertia 114228.850
Iteration 17, inertia 114218.270
Iteration 18, inertia 114200.822
Iteration 19, inertia 114178.279
Iteration 20, inertia 114157.965
Iteration 21, inertia 114144.964
Iteration 22, inertia 114136.708
Iteration 23, inertia 114130.416
Iteration 24, inertia 114125.189
Iteration 25, inertia 114120.984
Iteration 26, inertia 114117.522
Iteration 27, inertia 114114.629
Iteration 28, inertia 114112.113
Iteration 29, inert

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=15, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)

In [39]:
# Save kmeans model 
joblib.dump(km, 'model/kmeans_model.pkl')

terms = vectorizer.get_feature_names()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
labels = km.labels_
print(terms)

# order_centroids

for i in range(3):
    for ind in order_centroids[i, :5]:
            print(' %s' % terms[ind], end='')
    print('\n')

['000', '10', '2016', '2017', 'accord', 'add', 'administration', 'advertisement', 'american', 'and', 'another', 'around', 'as', 'ask', 'attack', 'back', 'become', 'begin', 'believe', 'best', 'big', 'business', 'call', 'campaign', 'case', 'change', 'child', 'city', 'close', 'com', 'come', 'company', 'continue', 'could', 'country', 'court', 'day', 'de', 'deal', 'do', 'election', 'end', 'even', 'every', 'face', 'facebook', 'family', 'far', 'feel', 'find', 'first', 'follow', 'for', 'force', 'former', 'game', 'give', 'good', 'google', 'government', 'group', 'he', 'health', 'help', 'high', 'hold', 'home', 'house', 'if', 'image', 'include', 'issue', 'job', 'keep', 'know', 'la', 'last', 'later', 'law', 'le', 'lead', 'leader', 'leave', 'life', 'live', 'long', 'look', 'lot', 'main', 'man', 'many', 'market', 'may', 'mean', 'medium', 'member', 'might', 'million', 'month', 'move', 'mr', 'much', 'name', 'national', 'need', 'never', 'news', 'next', 'north', 'number', 'offer', 'office', 'official', 'o

In [40]:
# compare saved and loaded kmeans
kmeans_loaded = joblib.load('model/kmeans_model.pkl')

terms = vectorizer.get_feature_names()
order_centroids = kmeans_loaded.cluster_centers_.argsort()[:, ::-1]
labels = kmeans_loaded.labels_

# order_centroids

for i in range(15):
    for ind in order_centroids[i, :15]:
            print(' %s' % terms[ind], end='')
    print('\n')

 north trump state president us united country official leader security could force attack report administration

 image mr us people use show medium first see work he could world two photo

 google company use home work look play well facebook service see system include still come

 mr main advertisement story continue read york trump sign he president you state photo try

 de la le advertisement 2017 google facebook twitter trump on come 10 image 2016 us

 police attack man report he people tell old two city accord find family video home

 court case state law government trump he right two president use could tell issue accord

 woman she tell story people work report come know write york news think and many

 state government attack people country president official group report party security law trump health american

 game play team first point run he two second three back start last best come

 tax trump plan republican pay house percent president state business 000 company amer

In [41]:
# Small test for how we can eventually persist the cluster labels for individual articles
# Since the labels attribute is in the order that the sparse matrix was in when it was passed in
# We should be able just insert the label value as a dataframe column

t = pd.Series(labels)
data['cluster_label'] = t
# data

In [42]:
tfidf = TfidfVectorizer(max_features=100)
X_test = tfidf.fit_transform([data.lemmatized_body[160]])

In [43]:
z = km.predict(X_test)
print(z)

ValueError: Incorrect number of features. Got 100 features, expected 200