In [2]:
import pandas as pd
import psycopg2
from sklearn.externals import joblib
import numpy as np
import os

In [16]:
# this will be used when converted to real script to maintain ID ordering when we cluster and label 
# just need to change target table 

conn = psycopg2.connect("dbname='cap' user='postgres' host='ec2-52-27-114-159.us-west-2.compute.amazonaws.com' port=9000 password ='secret'")
data = pd.read_sql_query("SELECT * FROM nlp_dim ORDER BY id ASC", conn)


# going to try on a bunch of article bodies without NLP for performance
# data = pd.read_sql_query("SELECT * FROM articles ORDER BY id ASC", conn)

# data = pd.read_csv('nlp_dim_1000.csv')

In [17]:
# data.head()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164498 entries, 0 to 164497
Data columns (total 10 columns):
site                  164498 non-null object
title                 164495 non-null object
author                126451 non-null object
published_on          130732 non-null object
accessed_on           164498 non-null datetime64[ns]
url                   164498 non-null object
body                  164498 non-null object
newspaper_keywords    164498 non-null object
newspaper_summary     164498 non-null object
id                    164498 non-null int64
dtypes: datetime64[ns](1), int64(1), object(8)
memory usage: 12.6+ MB


In [18]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# transforms data into tfidf matrix representation
vectorizer = TfidfVectorizer(max_df=0.5, max_features=100,
                                 min_df=2, use_idf=True)
if not os.path.exists('model'):
    os.makedirs('model')

joblib.dump(vectorizer, 'model/tf_vectorizer_obj.pkl')

['model/tf_vectorizer_obj.pkl']

In [19]:
# fit our data (list of article bodies) to a tfidf representation
X = vectorizer.fit_transform(data.body)

# verify we have a sparse matrix of 100 tfidf features for each article 
# should be 5*100 sparse matrix
X

<164498x100 sparse matrix of type '<class 'numpy.float64'>'
	with 5169957 stored elements in Compressed Sparse Row format>

In [20]:
# Store the data that we have of TFIDF vectors into a file
from scipy import sparse

In [21]:
sparse.save_npz('model/tf_idf.npz', X)

In [22]:
y = sparse.load_npz('model/tf_idf.npz')
y

<164498x100 sparse matrix of type '<class 'numpy.float64'>'
	with 5169957 stored elements in Compressed Sparse Row format>

In [23]:
from sklearn.cluster import KMeans

# How many clusters we want
true_k = 3

# create the KMeans object with initial settings
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=True)

In [24]:
# fit our tfidf data to the kmeans model
km.fit(X)

Initialization complete
Iteration  0, inertia 224909.159
Iteration  1, inertia 126097.040
Iteration  2, inertia 122803.882
Iteration  3, inertia 121398.927
Iteration  4, inertia 120285.605
Iteration  5, inertia 119855.714
Iteration  6, inertia 119659.509
Iteration  7, inertia 119540.634
Iteration  8, inertia 119517.138
Iteration  9, inertia 119513.315
Iteration 10, inertia 119512.710
Iteration 11, inertia 119512.506
Iteration 12, inertia 119512.446
Converged at iteration 12: center shift 4.437656e-07 within tolerance 8.089938e-07


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=3, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)

In [25]:
# Save kmeans model 
joblib.dump(km, 'model/kmeans_model.pkl')

terms = vectorizer.get_feature_names()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
labels = km.labels_
print(terms)

# order_centroids

for i in range(3):
    for ind in order_centroids[i, :5]:
            print(' %s' % terms[ind], end='')
    print('\n')

['000', '2017', 'according', 'against', 'any', 'around', 'back', 'because', 'before', 'being', 'between', 'company', 'continue', 'could', 'day', 'de', 'did', 'do', 'don', 'down', 'during', 'even', 'first', 'game', 'get', 'go', 'going', 'good', 'government', 'her', 'here', 'him', 'home', 'house', 'how', 'know', 'last', 'long', 'made', 'main', 'make', 'many', 'may', 'me', 'million', 'most', 'mr', 'much', 'my', 'news', 'no', 'now', 'off', 'only', 'our', 'over', 'people', 'president', 're', 'reading', 'right', 'say', 'says', 'see', 'she', 'should', 'since', 'state', 'still', 'story', 'such', 'take', 'them', 'then', 'these', 'think', 'those', 'three', 'through', 'times', 'told', 'trump', 'twitter', 'two', 'us', 've', 'very', 'want', 'way', 'week', 'well', 'where', 'while', 'white', 'work', 'world', 'year', 'years', 'york', 'your']
 trump president house white news

 your company people year our

 her she de my people



In [29]:
# compare saved and loaded kmeans
kmeans_loaded = joblib.load('model/kmeans_model.pkl')

terms = vectorizer.get_feature_names()
order_centroids = kmeans_loaded.cluster_centers_.argsort()[:, ::-1]
labels = kmeans_loaded.labels_

# order_centroids

for i in range(3):
    for ind in order_centroids[i, :5]:
            print(' %s' % terms[ind], end='')
    print('\n')

 trump president house white news

 your company people year our

 her she de my people



In [30]:
# Small test for how we can eventually persist the cluster labels for individual articles
# Since the labels attribute is in the order that the sparse matrix was in when it was passed in
# We should be able just insert the label value as a dataframe column

t = pd.Series(labels)
data['cluster_label'] = t
# data

In [None]:
tfidf = TfidfVectorizer(max_features=100)
X_test = tfidf.fit_transform([data.lemmatized_body[98]])

In [None]:
z = km.predict(X_test)
print(z)