In [1]:
import pandas as pd
import psycopg2
from sklearn.externals import joblib
import numpy as np
import os

In [2]:
# this will be used when converted to real script to maintain ID ordering when we cluster and label 
# just need to change target table 

conn = psycopg2.connect("dbname='cap' user='postgres' host='ec2-52-27-114-159.us-west-2.compute.amazonaws.com' port=9000 password ='secret'")
data = pd.read_sql_query("SELECT * FROM nlp_dim ORDER BY id ASC LIMIT 100", conn)

# data = pd.read_csv('nlp_dim_1000.csv')

In [3]:
# data.head()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 20 columns):
index                 100 non-null int64
site                  100 non-null object
title                 100 non-null object
author                100 non-null object
published_on          100 non-null object
accessed_on           100 non-null object
url                   100 non-null object
body                  100 non-null object
newspaper_keywords    100 non-null object
newspaper_summary     100 non-null object
id                    100 non-null int64
tokenized_body        100 non-null object
word_count            100 non-null int64
stopworded_body       100 non-null object
lemmatized_body       100 non-null object
word_bag              100 non-null object
named_entities        100 non-null object
lexical_diversity     100 non-null float64
sentiment_score       100 non-null object
binary_sentiment      100 non-null int64
dtypes: float64(1), int64(4), object(15)
memory usage: 15.7

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# transforms data into tfidf matrix representation
vectorizer = TfidfVectorizer(max_df=0.5, max_features=100,
                                 min_df=2, use_idf=True)
if not os.path.exists('model'):
    os.makedirs('model')

joblib.dump(vectorizer, 'model/tf_vectorizer_obj.pkl')

['model/tf_vectorizer_obj.pkl']

In [5]:
# fit our data (list of article bodies) to a tfidf representation
X = vectorizer.fit_transform(data.lemmatized_body)

# verify we have a sparse matrix of 100 tfidf features for each article 
# should be 5*100 sparse matrix
X

<100x100 sparse matrix of type '<class 'numpy.float64'>'
	with 3667 stored elements in Compressed Sparse Row format>

In [6]:
# Store the data that we have of TFIDF vectors into a file
from scipy import sparse

In [7]:
sparse.save_npz('model/tf_idf.npz', X)

In [8]:
y = sparse.load_npz('model/tf_idf.npz')
y

<100x100 sparse matrix of type '<class 'numpy.float64'>'
	with 3667 stored elements in Compressed Sparse Row format>

In [9]:
from sklearn.cluster import KMeans

# How many clusters we want
true_k = 3

# create the KMeans object with initial settings
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=True)

In [10]:
# fit our tfidf data to the kmeans model
km.fit(X)

Initialization complete
Iteration  0, inertia 0.000
Converged at iteration 0: center shift 7.813979e-31 within tolerance 6.586000e-07


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=3, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)

In [11]:
# Save kmeans model 
joblib.dump(km, 'model/kmeans_model.pkl')

terms = vectorizer.get_feature_names()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
labels = km.labels_
print(terms)

# order_centroids

for i in range(3):
    for ind in order_centroids[i, :5]:
            print(' %s' % terms[ind], end='')
    print('\n')

['20', '44', 'around', 'award', 'back', 'bar', 'belfast', 'best', 'black', 'bt1', 'build', 'but', 'caption', 'ceiling', 'cellars', 'center', 'church', 'co', 'cocktail', 'crawl', 'create', 'crown', 'district', 'drink', 'duke', 'first', 'five', 'food', 'garrick', 'get', 'good', 'guinness', 'hand', 'hide', 'host', 'hotel', 'in', 'include', 'international', 'ireland', 'irish', 'island', 'jack', 'keith', 'kelly', 'lane', 'like', 'live', 'london', 'lot', 'maddens', 'mcgarry', 'mean', 'menu', 'merchant', 'muldoon', 'museum', 'music', 'newly', 'northern', 'offer', 'open', 'owner', 'palmer', 'parliament', 'pc', 'photo', 'picture', 'pint', 'pub', 'pubs', 'rabbit', 'room', 'run', 'sale', 'sean', 'shamrock', 'shop', 'short', 'sort', 'spaniard', 'spot', 'st', 'starred', 'stuff', 'take', 'thing', 'tour', 'tourist', 'traditional', 'travel', 'try', 'turf', 'watch', 'welcome', 'well', 'when', 'whiskey', 'york', 'young']
 palmer keith pc young hotel

 belfast photo best pub bar

 london watch parliament

In [13]:
# compare saved and loaded kmeans
kmeans_loaded = joblib.load('model/kmeans_model.pkl')

terms = vectorizer.get_feature_names()
order_centroids = kmeans_loaded.cluster_centers_.argsort()[:, ::-1]
labels = kmeans_loaded.labels_

# order_centroids

for i in range(3):
    for ind in order_centroids[i, :5]:
            print(' %s' % terms[ind], end='')
    print('\n')

 palmer keith pc young hotel

 belfast photo best pub bar

 london watch parliament hotel five



In [14]:
# Small test for how we can eventually persist the cluster labels for individual articles
# Since the labels attribute is in the order that the sparse matrix was in when it was passed in
# We should be able just insert the label value as a dataframe column

t = pd.Series(labels)
data['cluster_label'] = t
# data

In [15]:
tfidf = TfidfVectorizer(max_features=100)
X_test = tfidf.fit_transform([data.lemmatized_body[98]])

In [16]:
z = km.predict(X_test)
print(z)

[1]
