In [2]:
import pandas as pd
import psycopg2
from sklearn.externals import joblib
import numpy as np
import os

In [3]:
# this will be used when converted to real script to maintain ID ordering when we cluster and label 
# just need to change target table 

conn = psycopg2.connect("dbname='cap' user='postgres' host='ec2-52-27-114-159.us-west-2.compute.amazonaws.com' port=9000 password ='secret'")
data = pd.read_sql_query("SELECT * FROM nlp_dim ORDER BY id ASC", conn)

# data = pd.read_csv('nlp_dim_1000.csv')

In [4]:
# data.head()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1153 entries, 0 to 1152
Data columns (total 20 columns):
index                 1153 non-null int64
site                  1153 non-null object
title                 1153 non-null object
author                831 non-null object
published_on          897 non-null object
accessed_on           1153 non-null object
url                   1153 non-null object
body                  1153 non-null object
newspaper_keywords    1153 non-null object
newspaper_summary     1153 non-null object
id                    1153 non-null int64
tokenized_body        1153 non-null object
word_count            1153 non-null int64
stopworded_body       1153 non-null object
lemmatized_body       1153 non-null object
word_bag              1153 non-null object
named_entities        1153 non-null object
lexical_diversity     1153 non-null float64
sentiment_score       1153 non-null object
binary_sentiment      1153 non-null int64
dtypes: float64(1), int64(4), object(1

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# transforms data into tfidf matrix representation
vectorizer = TfidfVectorizer(max_df=0.5, max_features=100,
                                 min_df=2, use_idf=True)
if not os.path.exists('model'):
    os.makedirs('model')

joblib.dump(vectorizer, 'model/tf_vectorizer_obj.pkl')

['model/tf_vectorizer_obj.pkl']

In [6]:
# fit our data (list of article bodies) to a tfidf representation
X = vectorizer.fit_transform(data.lemmatized_body)

# verify we have a sparse matrix of 100 tfidf features for each article 
# should be 5*100 sparse matrix
X

<1153x100 sparse matrix of type '<class 'numpy.float64'>'
	with 29458 stored elements in Compressed Sparse Row format>

In [7]:
# Store the data that we have of TFIDF vectors into a file
from scipy import sparse

In [8]:
sparse.save_npz('model/tf_idf.npz', X)

In [9]:
y = sparse.load_npz('model/tf_idf.npz')
y

<1153x100 sparse matrix of type '<class 'numpy.float64'>'
	with 29458 stored elements in Compressed Sparse Row format>

In [10]:
from sklearn.cluster import KMeans

# How many clusters we want
true_k = 3

# create the KMeans object with initial settings
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=True)

In [11]:
# fit our tfidf data to the kmeans model
km.fit(X)

Initialization complete
Iteration  0, inertia 1516.255
Iteration  1, inertia 859.874
Iteration  2, inertia 835.101
Iteration  3, inertia 832.455
Iteration  4, inertia 816.017
Iteration  5, inertia 809.849
Iteration  6, inertia 809.801
Iteration  7, inertia 809.798
Converged at iteration 7: center shift 0.000000e+00 within tolerance 8.069389e-07


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=3, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)

In [12]:
# Save kmeans model 
joblib.dump(km, 'model/kmeans_model.pkl')

terms = vectorizer.get_feature_names()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
labels = km.labels_
print(terms)

# order_centroids

for i in range(3):
    for ind in order_centroids[i, :5]:
            print(' %s' % terms[ind], end='')
    print('\n')

['20', '2016', '2017', 'and', 'attack', 'au', 'back', 'bar', 'belfast', 'best', 'but', 'caption', 'change', 'city', 'climate', 'come', 'could', 'dans', 'day', 'de', 'dead', 'donald', 'du', 'en', 'et', 'family', 'first', 'for', 'get', 'give', 'go', 'government', 'he', 'hide', 'home', 'house', 'il', 'image', 'in', 'include', 'irish', 'it', 'know', 'la', 'last', 'le', 'leave', 'les', 'like', 'little', 'make', 'march', 'may', 'mcgarry', 'more', 'muldoon', 'new', 'news', 'old', 'people', 'photo', 'police', 'president', 'pub', 'que', 'rabbit', 'report', 'right', 'see', 'show', 'sign', 'state', 'take', 'tell', 'that', 'think', 'this', 'three', 'today', 'trump', 'two', 'u2014', 'u2019', 'u2019s', 'u2019t', 'u201d', 'un', 'up', 'us', 'use', 'want', 'we', 'week', 'well', 'white', 'work', 'world', 'would', 'xe0', 'xe9']
 trump president climate house white

 police attack image people would

 u201d u2019s de sign u2019t



In [13]:
# compare saved and loaded kmeans
kmeans_loaded = joblib.load('model/kmeans_model.pkl')

terms = vectorizer.get_feature_names()
order_centroids = kmeans_loaded.cluster_centers_.argsort()[:, ::-1]
labels = kmeans_loaded.labels_

# order_centroids

for i in range(3):
    for ind in order_centroids[i, :5]:
            print(' %s' % terms[ind], end='')
    print('\n')

 trump president climate house white

 police attack image people would

 u201d u2019s de sign u2019t



In [14]:
# Small test for how we can eventually persist the cluster labels for individual articles
# Since the labels attribute is in the order that the sparse matrix was in when it was passed in
# We should be able just insert the label value as a dataframe column

t = pd.Series(labels)
data['cluster_label'] = t
# data

In [15]:
tfidf = TfidfVectorizer(max_features=100)
X_test = tfidf.fit_transform([data.lemmatized_body[98]])

In [16]:
z = km.predict(X_test)
print(z)

[1]
