# Text Mining - Clustering using `K-Means`

In [1]:
# Notebook setup: plotting backend configuration and the NumPy import.
# `%pylab` injects numpy and matplotlib names into the interactive namespace
# (see the "Populating the interactive namespace" message it prints).
# NOTE(review): none of the cells visible here seem to rely on those injected
# names, and the injection can shadow builtins such as `sum`/`all` -- consider
# dropping `%pylab` after confirming no other cell depends on it.
%pylab
# Switch to the inline backend so figures render inside the notebook output.
%matplotlib inline

# Request high-resolution ('retina') rendering for inline figures.
%config InlineBackend.figure_format = 'retina'

# Explicit NumPy alias, independent of whatever `%pylab` injected.
import numpy as np

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


### 1. Load the corpus of texts

In [2]:
# 1. Load the corpus of texts
print('1. Load the corpus of texts')

# Toy training corpus: one short Spanish sentence per document.
# The order matters: later cells align cluster labels and ground-truth
# categories with these sentences by position.
trainCorpus = [
    "Me gustan las vacas",
    "Me gustan los caballos",
    "odio los perros",  # fixed: a stray `0` after this comma made the literal a SyntaxError
    "odio los caballos",
    "me gustan las ranas",
    "me gusta el helado",
    "no quiero comer",
]

# Echo every sentence, tab-indented, so the corpus can be checked at a glance.
for text in trainCorpus:
    print('\t', text)

1. Load the corpus of texts
	 Me gustan las vacas
	 Me gustan los caballos
	 odio los perros
	 odio los caballos
	 me gustan las ranas
	 me gusta el helado
	 no quiero comer


### 2. Texts vectorization 

In [3]:
# 2. Texts vectorization
print('2. Texts vectorization\n')

# Import the vectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Create the vectorizer (default settings: lowercases the text and counts
# word tokens of two or more characters)
vectorizer = CountVectorizer()

# Learn the vocabulary from the training corpus and, in the same pass, build
# the sparse term-frequency (tf) matrix of the documents. The fitted
# `vectorizer` is reused later to encode unseen texts with the same vocabulary.
tfMatrix = vectorizer.fit_transform(trainCorpus)

# Print the features (vocabulary terms) of the vectorizer.
# `get_feature_names()` was removed in scikit-learn 1.2; its replacement
# `get_feature_names_out()` returns an ndarray, so convert it to a plain list
# to keep the printed representation unchanged.
print('Features: ', vectorizer.get_feature_names_out().tolist())

# Print the matrix as a dense array, where:
#    - each column is one feature (vocabulary term),
#    - each row is one sentence of the corpus,
#    - the (i, j) value is the frequency of feature j in sentence i.
print('\ntf Matrix:\n', tfMatrix.toarray())

# Print the shape of the matrix: (sentences, features)
print("number of sentences %d, number of features %d" % tfMatrix.shape)

2. Texts vectorization

Features:  ['caballos', 'comer', 'el', 'gusta', 'gustan', 'helado', 'las', 'los', 'me', 'no', 'odio', 'perros', 'quiero', 'ranas', 'vacas']

tf Matrix:
 [[0 0 0 0 1 0 1 0 1 0 0 0 0 0 1]
 [1 0 0 0 1 0 0 1 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 1 1 0 0 0]
 [1 0 0 0 0 0 0 1 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 1 0 1 0 0 0 0 1 0]
 [0 0 1 1 0 1 0 0 1 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 1 0 0 1 0 0]]
number of sentences 7, number of features 15


### 3. Texts clustering 

In [4]:
# 3. Texts clustering
print('3. Texts clustering')

# Import the KMeans algorithm
from sklearn.cluster import KMeans

# Use k=2 because we want two clusters: positive and negative sentences
k = 2

# Create and fit the KMeans model.
# `n_init` is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4 (with a FutureWarning in 1.2/1.3), so leaving it implicit
# makes the clustering depend on the installed version.
# `random_state` fixes the centroid initialisation for reproducibility.
km = KMeans(n_clusters=k, max_iter=10, n_init=10, random_state=1)
km.fit(tfMatrix)

# Print the cluster label assigned to each training sentence (corpus order)
print('Clusters: ', km.labels_)

# Print each cluster together with the sentences assigned to it.
# `km.labels_` is aligned with `trainCorpus`, so the two can be walked in
# parallel instead of indexing by position.
for cluster_id in range(k):
    print('Cluster ', cluster_id)

    for sentence, label in zip(trainCorpus, km.labels_):
        if label == cluster_id:
            print('\t', sentence)

3. Texts clustering
Clusters:  [0 0 1 1 0 0 1]
Cluster  0
	 Me gustan las vacas
	 Me gustan los caballos
	 me gustan las ranas
	 me gusta el helado
Cluster  1
	 odio los perros
	 odio los caballos
	 no quiero comer


### 4. Clustering quality measurement

In [5]:
# 4. Clustering quality measurement
print('4. Clustering quality measurement')

# Import the metrics
from sklearn import metrics

# Ground-truth category of each training sentence, in corpus order:
# 1 - negative sentence, 0 - positive sentence.
sentencesTruth = [0,0,1,1,0,0,1]

# The three scores below compare the two partitions and do not depend on
# which integer id KMeans happened to give each cluster.
# Homogeneity: every cluster contains only members of a single class.
homogeneity = metrics.homogeneity_score(sentencesTruth, km.labels_)
# Completeness: all members of a given class end up in the same cluster.
completeness = metrics.completeness_score(sentencesTruth, km.labels_)
# V-measure: the harmonic mean of homogeneity and completeness.
v_measure = metrics.v_measure_score(sentencesTruth, km.labels_)

# Show predicted clusters next to the ground truth, then the three scores.
print("Clusters: ", km.labels_)
print("Sentences: ", sentencesTruth)
print("Homogeneity: %0.3f" % homogeneity)
print("Completeness: %0.3f" % completeness)
print("V-measure: %0.3f" % v_measure)

4. Clustering quality measurement
Clusters:  [0 0 1 1 0 0 1]
Sentences:  [0, 0, 1, 1, 0, 0, 1]
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000


### 5. Use the trained KMeans to classify other texts

In [6]:
# 5. Use the trained KMeans to classify other texts
print('5. Use the trained KMeans to classify others texts')

# Unseen sentences to assign to one of the learned clusters
testCorpus = ['odio los animales', 'me gustan los pájaros']

# Encode the new sentences with the vectorizer fitted on the training corpus
# (only words from the training vocabulary are counted), then ask the fitted
# model for the nearest cluster of each sentence.
tfMatrixTest = vectorizer.transform(testCorpus)
pred = km.predict(tfMatrixTest)

# Report: the raw sentences, their term-frequency rows, and the predictions
print('\nTesting tests:')
for text in testCorpus:
    print('\t', text)

print('\ntfMatrixTest:\n', tfMatrixTest.toarray())
print('\nClusters prediction: ', pred)

5. Use the trained KMeans to classify others texts

Testing tests:
	 odio los animales
	 me gustan los pájaros

tfMatrixTest:
 [[0 0 0 0 0 0 0 1 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 1 1 0 0 0 0 0 0]]

Clusters prediction:  [1 0]
