# Before Running
Add files health.txt, bags.csv and word2vec.csv to the health-dataset folder.

# Introduction
Here, we'll explore k-means clustering algorithm applied to health news in twitter.

In [1]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import pairwise_distances
from kmedoids import kMedoids

In [2]:
def load_health_data():
    return pd.read_csv('health-dataset/health.txt', sep='|')
def load_word2vec_data():
    return pd.read_csv('health-dataset/word2vec.csv')
def load_bags_data():
    return pd.read_csv('health-dataset/bags.csv')

First, we load our data and set the amount of clusters for K-means clustering:

In [3]:
word2vec_health_df = load_word2vec_data()
bags_health_df = load_bags_data()
clusters_amount=100

# First attempt at word2vec
First we'll get the results using the given word2vec.
Using KMeans we have:

In [9]:
kmeans = KMeans(n_clusters=2).fit(word2vec_health_df)
labels = kmeans.labels_
silhouette_score(word2vec_health_df, labels, metric='euclidean')

2 0.039639786483511505


In [4]:
D = pairwise_distances(word2vec_health_df, metric='euclidean')
kMedoids(D, clusters_amount)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

(array([  199,   235,   267,   288,   321,   424,   502,   652,  3627,
         1186,  1394,  1395,  1408,  1427,  1508,  6290,  1673,  3152,
         1915,  8130,  2481,  2702,  2881,   994,  3154,  3417,  3548,
         3596,  3241,  3892,  4135,  4356,  4405,  4430,  7986,  4642,
         4774,  4786,  4839,  4845,  5064,  5319,  5465,  5690,  5989,
         5999,  6012,  6022,  6177,  6223,  6491,  6579,  6651,  6821,
         6888,  6921,  7010,  6918,  7481,  7507,  6640,  7808,  6535,
         7904,  8015,  7003,  6932,  8200,  8696,  9362,  9420,  9493,
        10067, 10098, 10104, 10220, 10389, 12904, 10681, 10810, 10931,
        11521, 11533,  9498, 11855,  9644,  4893, 11973, 10927, 12178,
        12536, 12634, 12654, 12850, 12861, 12877, 12993, 13076, 13181,
        13192]),
 {0: array([  190,   196,   197,   199,   202,   203,   219,   231,   283,
           482,   719,   819,   878,   908,   911,   969,  1205,  1357,
          1606,  1801,  2132,  2160,  2180,  2260,  227

TypeError: 'range' object does not support item assignment

Plotar #clusters vs score

In [5]:
xToPlotMultinomial = np.array([])
yToPlotMultinomial = np.array([])

In [6]:
for i in [2,3,4,5,6,7,8,9,10,15,20,25,30,35,40,45,50,60,70,80,90,100,150,200,250,300,350,400,450,500,600,700,800,900,1000,1250,1500,1750,2000]:
    kmeans = KMeans(n_clusters=i).fit(word2vec_health_df)
    labels = kmeans.labels_
    y = silhouette_score(word2vec_health_df, labels, metric='euclidean')
    print (i, y)
    xToPlotMultinomial = np.append(xToPlotMultinomial, i)
    yToPlotMultinomial = np.append(yToPlotMultinomial, y)    

2 0.03908658611283668
3 -0.00430108285326937
4 -0.002621928354708429
5 0.0012412083594899024
6 0.002276653033856137
7 0.0037428931255573186
8 -0.00914757404645449
9 -0.0006131528466196493
10 -0.006166060015071426
15 -0.014443515940912664
20 -0.012619448909842367
25 -0.012504643581691503
30 -0.012387144453954821
35 -0.012617174577412214
40 -0.014898263388296504
45 -0.008090218331597886
50 -0.01214085670886462
60 -0.015132723615727801
70 -0.013468448108032395
80 -0.012626246025424627
90 -0.011569774255073467
100 -0.017227395433269947
150 -0.013600488083977468
200 -0.004746744650822378
250 -0.0032215453665456516
300 0.0017870608248588286
350 0.0024797277128883175
400 0.00393258293750016
450 0.0089920181762587
500 0.01127026455086367
600 0.01721130319899552
700 0.020634916272375976
800 0.022398041891573933
900 0.03205546086349696
1000 0.029968156817045215
1250 0.037933953964750605
1500 0.0442307674254906
1750 0.04724972509744163
2000 0.048282650630924816


In [10]:
for i in (2,3,4,5,6,7,8,9,10):
# for i in (2,3,4,5,6,7,8,9,10,15,20,25,30,35,40,45,50,60,70,80,90,100,150,200,250,300,350,400,450,500,1000,1500,2000):
    kmeans = KMeans(n_clusters=i).fit(word2vec_health_df)
    labels = kmeans.labels_
    silhouette_score(word2vec_health_df, labels, metric='euclidean')

KeyboardInterrupt: 

Using affinity propagation:

In [12]:
affinity_prop = AffinityPropagation().fit(word2vec_health_df)
labels = affinity_prop.labels_
silhouette_score(word2vec_health_df, labels, metric='euclidean')

  sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
  sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)


nan

# First attempt at bags
Apply the same logic to bags

kmeans = KMeans(n_clusters=clusters_amount).fit(bags_health_df)
labels = kmeans.labels_
silhouette_score(bags_health_df, labels, metric='euclidean')

Using affinity propagation:

In [13]:
affinity_prop = AffinityPropagation().fit(bags_health_df)
labels = affinity_prop.labels_
silhouettIe_score(bags_health_df, labels, metric='euclidean')

NameError: name 'silhouettIe_score' is not defined

# Now normalizing our input:
Repeating the previous procedures by normalizing the input.

In [7]:
word2vec_health_df_normalized = normalize(word2vec_health_df)
kmeans = KMeans(n_clusters=clusters_amount).fit(word2vec_health_df_normalized)
labels = kmeans.labels_
silhouette_score(word2vec_health_df_normalized, labels, metric='euclidean')

0.0024018701098438024

Using affinity propagation:

In [None]:
affinity_prop = AffinityPropagation().fit(word2vec_health_df_normalized)
labels = affinity_prop.labels_
silhouette_score(word2vec_health_df_normalized, labels, metric='euclidean')

Now for bags:

In [15]:
bags_health_df_normalized = normalize(bags_health_df)
kmeans = KMeans(n_clusters=clusters_amount).fit(word2vec_health_df_normalized)
labels = kmeans.labels_
silhouette_score(word2vec_health_df_normalized, labels, metric='euclidean')

0.0072076559123404145

Using affinity propagation:

In [None]:
affinity_prop = AffinityPropagation().fit(bags_health_df_normalized)
labels = affinity_prop.labels_
silhouette_score(bags_health_df_normalized, labels, metric='euclidean')

# Changing strings to remove useless words
Opening and first look at the dataset

In [16]:
health_df = load_health_data()
health_df = health_df.drop(['id'], axis=1)
print(list(health_df))

['publish_date', 'headline_text']


Delete words without meaningful information for our context

In [17]:
for index, row in health_df.iterrows():
    word_list = row['headline_text'].split()
    deleted_words=["RT", "a", "are", "it", "the", "she", "you", "of", "to", "that's", "-", "on", "I", "he"]
    row['headline_text'] = ' '.join([word for word in word_list if word not in deleted_words])

In [None]:
xToPlotMultinomial2 = np.array([])
yToPlotMultinomial2 = np.array([])

In [8]:
for i in [2,3,4,5,6,7,8,9,10,15,20,25,30,35,40,45,50,60,70,80,90,100,150,200,250,300,350,400,450,500,600,700,800,900,1000,1250,1500,1750,2000]:
    kmeans = KMeans(n_clusters=i).fit(word2vec_health_df_normalized)
    labels = kmeans.labels_
    y = silhouette_score(word2vec_health_df, labels, metric='euclidean')
    print (i, y)
    xToPlotMultinomial2 = np.append(xToPlotMultinomial, i)
    yToPlotMultinomial2 = np.append(yToPlotMultinomial, y)   

2 0.038246900583531696
3 -0.014538619788783435
4 -0.013010726135348157
5 -0.008908890192498211
6 -0.006659035136058353
7 -0.005173445929502646
8 -0.032556246049217634
9 -0.040199265597583016
10 -0.04090538773398011
15 -0.026733162599942267
20 -0.027635459415522836
25 -0.02975097438101436
30 -0.031774227318179915
35 -0.033667889230600566
40 -0.02883703809241266
45 -0.03856803769867616
50 -0.03145587709573127
60 -0.030928482782428712
70 -0.027485753765545955
80 -0.024932157229921224
90 -0.025958078204638416
100 -0.025916640705712587
150 -0.021761862775625942
200 -0.01234354312676646
250 -0.01584840340892631
300 -0.011242529673167667
350 -0.009496436347656314
400 -0.007642925636890754
450 -0.002426457265561866
500 0.0031035095212706802
600 0.008791864601099065
700 0.01105841367168745
800 0.014905105995740757
900 0.020427309029163017
1000 0.023860577705984896
1250 0.03382989655392453
1500 0.035237146069876366
1750 0.04359746515026018
2000 0.04202619642630321
