# Before Running
Add files health.txt, bags.csv and word2vec.csv to the health-dataset folder.

# Introduction
Here, we'll explore the k-means clustering algorithm applied to health news on Twitter.

In [2]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize

In [3]:
def load_health_data(path='health-dataset/health.txt'):
    """Read the pipe-separated health news file into a DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the file. Defaults to ``health-dataset/health.txt``,
        the path this notebook expects.

    Returns
    -------
    pandas.DataFrame
        The parsed file; its first line is used as the column header.
    """
    return pd.read_csv(path, sep='|')
def load_word2vec_data(path='health-dataset/word2vec.csv'):
    """Read the header-less word2vec CSV into a DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the file. Defaults to ``health-dataset/word2vec.csv``,
        the path this notebook expects.

    Returns
    -------
    pandas.DataFrame
        The parsed file with integer column labels (no header row is read).
    """
    return pd.read_csv(path, header=None)
def load_bags_data(path='health-dataset/bags.csv'):
    """Read the header-less bags CSV into a DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the file. Defaults to ``health-dataset/bags.csv``,
        the path this notebook expects.

    Returns
    -------
    pandas.DataFrame
        The parsed file with integer column labels (no header row is read).
    """
    return pd.read_csv(path, header=None)

First, we load our data and set the number of clusters for K-means clustering:

In [4]:
# Number of clusters requested from K-means in the experiments below.
clusters_amount = 100

# Feature matrices read from the health-dataset folder (both header-less CSVs).
bags_health_df = load_bags_data()
word2vec_health_df = load_word2vec_data()

# First attempt at word2vec
First, we'll compute the results using the provided word2vec data.
Using KMeans we have:

In [9]:
# Fit K-means ten times on the word2vec data and collect the silhouette score
# of each fit. Every run gets its own fixed seed, so the spread across
# initialisations is still visible but the numbers are reproducible on rerun.
results = []
for seed in range(10):
    kmeans = KMeans(n_clusters=clusters_amount, random_state=seed).fit(word2vec_health_df)
    labels = kmeans.labels_
    score = silhouette_score(word2vec_health_df, labels, metric='euclidean')
    results.append(score)
print(results)

[-0.015940999601083715, -0.014425208883304324, -0.00974037399571396, -0.013069620033711508, -0.011229630601096406, -0.008885638388808596, -0.017555214676478446, -0.012369002628226383, -0.011897034097081188, -0.011048152533666502]


Using affinity propagation:

In [6]:
# Cluster the word2vec data with affinity propagation (default settings) and
# show the silhouette score of the resulting labels.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt.
affinity_prop = AffinityPropagation()
affinity_prop.fit(word2vec_health_df)
labels = affinity_prop.labels_
silhouette_score(word2vec_health_df, labels, metric='euclidean')

KeyboardInterrupt: 

# First attempt at bags
We now apply the same procedure to the bags data.

In [10]:
# Fit K-means ten times on the bags data and collect the silhouette score of
# each fit. Every run gets its own fixed seed, so the spread across
# initialisations is still visible but the numbers are reproducible on rerun.
results = []
for seed in range(10):
    kmeans = KMeans(n_clusters=clusters_amount, random_state=seed).fit(bags_health_df)
    labels = kmeans.labels_
    score = silhouette_score(bags_health_df, labels, metric='euclidean')
    results.append(score)
print(results)

[0.043758999024727734, 0.04773173142022052, 0.04167101899323025, 0.04534850142119255, 0.04437748881012685, 0.048156923887279594, 0.057161779497560364, 0.04707591490979031, 0.048061081165205334, 0.048292683122344894]


Using affinity propagation:

In [35]:
# Cluster the bags data with affinity propagation (default settings) and show
# the silhouette score of the resulting labels.
# NOTE(review): the recorded output of this cell is nan, preceded by warnings
# from the silhouette computation; the cause is not established here.
affinity_prop = AffinityPropagation()
affinity_prop.fit(bags_health_df)
labels = affinity_prop.labels_
silhouette_score(bags_health_df, labels, metric='euclidean')

  sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
  sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)


nan

# Now normalizing our input:
Repeating the previous procedure for word2vec, after normalizing the input.

In [11]:

# Normalize the word2vec rows with sklearn's `normalize` (default settings)
# before clustering.
word2vec_health_df_normalized = normalize(word2vec_health_df)

# Fit K-means ten times on the normalized data and collect the silhouette
# score of each fit. Every run gets its own fixed seed, so the spread across
# initialisations is still visible but the numbers are reproducible on rerun.
results = []
for seed in range(10):
    kmeans = KMeans(n_clusters=clusters_amount, random_state=seed).fit(word2vec_health_df_normalized)
    labels = kmeans.labels_
    score = silhouette_score(word2vec_health_df_normalized, labels, metric='euclidean')
    results.append(score)
print(results)

[0.0077039713188821425, 0.0066941196706847925, 0.006419732901675516, 0.005112464025930685, 0.004105530803775869, 0.0038602845216841814, 0.0026670204748496375, 0.00574229159399545, 0.006119454302332454, 0.007489515755445596]


Using affinity propagation:

In [37]:
# Cluster the normalized word2vec data with affinity propagation (default
# settings) and show the silhouette score of the resulting labels.
# NOTE(review): the recorded output of this cell is nan, preceded by warnings
# from the silhouette computation; the cause is not established here.
affinity_prop = AffinityPropagation()
affinity_prop.fit(word2vec_health_df_normalized)
labels = affinity_prop.labels_
silhouette_score(word2vec_health_df_normalized, labels, metric='euclidean')

  sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
  sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)


nan

# Changing strings to remove useless words
Opening and first look at the dataset

In [40]:
# Load the raw health news data and discard the `id` column, then list the
# columns that remain.
health_df = load_health_data().drop(columns=['id'])
print(list(health_df.columns))

['publish_date', 'headline_text']


Delete words without meaningful information for our context

In [None]:
# Tokens treated as noise for this analysis. Matching is exact and
# case-sensitive, on whitespace-separated tokens.
deleted_words = ["RT", "a", "are", "it", "the", "she", "you", "of", "to", "that's", "-", "on", "I", "he"]

# Rebuild each headline without the noise tokens and assign the result back to
# the column. Writing to the `row` yielded by `iterrows()` is not guaranteed to
# update the DataFrame (the row may be a copy), so the cleaned text could be
# silently discarded; an explicit column assignment always takes effect.
health_df['headline_text'] = [
    ' '.join(word for word in headline.split() if word not in deleted_words)
    for headline in health_df['headline_text']
]