# Before Running
Add the files `health.txt`, `bags.csv`, and `word2vec.csv` to the `health-dataset` folder before running this notebook.

# Introduction
Here, we'll explore the k-means clustering algorithm applied to health news on Twitter.

In [19]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize

In [16]:
def load_health_data():
    """Load the pipe-separated health news file as a DataFrame.

    Returns:
        pandas.DataFrame parsed from 'health-dataset/health.txt'.
    """
    health_path = 'health-dataset/health.txt'
    return pd.read_csv(health_path, sep='|')
def load_word2vec_data():
    """Load the word2vec feature table as a DataFrame.

    Returns:
        pandas.DataFrame parsed from 'health-dataset/word2vec.csv'.
    """
    word2vec_path = 'health-dataset/word2vec.csv'
    return pd.read_csv(word2vec_path)
def load_bags_data():
    """Load the bags feature table as a DataFrame.

    Returns:
        pandas.DataFrame parsed from 'health-dataset/bags.csv'.
    """
    bags_path = 'health-dataset/bags.csv'
    return pd.read_csv(bags_path)

First, we load our data and set the number of clusters for K-means clustering:

In [None]:
# Number of clusters requested from every K-means run in this notebook.
clusters_amount = 100

# Feature tables read from bags.csv and word2vec.csv in the health-dataset folder.
bags_health_df = load_bags_data()
word2vec_health_df = load_word2vec_data()

# First attempt at word2vec
First, we'll get results using the provided word2vec features.
Using K-means, we have:

In [14]:
# Cluster the word2vec features into `clusters_amount` groups with K-means.
kmeans = KMeans(n_clusters=clusters_amount).fit(word2vec_health_df)
labels = kmeans.labels_
# Score the clustering with the silhouette coefficient (Euclidean distance).
# `silhouette_score` is imported by name in the import cell; the `metrics`
# namespace itself is not imported, so it must not be used as a prefix here.
silhouette_score(word2vec_health_df, labels, metric='euclidean')

0.03299543980949137

Using affinity propagation:

In [None]:
# Affinity propagation with default parameters (no cluster count is supplied).
affinity_prop = AffinityPropagation()
affinity_prop.fit(word2vec_health_df)
labels = affinity_prop.labels_
# Silhouette coefficient of the resulting assignment, shown as the cell output.
silhouette_score(word2vec_health_df, labels, metric='euclidean')

# First attempt at bags
We apply the same procedure to the bags data, starting with K-means:

# K-means on the bags features, using the shared `clusters_amount`.
kmeans = KMeans(n_clusters=clusters_amount)
kmeans.fit(bags_health_df)
labels = kmeans.labels_
# Silhouette coefficient of the resulting assignment, shown as the cell output.
silhouette_score(bags_health_df, labels, metric='euclidean')

Using affinity propagation:

In [None]:
# Affinity propagation with default parameters on the bags features.
affinity_prop = AffinityPropagation()
affinity_prop.fit(bags_health_df)
labels = affinity_prop.labels_
# Silhouette coefficient of the resulting assignment, shown as the cell output.
silhouette_score(bags_health_df, labels, metric='euclidean')

# Now normalizing our input:
We now repeat the previous procedures on normalized input.

In [None]:
# Normalize the word2vec features with sklearn's `normalize` (default settings).
word2vec_health_df_normalized = normalize(word2vec_health_df)
# Cluster the normalized features into `clusters_amount` groups with K-means.
kmeans = KMeans(n_clusters=clusters_amount).fit(word2vec_health_df_normalized)
labels = kmeans.labels_
# `silhouette_score` is imported by name in the import cell; the `metrics`
# namespace itself is not imported, so it must not be used as a prefix here.
silhouette_score(word2vec_health_df_normalized, labels, metric='euclidean')

Using affinity propagation:

In [None]:
# Affinity propagation with default parameters on the normalized word2vec features.
affinity_prop = AffinityPropagation()
affinity_prop.fit(word2vec_health_df_normalized)
labels = affinity_prop.labels_
# Silhouette coefficient of the resulting assignment, shown as the cell output.
silhouette_score(word2vec_health_df_normalized, labels, metric='euclidean')

Now for bags:

In [None]:
# Normalize the bags features with sklearn's `normalize` (default settings).
bags_health_df_normalized = normalize(bags_health_df)
# Fit and score on the normalized *bags* matrix computed above, not on the
# word2vec matrix from the previous section.
kmeans = KMeans(n_clusters=clusters_amount).fit(bags_health_df_normalized)
labels = kmeans.labels_
# `silhouette_score` is imported by name in the import cell; the `metrics`
# namespace itself is not imported, so it must not be used as a prefix here.
silhouette_score(bags_health_df_normalized, labels, metric='euclidean')

Using affinity propagation:

In [None]:
# Affinity propagation with default parameters on the normalized bags features.
affinity_prop = AffinityPropagation()
affinity_prop.fit(bags_health_df_normalized)
labels = affinity_prop.labels_
# Silhouette coefficient of the resulting assignment, shown as the cell output.
silhouette_score(bags_health_df_normalized, labels, metric='euclidean')

# Changing strings to remove useless words
We open the dataset and take a first look at it.

In [56]:
# Read the raw headlines and discard the 'id' column in one step.
health_df = load_health_data().drop(columns=['id'])
# Show the names of the remaining columns.
print(list(health_df.columns))

['publish_date', 'headline_text']


Next, we delete words that carry no meaningful information in our context.

In [57]:
# Tokens dropped from every headline; matching is exact and case-sensitive.
deleted_words = ["RT", "a", "are", "it", "the", "she", "you", "of", "to", "that's", "-", "on", "I", "he"]


def _strip_deleted_words(headline):
    """Return `headline` without the whitespace-separated tokens listed in `deleted_words`.

    The remaining tokens are re-joined with single spaces.
    """
    return ' '.join(word for word in headline.split() if word not in deleted_words)


# Assign the cleaned column back to the DataFrame. Writing to the `row` yielded
# by `iterrows()` is not reliable: pandas may hand back a copy, in which case
# the edit never reaches `health_df`.
# `na_action='ignore'` leaves missing headlines untouched instead of failing on
# `.split()`.
health_df['headline_text'] = health_df['headline_text'].map(_strip_deleted_words, na_action='ignore')