In [None]:
import html
import logging
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the raw tweet file: one record per line, whitespace-trimmed.
# The encoding is stated explicitly so decoding does not depend on the
# platform default; undecodable bytes become U+FFFD instead of aborting the load.
with open('usnewshealth.txt', 'r', encoding='utf-8', errors='replace') as file:
    tweets = [line.strip() for line in file]

print(f"Total number of tweets: {len(tweets)}")
print("\nTweets:")
# Preview only the first ten raw records.
for i, tweet in enumerate(tweets[:10]):
    print(f"Tweet {i+1}: {tweet}")

Total number of tweets: 1400

Tweets:
Tweet 1: 586278450392133633|Thu Apr 09 21:24:09 +0000 2015|Planning to hire a personal trainer? Read these 7 tips first: http://ow.ly/LpxFq
Tweet 2: 586260156155043843|Thu Apr 09 20:11:28 +0000 2015|RT @AnnaMedaris: Any dads out their who struggled w/ #depression or #anxiety after their kid was born? Let's talk! amiller[at]usnews[dot]coâ€¦
Tweet 3: 586248551811932160|Thu Apr 09 19:25:21 +0000 2015|America's problem with diabetes in one map: http://ow.ly/LoXQG  by @leonardkl @Lindzcook http://pbs.twimg.com/media/CCLF85lW8AAVP4Z.png
Tweet 4: 586229697165586432|Thu Apr 09 18:10:26 +0000 2015|Think water &amp; fiber will cure your constipation? Unlikely. Here's why: http://ow.ly/LoBLH
Tweet 5: 586215972731822080|Thu Apr 09 17:15:53 +0000 2015|About to lose it? Here, try one of these office-approved #relaxation exercises: http://ow.ly/LoBkH #stress
Tweet 6: 586202004583768064|Thu Apr 09 16:20:23 +0000 2015|Should you get your baby's DNA decoded? http://

In [None]:
def clean_tweet(tweet_text):
    """Normalize one raw tweet record into a lowercase, space-separated string.

    A record is expected to look like ``<id>|<timestamp>|<text>``. When fewer
    than two ``|`` separators are present, the whole input is treated as the
    tweet text.

    Steps, in order: drop the id/timestamp prefix, decode HTML entities,
    remove @mentions and URLs, drop the ``#`` sign (keeping the tag word),
    lowercase, remove everything except ``a-z``, ``0-9`` and whitespace, and
    collapse runs of whitespace to a single space.

    Args:
        tweet_text: One raw line from the tweet file.

    Returns:
        The cleaned tweet text with no leading/trailing whitespace.
    """
    # Remove tweet id and timestamp (maxsplit=2 keeps any '|' inside the text)
    parts = tweet_text.split('|', 2)
    content = parts[2] if len(parts) > 2 else tweet_text

    # Decode HTML entities first; otherwise "&amp;" loses its punctuation
    # below and survives as the junk word "amp".
    content = html.unescape(content)

    # Remove words with @
    content = re.sub(r'@\w+', '', content)

    # Remove URLs
    content = re.sub(r'http\S+|www\S+', '', content)

    # Remove hashtag but keep word
    content = content.replace('#', '')

    # Convert to lowercase
    content = content.lower()

    # Remove special characters
    content = re.sub(r'[^a-z0-9\s]', '', content)

    # Removed mentions/URLs/symbols leave gaps; squeeze them to single spaces
    content = re.sub(r'\s+', ' ', content)

    return content.strip()

print("Tweets Cleaned")

Tweets Cleaned


In [None]:
# Run every raw record through the cleaner, preserving the original order.
cleaned_tweets = list(map(clean_tweet, tweets))

# Show the first ten cleaned tweets with 1-based numbering.
print("Tweets:")
for position, text in enumerate(cleaned_tweets[:10], start=1):
    print(f"Tweet {position}: {text}")

Tweets:
Tweet 1: planning to hire a personal trainer read these 7 tips first
Tweet 2: rt  any dads out their who struggled w depression or anxiety after their kid was born lets talk amilleratusnewsdotco
Tweet 3: americas problem with diabetes in one map   by
Tweet 4: think water amp fiber will cure your constipation unlikely heres why
Tweet 5: about to lose it here try one of these officeapproved relaxation exercises  stress
Tweet 6: should you get your babys dna decoded  genetics
Tweet 7: 3 easy ways to eliminate work stress  via
Tweet 8: 7 steps for choosing amp keeping the right personal trainer  by  personaltraining
Tweet 9: getting ready for bikini season dont  with the antibikini season diet
Tweet 10: theres a reason youre still fat in fact 6 of them  weightloss


In [None]:
def jaccard_distance(s1, s2):
    """Return the Jaccard distance between two whitespace-tokenized strings.

    Each string is split on whitespace and reduced to its set of distinct
    words. The distance is ``1 - |A ∩ B| / |A ∪ B|``.

    Args:
        s1: First text.
        s2: Second text.

    Returns:
        A float in [0, 1]; 0.0 when both strings contain no words.
    """
    tokens_a, tokens_b = set(s1.split()), set(s2.split())
    combined = tokens_a | tokens_b

    # Two empty token sets: avoid dividing by zero and report zero distance.
    if not combined:
        return 0.0

    shared = tokens_a & tokens_b
    return 1 - len(shared) / len(combined)

In [None]:
# Pairwise Jaccard distances between all cleaned tweets.
num_tweets = len(cleaned_tweets)
jaccard_matrix = np.zeros((num_tweets, num_tweets))

# The distance is symmetric, so compute the upper triangle only and mirror it.
for i in range(num_tweets):
    for j in range(i, num_tweets):
        dist = jaccard_distance(cleaned_tweets[i], cleaned_tweets[j])
        jaccard_matrix[i, j] = dist
        jaccard_matrix[j, i] = dist

print(f"Jaccard distance matrix of shape: {jaccard_matrix.shape}")
print("\nBlock of the Jaccard distance matrix:")

# Size the preview to the data: with fewer than 10 tweets, ten fixed labels
# would not match the sliced block and DataFrame construction would fail.
preview_size = min(10, num_tweets)
preview_labels = [f"Tweet {k+1}" for k in range(preview_size)]
display(pd.DataFrame(jaccard_matrix[:preview_size, :preview_size],
                     columns=preview_labels,
                     index=preview_labels))

Jaccard distance matrix of shape: (1400, 1400)

Block of the Jaccard distance matrix:


Unnamed: 0,Tweet 1,Tweet 2,Tweet 3,Tweet 4,Tweet 5,Tweet 6,Tweet 7,Tweet 8,Tweet 9,Tweet 10
Tweet 1,0.0,1.0,1.0,1.0,0.909091,1.0,0.944444,0.85,1.0,0.954545
Tweet 2,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Tweet 3,1.0,1.0,0.0,1.0,0.95,1.0,1.0,0.947368,0.941176,0.947368
Tweet 4,1.0,1.0,1.0,0.0,1.0,0.944444,1.0,0.954545,1.0,1.0
Tweet 5,0.909091,1.0,0.95,1.0,0.0,1.0,0.894737,1.0,1.0,0.958333
Tweet 6,1.0,1.0,1.0,0.944444,1.0,0.0,1.0,1.0,1.0,1.0
Tweet 7,0.944444,1.0,1.0,1.0,0.894737,1.0,0.0,1.0,1.0,1.0
Tweet 8,0.85,1.0,0.947368,0.954545,1.0,1.0,1.0,0.0,0.9,1.0
Tweet 9,1.0,1.0,0.941176,1.0,1.0,1.0,1.0,0.9,0.0,1.0
Tweet 10,0.954545,1.0,0.947368,1.0,0.958333,1.0,1.0,1.0,1.0,0.0


In [None]:
def initialize_centroids(data_points, k, random_state=None):
    """Pick k distinct data-point indices to serve as the initial centroids.

    Args:
        data_points: Any sized collection; only its length is used.
        k: Requested number of centroids. Clamped to ``len(data_points)``
            because sampling is done without replacement.
        random_state: Optional seed (or ``numpy.random.Generator``) for a
            reproducible draw. When ``None`` (the default) the global NumPy
            RNG is used, so ``np.random.seed(...)`` still governs the result.

    Returns:
        A NumPy array of ``min(k, len(data_points))`` unique indices.
    """
    num_data_points = len(data_points)
    k = min(k, num_data_points)

    # k random tweets selected as centroids
    if random_state is None:
        return np.random.choice(num_data_points, k, replace=False)

    rng = np.random.default_rng(random_state)
    return rng.choice(num_data_points, k, replace=False)

In [None]:
def assign_to_clusters(jaccard_matrix, centroid_indices, num_data_points):
    """Label each data point with the position of its nearest centroid.

    Args:
        jaccard_matrix: Square matrix of pairwise distances, indexed as
            ``jaccard_matrix[point, point]``.
        centroid_indices: Data-point indices currently acting as centroids.
        num_data_points: Number of points (rows ``0..num_data_points-1``) to label.

    Returns:
        NumPy array where entry ``i`` is the position within
        ``centroid_indices`` of the centroid closest to point ``i``. Ties go
        to the earliest centroid in the list.
    """
    def nearest_slot(point_id):
        # Linear scan; strict '<' keeps the first centroid on equal distances.
        best_slot, best_dist = -1, float('inf')
        for slot, centroid_point in enumerate(centroid_indices):
            candidate = jaccard_matrix[point_id, centroid_point]
            if candidate < best_dist:
                best_slot, best_dist = slot, candidate
        return best_slot

    return np.array([nearest_slot(point_id) for point_id in range(num_data_points)])

In [None]:
def update_centroids(jaccard_matrix, clusters, num_clusters, old_centroid_indices):
    """Recompute each cluster's centroid as its medoid.

    The medoid is the member whose summed distance to all members of the same
    cluster is smallest (the first such member on ties).

    Args:
        jaccard_matrix: Square matrix of pairwise distances.
        clusters: NumPy array of cluster labels, one per data point.
        num_clusters: Number of cluster labels to process (``0..num_clusters-1``).
        old_centroid_indices: Previous centroids, reused for empty clusters.

    Returns:
        NumPy array holding one data-point index per cluster.
    """
    def medoid_of(member_ids):
        # Try every member as the centroid and keep the cheapest one.
        best_id, best_total = -1, float('inf')
        for candidate in member_ids:
            total = 0.0
            for other in member_ids:
                total += jaccard_matrix[candidate, other]
            if total < best_total:
                best_id, best_total = candidate, total
        return best_id

    medoids = []
    for label in range(num_clusters):
        member_ids = np.where(clusters == label)[0]
        if len(member_ids) == 0:
            # Nothing was assigned to this cluster: carry the old centroid over.
            medoids.append(old_centroid_indices[label])
        else:
            medoids.append(medoid_of(member_ids))

    return np.array(medoids)

In [None]:
def k_centroids(jaccard_matrix, num_clusters, max_iterations=100):
    """Cluster points around medoids using a precomputed distance matrix.

    Starting from randomly chosen centroids, the routine alternates between
    recomputing each cluster's centroid and reassigning every point to its
    nearest centroid. It stops when the centroids no longer change or after
    ``max_iterations`` centroid updates.

    Args:
        jaccard_matrix: Square matrix of pairwise distances.
        num_clusters: Requested number of clusters.
        max_iterations: Upper bound on centroid updates; ``0`` returns the
            assignment to the initial random centroids.

    Returns:
        ``(clusters, final_centroids)``: the cluster label of every point and
        the data-point index of every centroid. The labels always correspond
        to the returned centroids.
    """
    num_data_points = jaccard_matrix.shape[0]

    # Initialize centroids
    final_centroids = initialize_centroids(range(num_data_points), num_clusters)

    # The initializer may return fewer centroids than requested (k is clamped
    # to the number of points); iterate over the clusters that actually exist.
    num_clusters = len(final_centroids)

    # Assign before the loop so `clusters` is defined even when
    # max_iterations is 0.
    clusters = assign_to_clusters(jaccard_matrix, final_centroids, num_data_points)

    for _ in range(max_iterations):
        old_centroids = final_centroids

        # Update centroids
        final_centroids = update_centroids(jaccard_matrix, clusters, num_clusters, old_centroids)

        # Check if centroids converge
        if np.array_equal(final_centroids, old_centroids):
            break

        # Centroids moved: refresh the labels so that, if the iteration budget
        # runs out here, the returned labels still match the returned centroids.
        clusters = assign_to_clusters(jaccard_matrix, final_centroids, num_data_points)

    return clusters, final_centroids

In [None]:
def calculate_sse(jaccard_matrix, final_clusters, final_centroids):
    """Sum the squared distance from every point to its cluster's centroid.

    Args:
        jaccard_matrix: Square matrix of pairwise distances.
        final_clusters: Cluster label for each data point.
        final_centroids: Data-point index of the centroid of each cluster.

    Returns:
        The sum of squared point-to-centroid distances over all rows of
        ``jaccard_matrix``.
    """
    total = 0.0
    for point_id in range(jaccard_matrix.shape[0]):
        # label -> centroid's data-point index -> distance lookup
        medoid_id = final_centroids[final_clusters[point_id]]
        total += jaccard_matrix[point_id, medoid_id] ** 2
    return total

In [None]:
# Numbers of clusters (K) to evaluate; the experiment loop runs k_centroids once per value.
k_values = [1, 10, 50, 100, 500]

In [None]:
# The initial centroids are drawn at random; fixing the seed makes the SSE
# values and cluster sizes below repeatable across Restart & Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

sse_values = []
cluster_size_info = []

for k_val in k_values:
    print(f"\nK = {k_val}")

    final_clusters, final_centroids = k_centroids(jaccard_matrix, k_val)

    # Calculate SSE
    sse = calculate_sse(jaccard_matrix, final_clusters, final_centroids)
    sse_values.append(sse)
    print(f"SSE: {sse:.4f}")

    # Count the number of data points in each cluster. minlength keeps
    # trailing empty clusters in the report (bincount alone stops at the
    # highest label that actually occurs).
    cluster_counts = np.bincount(final_clusters, minlength=len(final_centroids))
    current_k_cluster_sizes = {
        f"Cluster {i+1}": int(count) for i, count in enumerate(cluster_counts)
    }
    cluster_size_info.append(current_k_cluster_sizes)

    print("Number of tweets for each cluster:")
    for cluster_name, count in current_k_cluster_sizes.items():
        print(f"{cluster_name}: {count} tweets")


K = 1
SSE: 1209.2106
Number of tweets for each cluster:
Cluster 1: 1400 tweets

K = 10
SSE: 1076.5593
Number of tweets for each cluster:
Cluster 1: 118 tweets
Cluster 2: 66 tweets
Cluster 3: 48 tweets
Cluster 4: 61 tweets
Cluster 5: 116 tweets
Cluster 6: 56 tweets
Cluster 7: 240 tweets
Cluster 8: 270 tweets
Cluster 9: 357 tweets
Cluster 10: 68 tweets

K = 50
SSE: 966.7508
Number of tweets for each cluster:
Cluster 1: 44 tweets
Cluster 2: 14 tweets
Cluster 3: 9 tweets
Cluster 4: 46 tweets
Cluster 5: 10 tweets
Cluster 6: 27 tweets
Cluster 7: 26 tweets
Cluster 8: 46 tweets
Cluster 9: 28 tweets
Cluster 10: 38 tweets
Cluster 11: 7 tweets
Cluster 12: 17 tweets
Cluster 13: 14 tweets
Cluster 14: 101 tweets
Cluster 15: 5 tweets
Cluster 16: 5 tweets
Cluster 17: 5 tweets
Cluster 18: 76 tweets
Cluster 19: 32 tweets
Cluster 20: 25 tweets
Cluster 21: 9 tweets
Cluster 22: 40 tweets
Cluster 23: 14 tweets
Cluster 24: 12 tweets
Cluster 25: 18 tweets
Cluster 26: 42 tweets
Cluster 27: 8 tweets
Cluster 28

In [None]:
# One summary record per evaluated K: the K value, its rounded SSE and the
# per-cluster sizes gathered by the experiment loop.
results_data = [
    {
        'Value of K': k_val,
        'SSE Value': round(sse_values[idx], 4),
        'Cluster Sizes': cluster_size_info[idx],
    }
    for idx, k_val in enumerate(k_values)
]

# Tabular view of the same records.
results_df = pd.DataFrame(results_data)

In [None]:
# Send INFO-level records to kmeans.log; force=True replaces any handlers
# that are already attached to the root logger.
logging.basicConfig(
    filename='kmeans.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Emit one multi-line log record per K: a header line, then one line per cluster.
for entry in results_data:
    header = f"K Value: {entry['Value of K']}, SSE: {entry['SSE Value']:.4f}\n"
    size_lines = [
        f"{cluster}: {size} tweets\n"
        for cluster, size in entry['Cluster Sizes'].items()
    ]
    logging.info(header + "Cluster Sizes:\n" + "".join(size_lines))