# Q2 K-means Clustering
Isaac Tabb

Text As Data

24/01/2023


### Step -1: Read in Data

This code uploads the file from my local drive and then reads it in as a pandas DataFrame.

In [None]:
import io

import pandas as pd
from google.colab import files

# prompt for a file upload from the local machine; the result is indexed
# by the uploaded file's name
uploaded = files.upload()

# wrap the uploaded bytes in an in-memory buffer so pandas can parse the csv
nbatweetsdf = pd.read_csv(io.BytesIO(uploaded['training_set.csv']))

Saving training_set.csv to training_set.csv


Now, let's make sure the data frame makes sense. It comes with an unnamed column holding the old index, so we're going to delete that one.

In [None]:
# the saved index came back as an 'Unnamed: 0' column; discard it
nbatweetsdf = nbatweetsdf.drop(columns='Unnamed: 0')
# one dictionary per tweet, keyed by column name
nbatweets = nbatweetsdf.to_dict(orient='records')

### Step 0: Vectorize Text


Let's start by importing Spacy.

In [None]:
import spacy
# load the "en_core_web_sm" spaCy pipeline; spacy_pipeline() runs every tweet
# through this object
nlp = spacy.load("en_core_web_sm")



And now let's create our spacy pipeline.

In [None]:
def spacy_pipeline(tweet):
    """Tokenize a tweet with spaCy and return its lowercased lemmas.

    Stop words, whitespace tokens, and punctuation are left out.
    """
    return [
        token.lemma_.lower()
        for token in nlp(tweet)
        if not (token.is_stop or token.is_space or token.is_punct)
    ]

Now let's print the first tweet row to make sure the dictionary looks right.


In [None]:
# show the first record so its text and label fields can be checked by eye
print(nbatweets[0])

{'text': '@Lakers @KingJames @AntDavis23 Thanks, @bakersfieldnow &amp; @Jon_Singh19.                       🗞📺🕙🏀', 'team': 'MiamiHeat', 'date_label': 'period3', 'follower_label': 'medium'}


It worked! Now we are going to create our corpus.

In [None]:
# the corpus is the raw text of every tweet, kept in dataset order
corpus = [record['text'] for record in nbatweets]

And let's define our vectorizer using sklearn.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_vectorize_with_sklearn_and_spacy(text_corpus):
  """Fit a TF-IDF vectorizer on text_corpus and return the resulting matrix.

  Tokenization is handed to spacy_pipeline rather than sklearn's default
  tokenizer. Only the transformed matrix is returned; the fitted vectorizer
  is not kept.
  """
  return TfidfVectorizer(tokenizer=spacy_pipeline).fit_transform(text_corpus)

Now let's run our new function on the tweets corpus and print the length and size of matrix.


In [None]:
# run the vectorization using spacy
tweets_tfidf_matrix = tfidf_vectorize_with_sklearn_and_spacy(corpus)
# bare expression on the cell's last line so the notebook displays the
# matrix summary
tweets_tfidf_matrix

<6000x9958 sparse matrix of type '<class 'numpy.float64'>'
	with 51278 stored elements in Compressed Sparse Row format>

And let's just assert that our matrix makes sense.

In [None]:
# NOTE(review): the printed labels say "posts" / "post_tfidf_matrix", but the
# values shown come from nbatweets and tweets_tfidf_matrix
print(f"len(posts)={len(nbatweets)}")
print(f"post_tfidf_matrix.shape={tweets_tfidf_matrix.shape}")

# there must be exactly one matrix row per tweet
assert len(nbatweets) == tweets_tfidf_matrix.shape[0]

len(posts)=6000
post_tfidf_matrix.shape=(6000, 9958)


### Step 1: Pick k random centroids

The way that I will be finding random initial centroids is choosing K random documents in the dataset.

In [None]:
import random as rd

# pick k distinct tweets at random and use their tfidf rows as the
# starting centroids
def k_random_centroids(tweets, k, tfidf_matrix=None):
  """Return k randomly chosen document vectors to use as initial centroids.

  Args:
    tweets: sequence of tweet records. Only its length is used; row i of the
      matrix is taken to be the vector of tweets[i].
    k: number of centroids to pick.
    tfidf_matrix: matrix with one row per tweet. When omitted, the notebook's
      global tweets_tfidf_matrix is used.

  Returns:
    A list of k single-row slices of the matrix, one per chosen tweet.

  Raises:
    ValueError: if k is larger than len(tweets) (raised by random.sample).
  """
  if tfidf_matrix is None:
    tfidf_matrix = tweets_tfidf_matrix

  # sample without repeats over every valid index of the sequence that was
  # passed in, so the last tweet is a candidate too
  chosen_indexes = rd.sample(range(len(tweets)), k)

  # the chosen documents' rows are the starting centroids
  return [tfidf_matrix[i, :] for i in chosen_indexes]

# initialize k as 5 for 5 clusters
# k is a notebook-level variable: later cells read it when counting,
# recalculating and printing the clusters
k = 5
# retrieve the centroids
centroids = k_random_centroids(nbatweets, k)
# prints one sparse-matrix summary per starting centroid
print(centroids)


[<1x9958 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>, <1x9958 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>, <1x9958 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>, <1x9958 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>, <1x9958 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>]


### Step 2: Assign each vector to its closest centroid

Now we can assign all the vectors to centroids.

In [None]:
# find the centroid most similar to one tweet; takes the tweet's row index,
# the current list of centroids, and the tfidf matrix
def assign_vector_to_centroid(tweet_index, centroids, tweets_tfidf_matrix):
  """Return the position in `centroids` of the centroid closest to the tweet.

  Similarity is the dot product of the centroid with the tweet's row. When
  several centroids share the top score, the earliest one is returned.
  """
  # column form of the tweet's row, shared by every dot product below
  tweet_column = tweets_tfidf_matrix[tweet_index, :].T

  # one similarity score per centroid, in centroid order
  similarities = [c.dot(tweet_column)[0, 0] for c in centroids]

  # the largest score marks the closest centroid; its position is the cluster
  return similarities.index(max(similarities))

from tqdm import tqdm

# running tally of how many tweets land in each cluster
clus_counts = [0] * k

# walk the tweets behind a progress bar, label each with its closest
# centroid, and bump that cluster's tally
for row_index, record in enumerate(tqdm(nbatweets)):
  assigned = assign_vector_to_centroid(row_index, centroids, tweets_tfidf_matrix)
  record['cluster'] = assigned
  clus_counts[assigned] += 1


100%|██████████| 6000/6000 [00:12<00:00, 485.46it/s]


### Step 3: Recalculate the centroids based on the closest vectors

First, I define a function to recalculate the centroids.

In [None]:
from scipy.sparse import csr_matrix

# recompute every centroid as the mean of the tweet vectors assigned to it
def recalculate_centroids(nbatweets, clus_counts, k, tfidf_matrix=None):
  """Return k new centroids, each the mean tfidf vector of one cluster.

  Args:
    nbatweets: list of tweet dicts. Each one must already carry a 'cluster'
      key holding its cluster index, and nbatweets[i] is taken to correspond
      to row i of the matrix.
    clus_counts: clus_counts[c] is the number of tweets in cluster c; it is
      the divisor used for the mean.
    k: number of clusters.
    tfidf_matrix: sparse matrix with one row per tweet. When omitted, the
      notebook's global tweets_tfidf_matrix is used.

  Returns:
    A list of k csr_matrix objects of shape (1, n_features). A cluster that
    received no tweets gets an all-zero centroid.
  """
  if tfidf_matrix is None:
    tfidf_matrix = tweets_tfidf_matrix

  # one {column: running sum} dictionary per cluster
  cluster_sums = [{} for _ in range(k)]

  for tweet_index, record in enumerate(tqdm(nbatweets)):
    sums = cluster_sums[record['cluster']]
    # slice the tweet's row once instead of once per token
    row = tfidf_matrix[tweet_index, :]
    _, cols = row.nonzero()
    for col in cols:
      sums[col] = sums.get(col, 0) + row[0, col]

  n_features = tfidf_matrix.shape[1]
  final_rec_centroids = []
  for clus in range(k):
    sums = cluster_sums[clus]
    if not sums:
      # nothing was assigned to this cluster: leave the centroid empty
      final_rec_centroids.append(csr_matrix((1, n_features)))
      continue

    cols = list(sums)
    # average the sums by how many tweets were in that cluster
    means = [sums[col] / clus_counts[clus] for col in cols]
    # build the sparse row in a single constructor call; this avoids
    # per-element assignment into a CSR matrix, which SciPy warns is expensive
    final_rec_centroids.append(
        csr_matrix((means, ([0] * len(cols), cols)), shape=(1, n_features)))

  # return the new average centroids
  return final_rec_centroids


Now I will call the recalculate centroids function on the tweets.

In [None]:
# first recalculation: average the vectors that the initial assignment pass
# put into each cluster
rec_centroids = recalculate_centroids(nbatweets, clus_counts, k)

100%|██████████| 6000/6000 [00:07<00:00, 816.79it/s]
  self._set_intXint(row, col, x.flat[0])


### Step 4: Repeat Steps 2 & 3 until convergence

Here is a function which calculates if two lists of matrices are equal.

In [None]:
# this function takes in recalculated centroids and the previous centroids
# and checks whether the clustering has converged
def check_converge(recs, cents):
  """Return True when every recalculated centroid equals its predecessor.

  Centroids are compared position by position (recs[i] against cents[i]),
  because position i is cluster i in both lists. Comparing every pair would
  over-count when a list holds duplicate centroids, for example two all-zero
  centroids of empty clusters, and could report convergence while another
  centroid is still changing.

  Args:
    recs: list of recalculated centroids (sparse matrices).
    cents: list of centroids from the previous pass (sparse matrices).

  Returns:
    True if both lists have the same length and every pair of same-position
    centroids has no differing entry; False otherwise.
  """
  if len(recs) != len(cents):
    return False
  # https://stackoverflow.com/questions/30685024/check-if-two-scipy-sparse-csr-matrix-are-equal
  # two sparse matrices are equal when their != comparison stores no entries
  return all((cent != rec).nnz == 0 for cent, rec in zip(cents, recs))

Now we will run Step 4.

In [None]:
# upper bound on refinement passes, so the loop cannot run forever if the
# centroids never become exactly equal between two passes
max_passes = 300

# check if the recalculated centroids are equal to the last pass centroids
conv = check_converge(rec_centroids, centroids)
passes = 0

# while the recalculated centroids and the old ones don't converge
while not conv and passes < max_passes:
  # set the centroids list to the new recalculated centroids
  centroids = rec_centroids

  # repeat step 2: reassign every tweet and recount the clusters
  clus_counts = [0] * k
  for tweet in tqdm(range(len(nbatweets))):
    nbatweets[tweet]['cluster'] = assign_vector_to_centroid(tweet, centroids, tweets_tfidf_matrix)
    clus_counts[nbatweets[tweet]['cluster']] += 1

  # repeat step 3: average each cluster again and compare with the last pass
  rec_centroids = recalculate_centroids(nbatweets, clus_counts, k)
  conv = check_converge(rec_centroids, centroids)
  passes += 1

# make it visible when the cap, rather than convergence, ended the loop
if not conv:
  print(f"k-means stopped after {max_passes} passes without converging")
  
  


100%|██████████| 6000/6000 [00:12<00:00, 482.27it/s]
100%|██████████| 6000/6000 [00:07<00:00, 770.37it/s]
100%|██████████| 6000/6000 [00:12<00:00, 488.62it/s]
100%|██████████| 6000/6000 [00:08<00:00, 714.93it/s]
100%|██████████| 6000/6000 [00:12<00:00, 491.49it/s]
100%|██████████| 6000/6000 [00:07<00:00, 752.97it/s]
100%|██████████| 6000/6000 [00:12<00:00, 485.62it/s]
100%|██████████| 6000/6000 [00:11<00:00, 501.70it/s]
100%|██████████| 6000/6000 [00:12<00:00, 486.91it/s]
100%|██████████| 6000/6000 [00:08<00:00, 722.57it/s]
100%|██████████| 6000/6000 [00:12<00:00, 482.86it/s]
100%|██████████| 6000/6000 [00:08<00:00, 734.67it/s]
100%|██████████| 6000/6000 [00:12<00:00, 479.57it/s]
100%|██████████| 6000/6000 [00:07<00:00, 755.33it/s]
100%|██████████| 6000/6000 [00:12<00:00, 487.77it/s]
100%|██████████| 6000/6000 [00:08<00:00, 742.38it/s]
100%|██████████| 6000/6000 [00:12<00:00, 483.25it/s]
100%|██████████| 6000/6000 [00:08<00:00, 726.89it/s]
100%|██████████| 6000/6000 [00:12<00:00, 483.3

In [None]:
# number of tweets per cluster after the last assignment pass
print(clus_counts)

[2107, 619, 1046, 667, 1561]


### Looking at the distribution

Let's look at some of the documents in the clusters.

In [None]:
# show the text and the cluster number of the first 40 tweets, one per line
for position in range(40):
  record = nbatweets[position]
  print(record['text'], record['cluster'], sep='\n')

Let's look at the top words in each cluster.

First, define a method to find the five max magnitudes for a certain centroid.

In [None]:
def second_elem(e):
    """Return the second item of e; used as the sort key for [column, value] pairs."""
    return e[1]

# this function finds the largest entries of a centroid
def five_maxes(rec_centroid, k, top_n=5):
  """Return the top_n largest non-zero entries of a centroid.

  Args:
    rec_centroid: single-row sparse matrix (a centroid).
    k: unused; kept so existing calls that pass it keep working.
    top_n: how many entries to return (five by default).

  Returns:
    A list of [column, value] pairs sorted from the largest value down. It
    holds fewer than top_n pairs when the centroid has fewer non-zero
    entries.
  """
  import heapq

  # gets all of the nonzero columns of the centroid's row
  _, cols = rec_centroid[0, :].nonzero()
  pairs = [[col, rec_centroid[0, col]] for col in cols]
  # nlargest ranks every candidate, so a large value is never skipped just
  # because of the order in which the columns are visited
  return heapq.nlargest(top_n, pairs, key=second_elem)


Define our vectorizer again.

In [None]:
# NOTE(review): this fits a second vectorizer on the same corpus, because
# tfidf_vectorize_with_sklearn_and_spacy returns only the matrix and not the
# vectorizer it fitted; the object is kept here for the feature-name lookup
# in the top-terms cell
vectorizer = TfidfVectorizer(tokenizer = spacy_pipeline)
X = vectorizer.fit_transform(corpus)

Then iterate through our centroids finding the top 5 magnitudes for each.

In [None]:
# column index -> term lookup, fetched once instead of once per term
feature_names = vectorizer.get_feature_names_out()

for i in range(k):
  maxes = five_maxes(rec_centroids[i], k)
  # find the actual terms that have the high magnitudes; the loop variables
  # are deliberately not called `max`, which would shadow the builtin for
  # every cell that runs afterwards
  max_terms = [[feature_names[col], weight] for col, weight in maxes]

  print(max_terms)

[['game', 0.07429247140526307], ['heat', 0.052523823893616366], ['laker', 0.04390201429783622], ['final', 0.04321583022745221], ['nba', 0.037732982529937675]]
[['good', 0.08795186632143101], ['player', 0.02920071365269986], ['time', 0.028479433422451118], ['great', 0.018762881757477406], ['bam', 0.01857516419344807]]
[['herro', 0.053054372854072274], ['jimmy', 0.05086146294566495], ['tyler', 0.039160341898404415], ['😂', 0.03547664279140322], ['butler', 0.03311515883286048]]
[['team', 0.07124559002642394], ['love', 0.03211651600473568], ['podcast', 0.02159350592628673], ['murray', 0.020847178457307162], ['nugget', 0.018334599764601816]]
[['🔥', 0.11043202073074147], ['let', 0.09978538202567205], ['win', 0.048848666669937583], ['@miamiheat', 0.04876991577260555], ['🏀', 0.04347465642890756]]


Let's also see the label distribution.

In [None]:
# one list of team labels per cluster, sized by k rather than hard-coded
clusters_labels = [[] for _ in range(k)]
for tweet in nbatweets:
  # the cluster number is the position of that cluster's label list
  clusters_labels[tweet['cluster']].append(tweet['team'])

# print the clusters label count, one cluster after another
for labels in clusters_labels:
  print(pd.Series(labels).value_counts())

Write dictionary to a file to keep the cluster data.


In [None]:
import csv

# column names come from the first record; every record carries the same
# keys, including the 'cluster' key added during clustering
keys = nbatweets[0].keys()

# tweets contain emoji, so the encoding is fixed to utf-8 instead of being
# left to the platform default; newline='' lets the csv module control line
# endings itself
with open('nbatweets_kmeans.csv', 'w', newline='', encoding='utf-8') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(nbatweets)