In [None]:
# Importing the libraries
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
import pandas as pd
import numpy as np
import random as rd
import re
import math
import string


# removes the irrelevant info from tweets liks timestamp and tweetid
# remove links 
# removes any special characters present
# covert all text to lowercase  
def PreprocessingTweets(inputdf):
  df = inputdf[0].str.split('|', expand=True)
  print("\nAfter Splitting the data with '|' delimeter \n")
  display(df)
  #splitting results in  index|id|timestamp|text|none
  #we only need column 2 
  df.drop([0,1,3, 4], axis=1,inplace=True)
  print("\nAfter dropping the irrelevant columns \n")
  display(df)

  #replace special characters with empty string
  df['text']=df[2].str.replace('(\@\w+.*?)',"")

  #clean_test has tweet and a link , seperate those two 
  df['links'] = df['text'].apply(lambda x: re.split('http:\/\/.*', str(x))[0])
  df['tweets']=df['links'].str.replace('#',"")
  df['clear_text']=df['tweets'].str.replace('RT :',"")

  #display(df)
  return df 

#Compute the jaccard distance 
def computeJaccardDistance(src,dst):
  union = set().union(src,dst)
  intersect = set(src).intersection(dst)
  return 1 - (len(intersect) / len(union))

#Check for convergence
def CheckForConvergence(oldCentroid,newCentroid):
  if len(oldCentroid) != len(newCentroid):
    return False

  for index in range(len(newCentroid)):
    if " ".join(newCentroid[index]) != " ".join(oldCentroid[index]):
      return False
  return True

#allocate initrial centroids 
def AllocateCentroid(tweets, centroids):
  clusters = dict()
  for tweetIndex in range(len(tweets)):
    minimumDistance = math.inf
    clusterIndex = -1;
    for centroidsIndex in range(len(centroids)):
      jaccardDistance = computeJaccardDistance(centroids[centroidsIndex], tweets[tweetIndex])
      if centroids[centroidsIndex] == tweets[tweetIndex]:
        #print("centroidsIndex: " + str(centroidsIndex) + ", tweetIndex : " + str(tweetIndex))
        clusterIndex = centroidsIndex
        minimumDistance = 0
        break

      if jaccardDistance < minimumDistance:
        clusterIndex = centroidsIndex
        minimumDistance = jaccardDistance

    if minimumDistance == 1:
        clusterIndex = rd.randint(0, len(centroids) - 1)

    clusters.setdefault(clusterIndex, []).append([tweets[tweetIndex]])
    lastTweetIndex = len(clusters.setdefault(clusterIndex, [])) - 1
    clusters.setdefault(clusterIndex, [])[lastTweetIndex].append(minimumDistance)

  return clusters

#Find the new centroids and update the current one 
def UpdateCentroid(clusterComponents):
  centroids = []
  for i in range(len(clusterComponents)):
    minTotalDistance = math.inf
    centroidIndex = -1
    minimumDistance = []
    for l in range(len(clusterComponents[i])):
      minimumDistance.append([])
      totalDistance = 0
      for m in range(len(clusterComponents[i])):
        if l != m:
          if m < l:
            distance = minimumDistance[m][l]
          else:
            distance = computeJaccardDistance(clusterComponents[i][l][0], clusterComponents[i][m][0])
          minimumDistance[l].append(distance)
          totalDistance = totalDistance + distance
        else:
          minimumDistance[l].append(0)

        if totalDistance < minTotalDistance:
          minTotalDistance = totalDistance
          centroidIndex = l

    centroids.append(clusterComponents[i][centroidIndex][0])
  return centroids

#Main Algorithm 
def kMeansAlgorithm(tweets ,k=3, maxIterations=30):
  curIterration = 0 #iter_count 
  givenTweet = tweets #list_of_twitter_tweets
  centroidsList = [] #list_of_centroids
  centroidListHistory = [] #prev_list_of_centroids
  tweetIndexMap = dict() #hash_map

  clusterCount = 0;

  while clusterCount < k:
    index = rd.randint(0, len(givenTweet) - 1)
    if index not in tweetIndexMap:
      clusterCount = clusterCount + 1
      tweetIndexMap[index] = True
      centroidsList.append(givenTweet[index])

  while (CheckForConvergence(centroidListHistory, centroidsList)) == False and (curIterration < maxIterations):
    clusterComponents = AllocateCentroid(givenTweet, centroidsList)

    #print("Cluster Component \n")
    #print(clusterComponents)
    centroidListHistory = centroidsList
    centroidsList = UpdateCentroid(clusterComponents)
    curIterration = curIterration + 1
  
  error = 0
  for s in range(len(clusterComponents)):
    for t in range(len(clusterComponents[s])):
      error = error + (clusterComponents[s][t][1] * clusterComponents[s][t][1])
  return clusterComponents,error

#get size of the cluster 
def GetClusterSize(clusters):
  rVal = ''
  for i in range(0,len(list(clusters.keys()))):
    rVal = rVal + 'Cluster '+str(i)+' : '+str(len(clusters[i]))+' tweets  '
  return rVal

# Execution starts from here 
inputDataFrame  = pd.read_csv('https://raw.githubusercontent.com/pdsouza01/CS6375_ML/main/cnnhealth.txt', header=None, sep='\n')
print("\nGiven Dataframe\n")
display(inputDataFrame)

processedData = PreprocessingTweets(inputDataFrame)
print("\nDataframe after preprocessing \n")
display(processedData)

kValue=[]
squaredErrorList=[]
clusterList=[]
for i in range(5,11):
  clusters, squaredError = kMeansAlgorithm(processedData['clear_text'].tolist() , k=i)
  kValue.append(2+i)
  squaredErrorList.append(round(squaredError,3))
  clusterList.append(GetClusterSize(clusters))

results = pd.DataFrame(list(zip(kValue, squaredErrorList, clusterList)), columns=['K', 'SSE', 'ClusterSizes'])
print("\nResults\n")
display(results)


Given Dataframe



Unnamed: 0,0
0,576880531301801984|Sat Mar 14 23:00:11 +0000 2...
1,576820122666471424|Sat Mar 14 19:00:08 +0000 2...
2,576744652717461504|Sat Mar 14 14:00:15 +0000 2...
3,576736754436304896|Sat Mar 14 13:28:52 +0000 2...
4,576736614766010368|Sat Mar 14 13:28:18 +0000 2...
...,...
4056,239699936671854593|Sun Aug 26 12:24:52 +0000 2...
4057,239504620710420480|Sat Aug 25 23:28:46 +0000 2...
4058,239410205757145088|Sat Aug 25 17:13:35 +0000 2...
4059,239386320416428032|Sat Aug 25 15:38:41 +0000 2...



After Splitting the data with '|' delimeter 



Unnamed: 0,0,1,2,3,4
0,576880531301801984,Sat Mar 14 23:00:11 +0000 2015,An abundance of online info can turn us into e...,,
1,576820122666471424,Sat Mar 14 19:00:08 +0000 2015,A plant-based diet that incorporates fish may ...,,
2,576744652717461504,Sat Mar 14 14:00:15 +0000 2015,It doesn't take much to damage your hearing at...,,
3,576736754436304896,Sat Mar 14 13:28:52 +0000 2015,RT @CNN: Forever young? Discover this island’s...,,
4,576736614766010368,Sat Mar 14 13:28:18 +0000 2015,RT @CNN: Is post-traumatic stress disorder in ...,,
...,...,...,...,...,...
4056,239699936671854593,Sun Aug 26 12:24:52 +0000 2012,RT @EverydayHealth: Want killer abs? @JillianM...,,
4057,239504620710420480,Sat Aug 25 23:28:46 +0000 2012,Medicare at stake -- @sanjayguptaCNN talks abo...,,
4058,239410205757145088,Sat Aug 25 17:13:35 +0000 2012,Ann Romney talks about her experience with MS ...,,
4059,239386320416428032,Sat Aug 25 15:38:41 +0000 2012,Make sure your first marathon isn't your last!...,,



After dropping the irrelevant columns 



Unnamed: 0,2
0,An abundance of online info can turn us into e...
1,A plant-based diet that incorporates fish may ...
2,It doesn't take much to damage your hearing at...
3,RT @CNN: Forever young? Discover this island’s...
4,RT @CNN: Is post-traumatic stress disorder in ...
...,...
4056,RT @EverydayHealth: Want killer abs? @JillianM...
4057,Medicare at stake -- @sanjayguptaCNN talks abo...
4058,Ann Romney talks about her experience with MS ...
4059,Make sure your first marathon isn't your last!...



Dataframe after preprocessing 





Unnamed: 0,2,text,links,tweets,clear_text
0,An abundance of online info can turn us into e...,An abundance of online info can turn us into e...,An abundance of online info can turn us into e...,An abundance of online info can turn us into e...,An abundance of online info can turn us into e...
1,A plant-based diet that incorporates fish may ...,A plant-based diet that incorporates fish may ...,A plant-based diet that incorporates fish may ...,A plant-based diet that incorporates fish may ...,A plant-based diet that incorporates fish may ...
2,It doesn't take much to damage your hearing at...,It doesn't take much to damage your hearing at...,It doesn't take much to damage your hearing at...,It doesn't take much to damage your hearing at...,It doesn't take much to damage your hearing at...
3,RT @CNN: Forever young? Discover this island’s...,RT : Forever young? Discover this island’s sec...,RT : Forever young? Discover this island’s sec...,RT : Forever young? Discover this island’s sec...,Forever young? Discover this island’s secrets...
4,RT @CNN: Is post-traumatic stress disorder in ...,RT : Is post-traumatic stress disorder in your...,RT : Is post-traumatic stress disorder in your...,RT : Is post-traumatic stress disorder in your...,Is post-traumatic stress disorder in your gen...
...,...,...,...,...,...
4056,RT @EverydayHealth: Want killer abs? @JillianM...,RT : Want killer abs? shows you how get them:...,RT : Want killer abs? shows you how get them:,RT : Want killer abs? shows you how get them:,Want killer abs? shows you how get them:
4057,Medicare at stake -- @sanjayguptaCNN talks abo...,Medicare at stake -- talks about politicians'...,Medicare at stake -- talks about politicians'...,Medicare at stake -- talks about politicians'...,Medicare at stake -- talks about politicians'...
4058,Ann Romney talks about her experience with MS ...,Ann Romney talks about her experience with MS ...,Ann Romney talks about her experience with MS,Ann Romney talks about her experience with MS,Ann Romney talks about her experience with MS
4059,Make sure your first marathon isn't your last!...,Make sure your first marathon isn't your last!...,Make sure your first marathon isn't your last!,Make sure your first marathon isn't your last!,Make sure your first marathon isn't your last!



Results



Unnamed: 0,K,SSE,ClusterSizes
0,7,625.327,Cluster 0 : 249 tweets Cluster 1 : 1564 tweet...
1,8,609.891,Cluster 0 : 1380 tweets Cluster 1 : 1562 twee...
2,9,585.524,Cluster 0 : 209 tweets Cluster 1 : 896 tweets...
3,10,578.414,Cluster 0 : 127 tweets Cluster 1 : 126 tweets...
4,11,574.5,Cluster 0 : 495 tweets Cluster 1 : 181 tweets...
5,12,545.3,Cluster 0 : 1019 tweets Cluster 1 : 44 tweets...


In [None]:
display(results)

Unnamed: 0,K,SSE,ClusterSizes
0,7,625.327,Cluster 0 : 249 tweets Cluster 1 : 1564 tweet...
1,8,609.891,Cluster 0 : 1380 tweets Cluster 1 : 1562 twee...
2,9,585.524,Cluster 0 : 209 tweets Cluster 1 : 896 tweets...
3,10,578.414,Cluster 0 : 127 tweets Cluster 1 : 126 tweets...
4,11,574.5,Cluster 0 : 495 tweets Cluster 1 : 181 tweets...
5,12,545.3,Cluster 0 : 1019 tweets Cluster 1 : 44 tweets...
