## Cleaning the Data Set

In [2]:
#import os
import os

In [3]:
#Define directories
current_dir = os.getcwd()
novel_algorithm_dir = os.path.dirname(current_dir)
data_sets_dir = os.path.join(novel_algorithm_dir, 'Data Sets')

print (current_dir)
print (novel_algorithm_dir)
print (data_sets_dir)

e:\Git\Data_Science_Portfolio\Novel Algorithm\Codes
e:\Git\Data_Science_Portfolio\Novel Algorithm
e:\Git\Data_Science_Portfolio\Novel Algorithm\Data Sets


In [4]:
#Import pandas
import pandas as pd

In [5]:
#Define the relative path to the CSV file
csv_file_name = '2016_US_election_tweets_100k.csv'
csv_file_path = os.path.join(data_sets_dir, csv_file_name)

#Check if the CSV file exists before reading it
if os.path.isfile(csv_file_path):
    #Read the CSV file into a DataFrame
    tweets_data = pd.read_csv(csv_file_path)
    print(tweets_data.head())
else:
    print(f"CSV file '{csv_file_name}' does not exist in the 'Database' folder.")

      id  candidate_id            tweet_id  polarity  subjectivity  \
0  57486             3  770728672433143808  0.000000      0.000000   
1  57536             3  770729424207618048  0.000000      0.000000   
2  57586             3  770730086978977796 -0.472222      0.555556   
3  57632             2  770748036972773376  0.000000      0.000000   
4  57682             1  770748065531723778 -0.125000      0.375000   

   retweet_count  favorite_count  device  retweeted_status_id lang state  \
0              0               0       1                  NaN   tr   NaN   
1              0               0       1                  NaN   en   NaN   
2              0               0       1                  NaN   en   NaN   
3              0               0       0                  NaN   en   NaN   
4              0               0       5                  NaN   en   NaN   

                                          tweet_text           created_at  \
0  Ne farkınız var DAIŞ, El-Kaide, El Nusra, 

In [6]:
#Get disired columns
df = tweets_data[['polarity','subjectivity']]
df

#remove rows with zero values for both columns
df = df[(df.T != 0).any()]
df

Unnamed: 0,polarity,subjectivity
2,-0.472222,0.555556
4,-0.125000,0.375000
5,0.100000,0.400000
6,0.200000,0.500000
7,-0.406250,0.843750
...,...,...
99991,0.000000,1.000000
99993,0.800000,0.750000
99994,-0.266667,0.733333
99996,-0.150000,0.200000


In [7]:
df_old = df.copy()
df_new = df.copy()

## Traditional K-means Clustering for Community Detection

In [8]:
df_old

Unnamed: 0,polarity,subjectivity
2,-0.472222,0.555556
4,-0.125000,0.375000
5,0.100000,0.400000
6,0.200000,0.500000
7,-0.406250,0.843750
...,...,...
99991,0.000000,1.000000
99993,0.800000,0.750000
99994,-0.266667,0.733333
99996,-0.150000,0.200000


In [9]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [10]:
#Custom Euclidean distance function
def euclidean_distance(point1, point2):
    return np.linalg.norm(point1 - point2)

In [11]:
#Custom KMeans class
class CustomKMeans(KMeans):
    def _transform(self, X):
        return np.array([euclidean_distance(X, center) for center in self.cluster_centers_])

In [12]:
#Specify the number of clusters
k = 4  

#Initialize the custom K-means
kmeans_old = CustomKMeans(n_clusters=k, init='random', random_state=0)

#Fit K-means to the data
kmeans_old.fit(df_old)

#Add cluster labels to the DataFrame
df_old['cluster'] = kmeans_old.labels_

#Access the cluster assignments in the 'cluster' column
print(df_old)



       polarity  subjectivity  cluster
2     -0.472222      0.555556        2
4     -0.125000      0.375000        1
5      0.100000      0.400000        1
6      0.200000      0.500000        1
7     -0.406250      0.843750        0
...         ...           ...      ...
99991  0.000000      1.000000        2
99993  0.800000      0.750000        3
99994 -0.266667      0.733333        2
99996 -0.150000      0.200000        1
99998  0.800000      0.750000        3

[50912 rows x 3 columns]


In [13]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

#Silhouette Score
silhouette_avg = silhouette_score(df_old, kmeans_old.labels_)
print(f"Silhouette Score: {silhouette_avg}")

#Calinski-Harabasz Index
calinski_harabasz = calinski_harabasz_score(df_old, kmeans_old.labels_)
print(f"Calinski-Harabasz Index: {calinski_harabasz}")

#Davies-Bouldin Index
davies_bouldin = davies_bouldin_score(df_old, kmeans_old.labels_)
print(f"Davies-Bouldin Index: {davies_bouldin}")

#Inertia
inertia = kmeans_old.inertia_
print(f"Inertia: {inertia}")

Silhouette Score: 0.7471416860185163
Calinski-Harabasz Index: 326325.9562386966
Davies-Bouldin Index: 0.3735642685219846
Inertia: 3062.5300119627705


## New Algorithm Methodolgy of K-means Clustering for Community Detection

In [14]:
df_new

Unnamed: 0,polarity,subjectivity
2,-0.472222,0.555556
4,-0.125000,0.375000
5,0.100000,0.400000
6,0.200000,0.500000
7,-0.406250,0.843750
...,...,...
99991,0.000000,1.000000
99993,0.800000,0.750000
99994,-0.266667,0.733333
99996,-0.150000,0.200000


In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [16]:
#Custom Katz distance function
def katz_distance(point1, point2, A, beta=0.5):
    S = np.linalg.inv(np.eye(A.shape[0]) - beta * A)
    katz_distance = S[point1, point2]
    return katz_distance

In [17]:
#Custom KMeans class
class CustomKMeans(KMeans):
    def _transform(self, X):
        return np.array([katz_distance(X, center, A, beta=0.5) for center in self.cluster_centers_])

In [18]:
#specify the number of clusters
k = 4

In [19]:
#Initialize the custom K-means with spectral clustering results
kmeans_new = CustomKMeans(n_clusters=k, init='k-means++', random_state=0)

In [20]:
#Fit K-means to the data
kmeans_new.fit(df_new)



In [21]:

#Add cluster labels to the DataFrame
df_new['cluster'] = kmeans_new.labels_

#Access the cluster assignments in the 'cluster' column
print(df_new)

       polarity  subjectivity  cluster
2     -0.472222      0.555556        3
4     -0.125000      0.375000        2
5      0.100000      0.400000        2
6      0.200000      0.500000        2
7     -0.406250      0.843750        0
...         ...           ...      ...
99991  0.000000      1.000000        3
99993  0.800000      0.750000        1
99994 -0.266667      0.733333        3
99996 -0.150000      0.200000        2
99998  0.800000      0.750000        1

[50912 rows x 3 columns]


In [22]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

#Silhouette Score
silhouette_avg = silhouette_score(df_new, kmeans_new.labels_)
print(f"Silhouette Score: {silhouette_avg}")

#Calinski-Harabasz Index
calinski_harabasz = calinski_harabasz_score(df_new, kmeans_new.labels_)
print(f"Calinski-Harabasz Index: {calinski_harabasz}")

#Davies-Bouldin Index
davies_bouldin = davies_bouldin_score(df_new, kmeans_new.labels_)
print(f"Davies-Bouldin Index: {davies_bouldin}")

#Inertia
inertia = kmeans_new.inertia_
print(f"Inertia: {inertia}")

Silhouette Score: 0.7471060313989988
Calinski-Harabasz Index: 299860.37213536917
Davies-Bouldin Index: 0.36555131212671843
Inertia: 3062.5363355000295
