In [None]:
#installing pyspark
!pip -q install pyspark

[K     |████████████████████████████████| 212.3MB 78kB/s 
[K     |████████████████████████████████| 204kB 45.1MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
#Importing Necessary Libraries
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.functions import *
import re
from pyspark.sql.types import IntegerType, FloatType
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler

In [None]:
# Location of the dataset (CSV file)
path = "/content/dailykos.csv"

In [None]:
# Start (or reuse) a Spark session running on the local master
sc = (
    SparkSession.builder
    .master('local')
    .appName('DailyKos')
    .getOrCreate()
)

In [None]:
# Read the CSV file: the first line is the header and column types are inferred
df = sc.read.csv(path, header=True, inferSchema=True)
df.show(n=5)

+-------+---+-------+--------+--------+-------+---+-----+------+------+----------+-------+--------+-----------+-------+-----------+---+------+------+--------+------+---+-----+--------+-------+-----+--------------+-----+-------+---------+---------+-------+------+------+---------+-----------+------+---------+---+--------+------+-----+-----+---+---+---+------+----------+--------+------+-------+-----------+-------+-------+---------+-------+--------+------+---+--------+-------+---------+-----+-----+--------+------+------+---------+--------+------+----------+-------+---------+--------+--------+---+-----+----+----+-----+---+-----+--------+-------+---+---------+----+------+------+-------+----+-----+---+------+------+---------+------+-------+------+------+-------+------+---------+--------+--------+---+------+------+-------+-----+------+----------+---+-------+-------+----+------+---+----+---+-------+----+-----+------+------------+----+----+-----+-----+------+--------+--------+---+-------+---+---

In [None]:
# Add a row-identifier column; it is used later to join the clustering results back onto the rows
row_id = monotonically_increasing_id()
df = df.withColumn('Id', row_id)

In [None]:
# Number of columns in the DataFrame (the word columns plus the added Id column)
len(df.columns)

1546

In [None]:
# Number of rows, i.e. the number of documents
df.count()

3430

In [None]:
# Strip every '.' from the column names. A plain string replacement is used
# because the previous regex pattern, written as a non-raw string literal,
# contained an invalid escape sequence that triggers a warning on recent Python versions.
dfcols = [c.replace(".", "") for c in df.columns]

In [None]:
# Rebuild the DataFrame so that it carries the cleaned column names
df = df.toDF(*dfcols)

In [None]:
# Feature columns: every column except the row identifier. Excluding 'Id' by
# name (rather than slicing off the last position) stays correct even if the
# column order changes.
cols = [c for c in df.columns if c != 'Id']

In [None]:
# Combine the feature columns into a single vector column named 'features'
assembler = VectorAssembler(inputCols=cols, outputCol='features')
output = assembler.transform(df)
# Keep only the row identifier and the assembled vector
feature_dataframe = output.select('id', 'features')
feature_dataframe.show(n=5)

+---+--------------------+
| id|            features|
+---+--------------------+
|  0|(1545,[17,22,26,7...|
|  1|(1545,[105,154,15...|
|  2|(1545,[5,11,25,30...|
|  3|(1545,[13,29,43,6...|
|  4|(1545,[22,27,43,5...|
+---+--------------------+
only showing top 5 rows



**Hierarchical Clustering: Bisecting KMeans**

In [None]:
# Bisecting K-Means estimator configured for 7 clusters
bkm = BisectingKMeans(
    k=7,
    distanceMeasure='euclidean',
    seed=123,
)

In [None]:
# Fit the bisecting K-Means model on the features DataFrame,
# then assign a cluster label to every row
model_bkm = bkm.fit(feature_dataframe)
predictions = model_bkm.transform(feature_dataframe)

In [None]:
# Score the clustering with the silhouette measure
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette with squared euclidean distance = {silhouette}")

Silhouette with squared euclidean distance = 0.12263982988163974


In [None]:
# Display the center vector of every cluster
centers = model_bkm.clusterCenters()

print("Cluster Centers: ")
for position in range(len(centers)):
    print(centers[position])

Cluster Centers: 
[0.0083682  0.02809325 0.01554094 ... 0.02390915 0.06933652 0.00956366]
[0.03396739 0.0298913  0.05027174 ... 0.04347826 0.01494565 0.03668478]
[0.03030303 0.06666667 0.05656566 ... 0.05656566 0.15353535 0.02424242]
[0.05851064 0.0212766  0.07446809 ... 0.04255319 0.30319149 0.        ]
[0.01030928 0.04123711 0.02061856 ... 0.03092784 0.01030928 0.01030928]
[0.01342282 0.02013423 0.01342282 ... 0.02684564 0.89261745 0.02013423]
[0.02173913 0.06521739 0.02173913 ... 0.         0.15217391 0.02173913]


In [None]:
# Display the first 20 rows together with their assigned cluster
predictions.show(n=20)

+---+--------------------+----------+
| id|            features|prediction|
+---+--------------------+----------+
|  0|(1545,[17,22,26,7...|         1|
|  1|(1545,[105,154,15...|         0|
|  2|(1545,[5,11,25,30...|         4|
|  3|(1545,[13,29,43,6...|         1|
|  4|(1545,[22,27,43,5...|         2|
|  5|(1545,[105,112,18...|         0|
|  6|(1545,[66,73,98,9...|         2|
|  7|(1545,[105,182,18...|         0|
|  8|(1545,[22,74,105,...|         0|
|  9|(1545,[105,154,15...|         0|
| 10|(1545,[23,45,96,1...|         0|
| 11|(1545,[5,11,25,30...|         6|
| 12|(1545,[11,25,30,5...|         4|
| 13|(1545,[60,105,109...|         0|
| 14|(1545,[21,23,36,6...|         1|
| 15|(1545,[18,19,23,4...|         3|
| 16|(1545,[22,55,64,7...|         0|
| 17|(1545,[22,55,64,7...|         0|
| 18|(1545,[10,41,49,5...|         0|
| 19|(1545,[10,13,49,5...|         1|
+---+--------------------+----------+
only showing top 20 rows



In [None]:
# Attach each row's cluster label to the original data, matching rows on the Id column created earlier
cluster_labels = predictions.select('Id', 'prediction')
df_pred = df.join(cluster_labels, on='Id')

In [None]:
# Preview the joined data; the prediction column is the last column
df_pred.show(n=5)

+---+-------+---+-------+--------+--------+-------+---+-----+------+------+----------+-------+--------+-----------+-------+-----------+---+------+------+--------+------+---+-----+--------+-------+-----+--------------+-----+-------+---------+---------+-------+------+------+---------+-----------+------+---------+---+--------+------+-----+-----+---+---+---+------+----------+--------+------+-------+-----------+-------+-------+---------+-------+--------+------+---+--------+-------+---------+-----+-----+--------+------+------+---------+--------+------+----------+-------+---------+--------+--------+---+-----+----+----+-----+---+-----+--------+-------+---+---------+----+------+------+-------+----+-----+---+------+------+---------+------+-------+------+------+-------+------+---------+--------+--------+---+------+------+-------+-----+------+----------+---+-------+-------+----+------+---+----+---+-------+----+-----+------+------------+----+----+-----+-----+------+--------+--------+---+-------+---

In [None]:
# Split the joined data into 7 DataFrames, one per cluster label
# (label 0 -> cluster 1, ..., label 6 -> cluster 7)
df_cluster1 = df_pred.where(df_pred['prediction'] == 0)
df_cluster2 = df_pred.where(df_pred['prediction'] == 1)
df_cluster3 = df_pred.where(df_pred['prediction'] == 2)
df_cluster4 = df_pred.where(df_pred['prediction'] == 3)
df_cluster5 = df_pred.where(df_pred['prediction'] == 4)
df_cluster6 = df_pred.where(df_pred['prediction'] == 5)
df_cluster7 = df_pred.where(df_pred['prediction'] == 6)

In [None]:
# Report how many observations fall into each cluster
cluster_frames = [
    df_cluster1, df_cluster2, df_cluster3, df_cluster4,
    df_cluster5, df_cluster6, df_cluster7,
]
for number, frame in enumerate(cluster_frames, start=1):
    print(f'The number of obsevations in culster {number} {frame.count()}')

The number of obsevations in culster 1 1673
The number of obsevations in culster 2 736
The number of obsevations in culster 3 495
The number of obsevations in culster 4 188
The number of obsevations in culster 5 97
The number of obsevations in culster 6 149
The number of obsevations in culster 7 92


**Frequent words of Hierarchical Cluster 1**

In [None]:
 # Mean count of every word over the documents of bisecting K-Means cluster 1
 mean_words_1 = df_cluster1.describe()
 mean_words_1 = mean_words_1.filter(mean_words_1['summary']=='mean')
 # Collect the single 'mean' row once; selecting and collecting each column
 # separately would launch one Spark job per word column.
 mean_row_1 = mean_words_1.collect()[0]
 list_words_1 = [(k, float(mean_row_1[k])) for k in df.columns if k not in ['summary','Id','prediction']]
 # Sort by mean count, highest first, and show the six most frequent words
 list_words_1.sort(key=lambda x: x[1], reverse=True)
 list_words_1[:6]


[('bush', 1.078302450687388),
 ('kerry', 0.9527794381350867),
 ('poll', 0.8487746563060371),
 ('democrat', 0.6120741183502689),
 ('general', 0.488344291691572),
 ('dean', 0.484160191273162)]

**Frequent words of Hierarchical Cluster 2**

In [None]:
 # Mean count of every word over the documents of bisecting K-Means cluster 2
 mean_words_2 = df_cluster2.describe()
 mean_words_2 = mean_words_2.filter(mean_words_2['summary']=='mean')
 # Collect the single 'mean' row once; selecting and collecting each column
 # separately would launch one Spark job per word column.
 mean_row_2 = mean_words_2.collect()[0]
 list_words_2 = [(k, float(mean_row_2[k])) for k in df.columns if k not in ['summary','Id','prediction']]
 # Sort by mean count, highest first, and show the six most frequent words
 list_words_2.sort(key=lambda x: x[1], reverse=True)
 list_words_2[:6]


[('democrat', 1.453804347826087),
 ('bush', 1.328804347826087),
 ('republican', 1.3233695652173914),
 ('state', 1.2404891304347827),
 ('iraq', 1.1807065217391304),
 ('elect', 1.0692934782608696)]

**Frequent words of Hierarchical Cluster 3**

In [None]:
 # Mean count of every word over the documents of bisecting K-Means cluster 3
 mean_words_3 = df_cluster3.describe()
 mean_words_3 = mean_words_3.filter(mean_words_3['summary']=='mean')
 # Collect the single 'mean' row once; selecting and collecting each column
 # separately would launch one Spark job per word column.
 mean_row_3 = mean_words_3.collect()[0]
 list_words_3 = [(k, float(mean_row_3[k])) for k in df.columns if k not in ['summary','Id','prediction']]
 # Sort by mean count, highest first, and show the six most frequent words
 list_words_3.sort(key=lambda x: x[1], reverse=True)
 list_words_3[:6]


[('bush', 7.602020202020202),
 ('kerry', 3.9515151515151516),
 ('poll', 1.8868686868686868),
 ('presided', 1.8868686868686868),
 ('iraq', 1.612121212121212),
 ('war', 1.3636363636363635)]

**Frequent words of Hierarchical Cluster 4**

In [None]:
 # Mean count of every word over the documents of bisecting K-Means cluster 4
 mean_words_4 = df_cluster4.describe()
 mean_words_4 = mean_words_4.filter(mean_words_4['summary']=='mean')
 # Collect the single 'mean' row once; selecting and collecting each column
 # separately would launch one Spark job per word column.
 mean_row_4 = mean_words_4.collect()[0]
 list_words_4 = [(k, float(mean_row_4[k])) for k in df.columns if k not in ['summary','Id','prediction']]
 # Sort by mean count, highest first, and show the six most frequent words
 list_words_4.sort(key=lambda x: x[1], reverse=True)
 list_words_4[:6]


[('democrat', 5.579787234042553),
 ('dean', 4.5212765957446805),
 ('kerry', 3.595744680851064),
 ('state', 2.984042553191489),
 ('parties', 2.845744680851064),
 ('republican', 2.6382978723404253)]

**Frequent words of Hierarchical Cluster 5**

In [None]:
 # Mean count of every word over the documents of bisecting K-Means cluster 5
 mean_words_5 = df_cluster5.describe()
 mean_words_5 = mean_words_5.filter(mean_words_5['summary']=='mean')
 # Collect the single 'mean' row once; selecting and collecting each column
 # separately would launch one Spark job per word column.
 mean_row_5 = mean_words_5.collect()[0]
 list_words_5 = [(k, float(mean_row_5[k])) for k in df.columns if k not in ['summary','Id','prediction']]
 # Sort by mean count, highest first, and show the six most frequent words
 list_words_5.sort(key=lambda x: x[1], reverse=True)
 list_words_5[:6]


[('november', 10.206185567010309),
 ('vote', 5.34020618556701),
 ('poll', 4.123711340206185),
 ('challenge', 4.010309278350515),
 ('republican', 2.4742268041237114),
 ('senate', 2.288659793814433)]

**Frequent words of Hierarchical Cluster 6**

In [None]:
 # Mean count of every word over the documents of bisecting K-Means cluster 6
 mean_words_6 = df_cluster6.describe()
 mean_words_6 = mean_words_6.filter(mean_words_6['summary']=='mean')
 # Collect the single 'mean' row once; selecting and collecting each column
 # separately would launch one Spark job per word column.
 mean_row_6 = mean_words_6.collect()[0]
 list_words_6 = [(k, float(mean_row_6[k])) for k in df.columns if k not in ['summary','Id','prediction']]
 # Sort by mean count, highest first, and show the six most frequent words
 list_words_6.sort(key=lambda x: x[1], reverse=True)
 list_words_6[:6]


[('november', 10.543624161073826),
 ('poll', 5.053691275167785),
 ('challenge', 4.100671140939597),
 ('democrat', 2.8322147651006713),
 ('vote', 2.5436241610738257),
 ('house', 2.4563758389261743)]

**Frequent words of Hierarchical Cluster 7**

In [None]:
 # Mean count of every word over the documents of bisecting K-Means cluster 7
 mean_words_7 = df_cluster7.describe()
 mean_words_7 = mean_words_7.filter(mean_words_7['summary']=='mean')
 # Collect the single 'mean' row once; selecting and collecting each column
 # separately would launch one Spark job per word column.
 mean_row_7 = mean_words_7.collect()[0]
 list_words_7 = [(k, float(mean_row_7[k])) for k in df.columns if k not in ['summary','Id','prediction']]
 # Sort by mean count, highest first, and show the six most frequent words
 list_words_7.sort(key=lambda x: x[1], reverse=True)
 list_words_7[:6]


[('november', 9.532608695652174),
 ('vote', 6.510869565217392),
 ('bush', 6.119565217391305),
 ('poll', 5.489130434782608),
 ('democrat', 4.956521739130435),
 ('republican', 4.282608695652174)]

**Kmeans Clustering** 

In [None]:
# K-Means estimator with k=7; cluster labels are written to the 'kmeansPrediction' column
km = KMeans(
    k=7,
    distanceMeasure='euclidean',
    seed=123,
    predictionCol='kmeansPrediction',
)

In [None]:
# Fit the K-Means model on the features DataFrame,
# then assign a cluster label to every row
model_km = km.fit(feature_dataframe)
predictions_km = model_km.transform(feature_dataframe)

In [None]:
# Score the K-Means clustering with the silhouette measure,
# reading the labels from the 'kmeansPrediction' column
evaluator_km = ClusteringEvaluator(predictionCol='kmeansPrediction')
silhouette_km = evaluator_km.evaluate(predictions_km)
print(f"Silhouette with squared euclidean distance = {silhouette_km}")

Silhouette with squared euclidean distance = 0.26533205615575045


In [None]:
# Display the first 20 rows together with their K-Means cluster label
predictions_km.show(n=20)

+---+--------------------+----------------+
| id|            features|kmeansPrediction|
+---+--------------------+----------------+
|  0|(1545,[17,22,26,7...|               1|
|  1|(1545,[105,154,15...|               1|
|  2|(1545,[5,11,25,30...|               2|
|  3|(1545,[13,29,43,6...|               1|
|  4|(1545,[22,27,43,5...|               6|
|  5|(1545,[105,112,18...|               1|
|  6|(1545,[66,73,98,9...|               6|
|  7|(1545,[105,182,18...|               1|
|  8|(1545,[22,74,105,...|               1|
|  9|(1545,[105,154,15...|               1|
| 10|(1545,[23,45,96,1...|               1|
| 11|(1545,[5,11,25,30...|               2|
| 12|(1545,[11,25,30,5...|               2|
| 13|(1545,[60,105,109...|               1|
| 14|(1545,[21,23,36,6...|               0|
| 15|(1545,[18,19,23,4...|               0|
| 16|(1545,[22,55,64,7...|               1|
| 17|(1545,[22,55,64,7...|               1|
| 18|(1545,[10,41,49,5...|               1|
| 19|(1545,[10,13,49,5...|      

In [None]:
# Attach each row's K-Means cluster label to the original data, matching rows on the Id column
km_labels = predictions_km.select('Id', 'kmeansPrediction')
df_pred_km = df.join(km_labels, on='Id')

In [None]:
# Preview the joined data; the K-Means prediction column is the last column
df_pred_km.show(n=5)

+---+-------+---+-------+--------+--------+-------+---+-----+------+------+----------+-------+--------+-----------+-------+-----------+---+------+------+--------+------+---+-----+--------+-------+-----+--------------+-----+-------+---------+---------+-------+------+------+---------+-----------+------+---------+---+--------+------+-----+-----+---+---+---+------+----------+--------+------+-------+-----------+-------+-------+---------+-------+--------+------+---+--------+-------+---------+-----+-----+--------+------+------+---------+--------+------+----------+-------+---------+--------+--------+---+-----+----+----+-----+---+-----+--------+-------+---+---------+----+------+------+-------+----+-----+---+------+------+---------+------+-------+------+------+-------+------+---------+--------+--------+---+------+------+-------+-----+------+----------+---+-------+-------+----+------+---+----+---+-------+----+-----+------+------------+----+----+-----+-----+------+--------+--------+---+-------+---

In [None]:
# Split the joined data into 7 DataFrames, one per K-Means cluster label
# (label 0 -> cluster 1, ..., label 6 -> cluster 7)
df_km_cluster1 = df_pred_km.where(df_pred_km['kmeansPrediction'] == 0)
df_km_cluster2 = df_pred_km.where(df_pred_km['kmeansPrediction'] == 1)
df_km_cluster3 = df_pred_km.where(df_pred_km['kmeansPrediction'] == 2)
df_km_cluster4 = df_pred_km.where(df_pred_km['kmeansPrediction'] == 3)
df_km_cluster5 = df_pred_km.where(df_pred_km['kmeansPrediction'] == 4)
df_km_cluster6 = df_pred_km.where(df_pred_km['kmeansPrediction'] == 5)
df_km_cluster7 = df_pred_km.where(df_pred_km['kmeansPrediction'] == 6)

In [None]:
# Report how many observations fall into each K-Means cluster
km_cluster_frames = [
    df_km_cluster1, df_km_cluster2, df_km_cluster3, df_km_cluster4,
    df_km_cluster5, df_km_cluster6, df_km_cluster7,
]
for number, frame in enumerate(km_cluster_frames, start=1):
    print(f'The number of obsevations in culster {number} {frame.count()}')

The number of obsevations in culster 1 372
The number of obsevations in culster 2 2084
The number of obsevations in culster 3 330
The number of obsevations in culster 4 41
The number of obsevations in culster 5 13
The number of obsevations in culster 6 163
The number of obsevations in culster 7 427


**Frequent words of KMeans cluster 1**

In [None]:
 # Mean count of every word over the documents of K-Means cluster 1
 mean_words_km_1 = df_km_cluster1.describe()
 mean_words_km_1 = mean_words_km_1.filter(mean_words_km_1['summary']=='mean')
 # Collect the single 'mean' row once; selecting and collecting each column
 # separately would launch one Spark job per word column.
 mean_row_km_1 = mean_words_km_1.collect()[0]
 list_words_km_1 = [(k, float(mean_row_km_1[k])) for k in df.columns if k not in ['summary','Id','prediction']]
 # Sort by mean count, highest first, and show the six most frequent words
 list_words_km_1.sort(key=lambda x: x[1], reverse=True)
 list_words_km_1[:6]

[('democrat', 2.956989247311828),
 ('republican', 2.78494623655914),
 ('state', 1.924731182795699),
 ('elect', 1.913978494623656),
 ('parties', 1.6908602150537635),
 ('senate', 1.5806451612903225)]

**Frequent words of KMeans cluster 2**

In [None]:
 # Mean count of every word over the documents of K-Means cluster 2
 mean_words_km_2 = df_km_cluster2.describe()
 mean_words_km_2 = mean_words_km_2.filter(mean_words_km_2['summary']=='mean')
 # Collect the single 'mean' row once; selecting and collecting each column
 # separately would launch one Spark job per word column.
 mean_row_km_2 = mean_words_km_2.collect()[0]
 list_words_km_2 = [(k, float(mean_row_km_2[k])) for k in df.columns if k not in ['summary','Id','prediction']]
 # Sort by mean count, highest first, and show the six most frequent words
 list_words_km_2.sort(key=lambda x: x[1], reverse=True)
 list_words_km_2[:6]

[('bush', 1.248560460652591),
 ('kerry', 0.755278310940499),
 ('poll', 0.6799424184261037),
 ('iraq', 0.6223608445297505),
 ('war', 0.5911708253358925),
 ('democrat', 0.5873320537428023)]

**Frequent words of KMeans cluster 3**

In [None]:
 # Mean count of every word over the documents of K-Means cluster 3
 mean_words_km_3 = df_km_cluster3.describe()
 mean_words_km_3 = mean_words_km_3.filter(mean_words_km_3['summary']=='mean')
 # Collect the single 'mean' row once; selecting and collecting each column
 # separately would launch one Spark job per word column.
 mean_row_km_3 = mean_words_km_3.collect()[0]
 list_words_km_3 = [(k, float(mean_row_km_3[k])) for k in df.columns if k not in ['summary','Id','prediction']]
 # Sort by mean count, highest first, and show the six most frequent words
 list_words_km_3.sort(key=lambda x: x[1], reverse=True)
 list_words_km_3[:6]

[('november', 10.36969696969697),
 ('poll', 4.863636363636363),
 ('vote', 4.4393939393939394),
 ('challenge', 4.127272727272727),
 ('bush', 3.081818181818182),
 ('democrat', 2.8666666666666667)]

**Frequent words of KMeans cluster 4**

In [None]:
 # Mean count of every word over the documents of K-Means cluster 4
 mean_words_km_4 = df_km_cluster4.describe()
 mean_words_km_4 = mean_words_km_4.filter(mean_words_km_4['summary']=='mean')
 # Collect the single 'mean' row once; selecting and collecting each column
 # separately would launch one Spark job per word column.
 mean_row_km_4 = mean_words_km_4.collect()[0]
 list_words_km_4 = [(k, float(mean_row_km_4[k])) for k in df.columns if k not in ['summary','Id','prediction']]
 # Sort by mean count, highest first, and show the six most frequent words
 list_words_km_4.sort(key=lambda x: x[1], reverse=True)
 list_words_km_4[:6]

[('democrat', 15.21951219512195),
 ('parties', 6.365853658536586),
 ('republican', 6.195121951219512),
 ('state', 5.146341463414634),
 ('senate', 4.2682926829268295),
 ('seat', 4.097560975609756)]

**Frequent words of KMeans cluster 5**

In [None]:
 # Mean count of every word over the documents of K-Means cluster 5
 mean_words_km_5 = df_km_cluster5.describe()
 mean_words_km_5 = mean_words_km_5.filter(mean_words_km_5['summary']=='mean')
 # Collect the single 'mean' row once; selecting and collecting each column
 # separately would launch one Spark job per word column.
 mean_row_km_5 = mean_words_km_5.collect()[0]
 list_words_km_5 = [(k, float(mean_row_km_5[k])) for k in df.columns if k not in ['summary','Id','prediction']]
 # Sort by mean count, highest first, and show the six most frequent words
 list_words_km_5.sort(key=lambda x: x[1], reverse=True)
 list_words_km_5[:6]

[('marriage', 6.769230769230769),
 ('amendment', 5.153846153846154),
 ('gay', 4.923076923076923),
 ('bush', 4.076923076923077),
 ('state', 3.6153846153846154),
 ('vote', 3.3846153846153846)]

**Frequent words of KMeans cluster 6**

In [None]:
 # Mean count of every word over the documents of K-Means cluster 6
 mean_words_km_6 = df_km_cluster6.describe()
 mean_words_km_6 = mean_words_km_6.filter(mean_words_km_6['summary']=='mean')
 # Collect the single 'mean' row once; selecting and collecting each column
 # separately would launch one Spark job per word column.
 mean_row_km_6 = mean_words_km_6.collect()[0]
 list_words_km_6 = [(k, float(mean_row_km_6[k])) for k in df.columns if k not in ['summary','Id','prediction']]
 # Sort by mean count, highest first, and show the six most frequent words
 list_words_km_6.sort(key=lambda x: x[1], reverse=True)
 list_words_km_6[:6]

[('dean', 7.466257668711656),
 ('kerry', 5.214723926380368),
 ('clark', 2.950920245398773),
 ('edward', 2.785276073619632),
 ('democrat', 2.576687116564417),
 ('poll', 2.3987730061349692)]

**Frequent words of KMeans cluster 7**

In [None]:
 # Mean count of every word over the documents of K-Means cluster 7
 mean_words_km_7 = df_km_cluster7.describe()
 mean_words_km_7 = mean_words_km_7.filter(mean_words_km_7['summary']=='mean')
 # Collect the single 'mean' row once; selecting and collecting each column
 # separately would launch one Spark job per word column.
 mean_row_km_7 = mean_words_km_7.collect()[0]
 list_words_km_7 = [(k, float(mean_row_km_7[k])) for k in df.columns if k not in ['summary','Id','prediction']]
 # Sort by mean count, highest first, and show the six most frequent words
 list_words_km_7.sort(key=lambda x: x[1], reverse=True)
 list_words_km_7[:6]

[('bush', 8.355971896955504),
 ('kerry', 4.19672131147541),
 ('presided', 1.9250585480093676),
 ('poll', 1.901639344262295),
 ('iraq', 1.6604215456674474),
 ('administration', 1.3559718969555035)]

**Answering questions based on above observations for Hierarchical Clustering**

* Which cluster has the most observations?

Cluster 1

* Which cluster has the fewest observations?

Cluster 7

* Which cluster could best be described as the cluster related to the Iraq war?

In cluster 3, Iraq is the 5th most frequent word and war is the 6th most frequent word, so the answer to this question is cluster 3.

* In 2004, one of the candidates for the Democratic nomination for the President of the United States
was Howard Dean, John Kerry was the candidate who won the Democratic nomination, and John
Edwards was the running mate of John Kerry (the Vice President nominee). Given this information,
which cluster best corresponds to the Democratic party?

In cluster 4, democrat, dean and kerry are the most frequent words, which tells us that cluster 4 best corresponds to the Democratic party.

**Answering questions based on above observations for KMeans Clustering**

* How many observations are in Cluster 3?

330

* Which cluster has the most observations?

Cluster 2

* Which cluster has the fewest number of observations?

Cluster 5

* Which k-means cluster best corresponds to the Iraq War?

In cluster 2, Iraq is the 4th most frequent word and war is the 5th most frequent word, so the answer to this question is cluster 2.

* Which k-means cluster best corresponds to the democratic party? (Remember that we are looking
for the names of the key democratic party leaders.)

Cluster 6 has kerry, clark, dean and edward among its most frequently occurring words.

**Compare how observations were assigned to clusters in the two different methods. Compare the
cluster assignment of Bisecting k-means clustering to the cluster assignment of k-means clustering. Which
Bisecting k-means Cluster best corresponds to K-Means Cluster 3?**

Cluster 6 of Bisecting K-Means has november, poll, challenge, democrat, vote and house as its most frequently occurring words; most of these are also among the most frequently occurring words in cluster 3 of K-Means.