## Import Libraries 



In [27]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Exploring Database

In [14]:
df = pd.read_csv("/content/drive/MyDrive/School [ALU]/Computer Science/Y_4/TERM2/Data Mining/Clustering_news_articles/dailymail_scrapped_articles_.csv")

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       50 non-null     int64 
 1   Article Title    50 non-null     object
 2   Article Link     50 non-null     object
 3   Article Content  50 non-null     object
dtypes: int64(1), object(3)
memory usage: 1.7+ KB


# Data Pre-processing





In [17]:
documents = df['Article Content'].values.astype("U")

In [18]:
# Vectoring 
vectorizer = TfidfVectorizer(stop_words='english')
features = vectorizer.fit_transform(documents)

# Clustering

In [19]:
# Clustering
k = 5
model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
model.fit(features)

KMeans(max_iter=100, n_clusters=5, n_init=1)

In [25]:
# Adding new collumn in df for clusters
df['cluster'] = model.labels_
df.head(10)

Unnamed: 0.1,Unnamed: 0,Article Title,Article Link,Article Content,cluster
0,0,Facebook loses users for the first time EVER: ...,https://www.dailymail.co.uk/news/article-10471...,Facebook lost daily users for the first time i...,3
1,1,Jeff Zucker and girlfriend Allison Gollust bot...,https://www.dailymail.co.uk/news/article-10471...,CNN's president Jeff Zucker and his girlfriend...,2
2,2,Rudy Giuliani is revealed as star of The Maske...,https://www.dailymail.co.uk/tvshowbiz/article-...,Rudy Giuliani caused quite a stiras he was unm...,4
3,3,Manhattan public school principal CANCELS perf...,https://www.dailymail.co.uk/news/article-10470...,A middle school in Manhattan has canceled its ...,1
4,4,Covid symptoms appear just two days after bein...,https://www.dailymail.co.uk/news/article-10470...,Coronavirussymptoms appear just two days after...,4
5,5,GoFundMe FREEZES Canadian 'Freedom Convoy' pag...,https://www.dailymail.co.uk/news/article-10470...,A GoFundMe campaign that was organized to supp...,1
6,6,Georgetown Law is accused of hypocrisy after f...,https://www.dailymail.co.uk/news/article-10470...,Georgetown Law has been accused of having a do...,0
7,7,A REAL limousine liberal: Democrat Amy Klobuch...,https://www.dailymail.co.uk/news/article-10470...,A 'folksy' midwestern senator used campaign ca...,4
8,8,Tucker Carlson lashes Baltimore schools for de...,https://www.dailymail.co.uk/news/article-10470...,Fox News host Tucker Carlson swatted away excu...,4
9,9,Only plausible explanation for mysterious Hava...,https://www.dailymail.co.uk/news/article-10471...,U.S Intelligence officials examining Havana Sy...,4


# Evaluation


In [26]:
# See how the clustered articles are related by checking the centroids
print("Cluster centroids: \n")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

for i in range(k):
    print("Cluster %d:" % i)
    for j in order_centroids[i, :6]: #print out 10 feature terms of each cluster
        print (' %s' % terms[j])
    print('------------')

Cluster centroids: 

Cluster 0:
 university
 school
 state
 ekren
 fluke
 georgetown
------------
Cluster 1:
 mostofsky
 cincinelli
 williams
 bieber
 said
 police
------------
Cluster 2:
 zucker
 jeff
 cnn
 cuomo
 allison
 gollust
------------
Cluster 3:
 rogan
 spotify
 young
 music
 meta
 podcast
------------
Cluster 4:
 said
 russia
 ukraine
 cases
 nato
 russian
------------




In [28]:
# Get csv files for each cluster
clusters = df.groupby('cluster')    

for cluster in clusters.groups:
    f = open('cluster'+str(cluster)+ '.csv', 'w') # create csv file
    data = clusters.get_group(cluster)[['Article Link','Article Title']] # get title and overview columns
    f.write(data.to_csv(index_label='id')) # set index to id
    f.close()