## Exercise 5.4

### Imports

In [1]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
from string import punctuation
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score

In [2]:
# Fetch the NLTK resources used further down; the log output shows that nothing is
# re-downloaded when a package is already up to date.
# NOTE(review): 'punkt_tab' is presumably the tokenizer data needed by word_tokenize — confirm.
nltk.download('punkt_tab')
# Stop-word lists, read later through stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lmalv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lmalv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Data

In [3]:
# Load data
# Each record is '#'-delimited; fields 1 and 2 are concatenated into one text per document.
# NOTE(review): presumably these two fields are title and abstract — confirm against the CSV header.
data_path = 'data/scopusabstracts.csv'

# The context manager closes the file handle even if reading raises.
with open(data_path, 'r', encoding='utf-8') as reader:
    reader.readline()  # skip header
    lines = reader.readlines()

# Split every record only once; blank lines (e.g. a trailing empty line) carry no
# fields and are skipped instead of raising IndexError.
fields_per_line = [line.split('#') for line in lines if line.strip()]
text = [fields[1] + ' ' + fields[2] for fields in fields_per_line]

### Data preprocessing

In [4]:
# Step 1 - tokenization: one token list per document
tokens_list = list(map(word_tokenize, text))

# Step 2 - lowercasing of every token
lc_tokens_list = [[tok.lower() for tok in doc_tokens] for doc_tokens in tokens_list]

# Step 3 - removing stopwords, punctuation, and numbers
# The stop list is the English stop words plus each single punctuation character
# plus the literal ellipsis token.
stop_words = set(stopwords.words('english')) | set(punctuation) | {"..."}
# Surviving tokens are re-joined into one string per document, then every run of
# digits is deleted from that string.
filtered_sentence = [
    re.sub(r'\d+', '', ' '.join(tok for tok in doc_tokens if tok not in stop_words))
    for doc_tokens in lc_tokens_list
]

# Step 4 - stemming: split on whitespace again and stem each piece
porter = PorterStemmer()
stemmed_tokens_list = [
    list(map(porter.stem, sentence.split()))
    for sentence in filtered_sentence
]

### Preview preprocessed data

In [5]:
# Preview: the first ten preprocessed documents, one document per output line
print('First ten rows of data after preprocessing:')
for doc in stemmed_tokens_list[:10]:
    # every token is followed by one space, and the row is closed with one more space
    print("".join(tok + " " for tok in doc) + " ")

# Flatten the corpus into a single stream of stemmed tokens
listofall = [tok for doc in stemmed_tokens_list for tok in doc]

# Count the distinct tokens
uniques = np.unique(listofall)
print("\nThe number of tokens after preprocessing is {}.".format(len(uniques)))

# Frequency distribution over the whole corpus; B() is its number of bins
freq = FreqDist(listofall)
wnum = freq.B()
print("\nMost common words (total %d):" % wnum)
print(freq.most_common(100))

First ten rows of data after preprocessing:
anomali detect wide area imageri geniş alan görüntülerind anomali tespiti studi detect anomali wide area imageri collect aircraft set anomali identifi anyth normal cours action purpos two differ data set use experi carri data set anomali detect convolut neural network model tri gener next imag use past imag design imag pre-process given model anomali detect perform compar estim imag true imag  
person re-identif deep kronecker-product match group-shuffl random walk person re-identif re-id aim robustli measur visual affin person imag wide applic intellig surveil associ person imag across multipl camera gener treat imag retriev problem given probe person imag affin probe imag galleri imag pg affin use rank retriev galleri imag exist two main challeng effect solv problem person imag usual show signific variat differ person pose view angl spatial layout correspond person imag therefor vital inform tackl problem state-of-the-art method either igno

### TF-IDF

In [None]:
# Re-join the stemmed tokens into one whitespace-separated string per document,
# which is the input the vectorizer is given.
cleaned_documents = [' '.join(doc) for doc in stemmed_tokens_list]

# Unigrams and bigrams, no IDF smoothing; min_df / max_df are passed as fractions
# (0.01 and 0.95) to bound the document frequency of the terms that are kept.
tfidf_vectorizer = TfidfVectorizer(smooth_idf=False, ngram_range=(1, 2), min_df=0.01, max_df=0.95)

# Learn the vocabulary/IDF weights and build the document-term matrix in one pass
# instead of a separate fit() followed by transform() over the same corpus.
tf_idf_vectors = tfidf_vectorizer.fit_transform(cleaned_documents)

print(f"\nThe number of samples is {tf_idf_vectors.shape[0]} and the number of features is {tf_idf_vectors.shape[1]}.")


The number of samples is 1143 and the number of features is 1656.


### Finding best K for clustering

In [7]:
# Candidate numbers of clusters: K = 3 .. 10
k_values = range(3, 11)
db_scores = []

# The Davies-Bouldin score is computed on a dense array; the conversion does not
# depend on K, so do it once instead of on every loop iteration.
dense_vectors = tf_idf_vectors.toarray()

for k in k_values:
    # NOTE(review): n_init is left at the library default, which has changed between
    # scikit-learn releases — consider pinning it so results are reproducible.
    kmeans = KMeans(n_clusters = k, random_state = 42)
    labels = kmeans.fit_predict(tf_idf_vectors)

    db_score = davies_bouldin_score(dense_vectors, labels)
    db_scores.append(db_score)

# Select the K whose Davies-Bouldin index is smallest, and report the score found
# at that same position so the printed K and score always belong together.
best_index = int(np.argmin(db_scores))
best_k = k_values[best_index]
print(f"The best Davies-Bouldin index is {db_scores[best_index]:.4f} with K = {best_k}.")

The best Davies-Bouldin index is 5.2867 with K = 9.


### Assessing the clustering