## Exercise 5.4

### Imports

In [682]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
from string import punctuation
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import AgglomerativeClustering
import pandas as pd

In [683]:
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/vilmatiainen/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vilmatiainen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Data

In [684]:
# Load data
data_path = 'data/scopusabstracts.csv'
reader = open(data_path, 'r', encoding='utf-8')
reader.readline() # skip header
lines = reader.readlines()
text = [i.split('#')[1] + ' ' + i.split('#')[2] for i in lines]

### Data preprocessing

In [685]:
# Run baseline solution. Set False for improved solution
baseline = False
# Run failed attempts at improvement
failed2 = False # LSA

# tokenization
tokens_list = [word_tokenize(i) for i in text]

# lowercasing
lc_tokens_list = []    
for i in tokens_list: 
    lc_tokens_list.append([token.lower() for token in i])

# removing stopwords, punctuation, and numbers
stop_words = set(stopwords.words('english'))
stop_words.update(punctuation)
stop_words.add("...")

if not baseline:
    # Adding stopwords which were problematic in later steps
    stop_words.add("also")
    stop_words.update(["use", "used", "uses", "using"])
    stop_words.update(["paper", "papers"])
    stop_words.update(["result", "results", "resulted", "resulting"])
    stop_words.update(["base", "based"])

filtered_sentence = []
for i in lc_tokens_list: 
    filtered_sentence.append([token for token in i if token not in stop_words])
filtered_sentence = [' '.join(i) for i in filtered_sentence]
filtered_sentence = [re.sub(r'\d+', '', sentence) for sentence in filtered_sentence]

if not baseline:
    # Transform hyphenated terms like state-of-the-art to stateoftheart and remove '-based' from terms like 'CS-based'
    filtered_sentence = [t.replace('-based', '').replace('-', '') for t in filtered_sentence]

# stemming
porter = PorterStemmer()
stemmed_tokens_list = []
for i in filtered_sentence:
	stemmed_tokens_list.append([porter.stem(j) for j in i.split()])

### Preview preprocessed data

In [686]:
# show processed data
print('First ten rows of data after preprocessing:')
for i in stemmed_tokens_list[:10]:
	for j in i:
		print(j,end=" ")
	print(" ")

# number of tokens
uniques = np.unique([tok for doc in stemmed_tokens_list for tok in doc])
print("\nThe number of tokens after preprocessing is {}.".format(len(uniques)))

# check most frequent words
listofall = [item for elem in stemmed_tokens_list for item in elem]
freq = FreqDist(listofall)
wnum = freq.B()
print("\nMost common words (total %d):"%wnum)
print(freq.most_common(100))

First ten rows of data after preprocessing:
anomali detect wide area imageri geniş alan görüntülerind anomali tespiti studi detect anomali wide area imageri collect aircraft set anomali identifi anyth normal cours action purpos two differ data set experi carri data set anomali detect convolut neural network model tri gener next imag past imag design imag preprocess given model anomali detect perform compar estim imag true imag  
person reidentif deep kroneckerproduct match groupshuffl random walk person reidentif reid aim robustli measur visual affin person imag wide applic intellig surveil associ person imag across multipl camera gener treat imag retriev problem given probe person imag affin probe imag galleri imag pg affin rank retriev galleri imag exist two main challeng effect solv problem person imag usual show signific variat differ person pose view angl spatial layout correspond person imag therefor vital inform tackl problem stateoftheart method either ignor spatial variat util

### tf-idf

In [687]:
cleaned_documents = [ ' '.join(i) for i in stemmed_tokens_list]
ngram_range = (1, 2)
if not baseline: # Add trigrams
  ngram_range = (1, 3)
tfidf_vectorizer = TfidfVectorizer(smooth_idf = False, ngram_range = ngram_range , min_df = 5, max_df = 0.8, norm='l2')

tfidf_vectorizer.fit(cleaned_documents)
tf_idf_vectors = tfidf_vectorizer.transform(cleaned_documents)

# Run LSA (worsens results so failed attempt)
if not baseline and failed2:
  lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))
  tf_idf_vectors = csr_matrix(lsa.fit_transform(tf_idf_vectors))

print(f"\nThe number of samples is {tf_idf_vectors.shape[0]} and the number of features is {tf_idf_vectors.shape[1]}.")


The number of samples is 1143 and the number of features is 3654.


### Buckshot

In [688]:
def buckshot(X, n_clusters, random_state):
  n_seeds = int(np.sqrt(X.shape[0]*n_clusters))
  idx = random_state.randint(X.shape[0], size=n_seeds)
  agg_data = X.toarray()[idx, :]
  agg = AgglomerativeClustering(n_clusters=n_clusters, metric='cosine', linkage='complete').fit(agg_data)
  
  # Use pandas to group by labels and get means
  agg_df = pd.DataFrame(agg_data)
  agg_df['label'] = agg.labels_

  means_df = agg_df.groupby('label').mean()
  return means_df.values # return numpy array
  

### Finding best K for clustering

In [689]:
k_values = range(3, 11)
db_scores = []
labels_list = []

for k in k_values:
    if baseline:
        init = 'k-means++'
    else:
        init = buckshot
    kmeans = KMeans(n_clusters = k, random_state = 42, init=init)
    labels = kmeans.fit_predict(tf_idf_vectors)
    labels_list.append(labels)
    db_score = davies_bouldin_score(tf_idf_vectors.toarray(), labels)
    db_scores.append(db_score)

i_best = np.argmin(db_scores)
best_k = k_values[i_best]
print(f"The best Davies-Bouldin index is {min(db_scores):.4f} with K = {best_k}.")

The best Davies-Bouldin index is 5.7921 with K = 7.


### Clustering results

In [690]:
best_labels = labels_list[i_best]
features = tfidf_vectorizer.inverse_transform(tf_idf_vectors)
clusters = {}

for label, feature in zip(best_labels, features):
    if label not in clusters:
        clusters[label] = []
    clusters[label].extend(feature)

clusters = dict(sorted(clusters.items()))

# Join similar clusters
if not baseline:
    thresh = 15
    merger = set()
    # Calculate percentage overlap of topical words between all clusters and add to merger if they are above thresh
    for i in clusters.keys():
        for j in clusters.keys():
            if i != j:
                set1 = set([w[0] for w in FreqDist([str(item) for item in clusters[i] if len(item.split()) == 1]).most_common(10)])
                set1.update(set([w[0] for w in FreqDist([str(item) for item in clusters[i] if len(item.split()) == 2]).most_common(10)]))
                set1.update(set([w[0] for w in FreqDist([str(item) for item in clusters[i] if len(item.split()) == 3]).most_common(10)]))
                set2 = set([w[0] for w in FreqDist([str(item) for item in clusters[j] if len(item.split()) == 1]).most_common(10)])
                set2.update(set([w[0] for w in FreqDist([str(item) for item in clusters[j] if len(item.split()) == 2]).most_common(10)]))
                set2.update(set([w[0] for w in FreqDist([str(item) for item in clusters[j] if len(item.split()) == 3]).most_common(10)]))
                intersection = set1.intersection(set2)
                union = set1.union(set2)
                if (len(intersection) / len(union)) * 100 >= thresh:
                    merger.add(frozenset([i, j]))
    # Go through clusters to be merged and merge them
    for i in list(merger):
        pair = list(i)
        c1 = pair[0]
        c2 = pair[1]
        if (c1 in clusters) and (c2 in clusters):
            if (c1 < c2):
                clusters[c1].extend(clusters[c2])
                clusters.pop(c2)
    # Rename clusters to remove gaps from merged clusters
    clust_name = 0
    for clust in list(clusters.keys()):
        clusters[clust_name] = clusters.pop(clust)
        clust_name += 1


for cluster in clusters:
    unigrams = [str(item) for item in clusters[cluster] if len(item.split()) == 1]
    bigrams = [str(item) for item in clusters[cluster] if len(item.split()) == 2]

    unigram_freq = FreqDist(unigrams)
    bigram_freq = FreqDist(bigrams)

    print(f"The most frequent unigrams and bigrams for cluster {cluster}.")
    print(f"Unigrams: {unigram_freq.most_common(10)}")
    if not baseline:
        trigrams = [str(item) for item in clusters[cluster] if len(item.split()) == 3]
        trigram_freq = FreqDist(trigrams)
        print(f"Bigrams: {bigram_freq.most_common(10)}")
        print(f"Trigrams: {trigram_freq.most_common(10)}\n")
    else:
        print(f"Bigrams: {bigram_freq.most_common(10)}\n")

The most frequent unigrams and bigrams for cluster 0.
Unigrams: [('propos', 214), ('method', 210), ('comput', 198), ('imag', 189), ('vision', 187), ('perform', 178), ('model', 177), ('detect', 159), ('learn', 158), ('system', 157)]
Bigrams: [('comput vision', 153), ('neural network', 92), ('deep learn', 84), ('convolut neural', 63), ('propos method', 53), ('machin learn', 42), ('object detect', 31), ('network cnn', 26), ('artifici intellig', 25), ('experiment show', 23)]
Trigrams: [('convolut neural network', 63), ('neural network cnn', 26), ('deep learn model', 16), ('comput vision techniqu', 15), ('comput vision cv', 12), ('deep learn method', 11), ('unman aerial vehicl', 11), ('deep convolut neural', 11), ('comput vision system', 11), ('comput vision method', 10)]

The most frequent unigrams and bigrams for cluster 1.
Unigrams: [('databas', 165), ('relat', 137), ('data', 133), ('system', 97), ('propos', 75), ('queri', 75), ('inform', 71), ('approach', 69), ('model', 64), ('process',

### Topics for each cluster based on baseline results
##### Cluster 0: Compiling, paper and computing (unclear)
##### Cluster 1: Computer vision and neural networks
##### Cluster 2: Security, cryptography, and encryption
##### Cluster 3: Internet of Things (IoT) and data security
##### Cluster 4: Programming languages (unclear)
##### Cluster 5: Robotics (unclear)
##### Cluster 6: Databases and information systems, specifcially relational databases

### Topics for each cluster based on improved results
##### Cluster 0: Computer vision and neural networks
##### Cluster 1: Databases and information systems, specifically relational databases
##### Cluster 2: Quantum computing and cryptography
##### Cluster 3: Robotics and machine learning
##### Cluster 4: Programming languages and coding
##### Cluster 5: Internet of things (IoT) and security/encryption