In [1]:
# coding: utf-8
# author Zhang Jun 2021 (with some later modfications)
# Example how to preprocess raw text data. This is not complete/optimized,
# but you can you use the code as a skeleton for your own program.

# To be able to run this code, make sure you have the packages NLTK and scikit-learn installed.
# If not, you can install them with pip via command "pip3 install scikit-learn nltk"
# Load the required packages that are used in this example.

# Edit / Kerkko: added nltk downloads to run script. Our text list is now the list of titles and abstracts.

In [9]:
import re
import string

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import pandas as pd

import nltk

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\iliaz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iliaz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
# Load raw data
data_path = 'scopusabstracts.txt'

reader = open(data_path, 'r', encoding='utf-8')
lines = reader.readlines()

# Extract the text (title + abstract) from each line
titles = [i.split('#')[1] for i in lines[1:]]
texts = [i.split('#')[2] for i in lines[1:]]

# removing foreign language titles
titles = [re.sub(r'\[+.+]+$', '', title.strip()) for title in titles]

# concatenating title to abstract
texts = [titles[i] + ' ' + texts[i] for i in range(len(titles))]

# uncomment this line and run the file again to get TF-IDF and clustering on just the titles
# texts = titles  # un

# some examples
print('First documents:')
for i in texts[:10]:
    print(i)
print()

First documents:
Anomaly detection in wide area imagery  This study is about detecting anomalies in wide area imagery collected from an aircraft. The set of anomalies have been identified as anything out of the normal course of action. For this purpose, two different data sets were used and the experiments were carried out on these data sets. For anomaly detection, a convolutional neural network model that tries to generate the next image using past images is designed. The images were pre-processed before being given to the model. Anomaly detection is performed by comparing the estimated image and the true image. 

Person re-identification with deep kronecker-product matching and group-shuffling random walk Person re-identification (re-ID) aims to robustly measure visual affinities between person images. It has wide applications in intelligent surveillance by associating same persons' images across multiple cameras. It is generally treated as an image retrieval problem: Given a probe p

In [31]:
# Preprocessing

# Step 1: tokenization and lowercasing
tokens_list = [word_tokenize(i) for i in texts]

lc_tokens_list = []
for i in tokens_list:
    lc_tokens_list.append([token.lower() for token in i])

print('After tokenization and lowercasing:')
for i in lc_tokens_list[:10]:
    print(i)
print()

# original number of tokens
uniques = np.unique([tok for doc in lc_tokens_list for tok in doc])
print("Original number of tokens: {}\n".format(len(uniques)))

# Steps 2 and 3: remove stop words and punctuation
stop_words = set(stopwords.words('english'))
print('NLTK stopwords:')
print(stop_words)
print()

# Here we include the punctuation in the stop words set. There are alternative
# ways to remove punctuation.
stop_words.update(string.punctuation)
stop_words.add("...")

# you can check updated stopwords
# print(stop_words)

filtered_sentence = []
for i in lc_tokens_list:
    filtered_sentence.append([token for token in i if token not in stop_words])

# Numbers are also removed
filtered_sentence = [' '.join(i) for i in filtered_sentence]
filtered_sentence = [re.sub(r'\d+', '', sentence) for sentence in filtered_sentence]

# number of tokens
uniques = np.unique([tok for doc in filtered_sentence for tok in doc.split()])
print("Number of tokens after stopword and punctuation removal: {}\n".format(len(uniques)))

print('After removing stop words, punctuation and numbers:')
for i in filtered_sentence[:10]:
    print(i)
print()

After tokenization and lowercasing:
['anomaly', 'detection', 'in', 'wide', 'area', 'imagery', 'this', 'study', 'is', 'about', 'detecting', 'anomalies', 'in', 'wide', 'area', 'imagery', 'collected', 'from', 'an', 'aircraft', '.', 'the', 'set', 'of', 'anomalies', 'have', 'been', 'identified', 'as', 'anything', 'out', 'of', 'the', 'normal', 'course', 'of', 'action', '.', 'for', 'this', 'purpose', ',', 'two', 'different', 'data', 'sets', 'were', 'used', 'and', 'the', 'experiments', 'were', 'carried', 'out', 'on', 'these', 'data', 'sets', '.', 'for', 'anomaly', 'detection', ',', 'a', 'convolutional', 'neural', 'network', 'model', 'that', 'tries', 'to', 'generate', 'the', 'next', 'image', 'using', 'past', 'images', 'is', 'designed', '.', 'the', 'images', 'were', 'pre-processed', 'before', 'being', 'given', 'to', 'the', 'model', '.', 'anomaly', 'detection', 'is', 'performed', 'by', 'comparing', 'the', 'estimated', 'image', 'and', 'the', 'true', 'image', '.']
['person', 're-identification', 'w

In [32]:
# Step 4: stemming
porter = PorterStemmer()
# or snowball stemmer
# stemmer = SnowballStemmer("english",ignore_stopwords=True)
stemmed_tokens_list = []

for i in filtered_sentence:
    stemmed_tokens_list.append([porter.stem(j) for j in i.split()])

# number of tokens
uniques = np.unique([tok for doc in stemmed_tokens_list for tok in doc])
print("Number of tokens after stemming: {}\n".format(len(uniques)))

print('After stemming:')
for i in stemmed_tokens_list[:10]:
    for j in i:
        print(j, end=" ")
    print(" ")

Number of tokens after stemming: 10191

After stemming:
anomali detect wide area imageri studi detect anomali wide area imageri collect aircraft set anomali identifi anyth normal cours action purpos two differ data set use experi carri data set anomali detect convolut neural network model tri gener next imag use past imag design imag pre-process given model anomali detect perform compar estim imag true imag  
person re-identif deep kronecker-product match group-shuffl random walk person re-identif re-id aim robustli measur visual affin person imag wide applic intellig surveil associ person imag across multipl camera gener treat imag retriev problem given probe person imag affin probe imag galleri imag pg affin use rank retriev galleri imag exist two main challeng effect solv problem person imag usual show signific variat differ person pose view angl spatial layout correspond person imag therefor vital inform tackl problem state-of-the-art method either ignor spatial variat util extra p

In [33]:
# 5. Check most frequent words - candidates to add to the stopword list
listofall = [item for elem in stemmed_tokens_list for item in elem]

freq = FreqDist(listofall)
wnum = freq.B()
print("\nMost common words (total %d)" % wnum)
print(freq.most_common(100))


Most common words (total 10191)
[('use', 1793), ('data', 1238), ('system', 1208), ('propos', 1082), ('model', 937), ('method', 880), ('comput', 868), ('robot', 806), ('imag', 792), ('perform', 774), ('base', 728), ('algorithm', 719), ('databas', 701), ('result', 684), ('secur', 665), ('paper', 635), ('approach', 621), ('compil', 602), ('applic', 593), ('design', 569), ('gener', 548), ('learn', 543), ('develop', 535), ('detect', 513), ('process', 512), ('.', 512), ('inform', 507), ('network', 505), ('present', 504), ('implement', 481), ('differ', 470), ('improv', 445), ('relat', 445), ('provid', 441), ('show', 437), ('optim', 437), ('techniqu', 430), ('time', 428), ('studi', 417), ('program', 417), ('evalu', 386), ('also', 385), ('effici', 380), ('work', 374), ('problem', 366), ('analysi', 365), ('object', 361), ('scheme', 358), ('research', 348), ('new', 348), ('featur', 347), ('control', 337), ('key', 336), ('structur', 333), ('compar', 332), ('requir', 331), ('vision', 327), ('two',

In [34]:
# 6. Present as tf-idf
cleaned_documents = [' '.join(i) for i in stemmed_tokens_list]

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),  # unigrams and bigrams
                                   norm='l2',  # default
                                   smooth_idf=False,  # was in the given preproccessing.py
                                   max_df=0.9,
                                   max_features=15000
                                   )
# only tf part:
# tfidf_vectorizer = TfidfVectorizer(use_idf=False)

tfidf_vectorizer.fit(cleaned_documents)
tf_idf_vectors = tfidf_vectorizer.transform(cleaned_documents)

print("\nThe tf-idf values of the first document\n")
feature_names = tfidf_vectorizer.get_feature_names_out()
feature_index = tf_idf_vectors[3, :].nonzero()[1]
tfidf_scores = list(zip(feature_index, [tf_idf_vectors[3, x] for x in feature_index]))
tfidf_scores.sort(key=lambda t: t[1]) 
for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
    print(w, s)

print()
# doesn't match the freq.B() before!! here it is 8k vs. 10k before. Not sure why
print(len(feature_names), 'unique features')

df = pd.DataFrame(list(tf_idf_vectors.toarray()))
df.columns = feature_names
df['_stemmed'] = cleaned_documents
df['_title'] = titles  # with underscore to avoid coincidence with a 'title' feature

# 7*. export data
df.to_csv('data.csv', index=False)

print('\ndata exported to data.csv')


The tf-idf values of the first document

use 0.018523277014819366
paper 0.024194995885516137
method 0.02762676250722815
differ 0.030634027351767523
also 0.030932104969452822
howev 0.032481823161295754
one 0.03406916359899369
new 0.03473757250710102
demonstr 0.036056357098364224
compil 0.03670130394643328
import 0.0390793750915981
control 0.03964266735318102
mani 0.0397250946264875
introduc 0.039891477198720465
sever 0.04120947950484785
solut 0.041871631766058376
measur 0.04246629369463887
area 0.04319494775078536
automat 0.04374064242487513
may 0.04466249346927493
execut 0.04539777492519779
possibl 0.04578102778030854
find 0.04685974871770827
becom 0.048186407581572735
help 0.04865932688305111
base 0.04923899532537756
system 0.05000446437729055
call 0.050008133356129594
toward 0.05036753644639746
year 0.05055089733168141
exampl 0.05111644117077749
comput 0.052807365091497076
run 0.05365078671383848
cloud 0.05365078671383848
amount 0.055122445607142107
shown 0.0559232841806597
benefit 