# Loading the data set and import necessary libraries

In [None]:
# importing necessary libraries
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('punkt')
from nltk.stem.porter import *
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.cluster import KMeans


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# downloading the data
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download paultimothymooney/cvpr-2019-papers/CVPR2019/abstracts
! unzip cvpr-2019-papers.zip
! rm -r CVPR2019/papers
! rm -r cvpr-2019-papers.zip
! rm -r cvpr2019
! ls CVPR2019

In [None]:
# loading the data and splitting into train and test
abs_dir = 'CVPR2019/abstracts/'

data = os.listdir(abs_dir)
train_files, test_files = train_test_split(data, test_size=0.2, random_state=1)

# Performing the vectorization process  

In [None]:
# reading text from files
stop = stopwords.words('english')
train_txt, test_txt = [], []

for doc_name in train_files:
    with open(abs_dir + doc_name) as file:
        doc = ""
        for line in file.readlines():
          for word in line.split()[:-1]:
            if word not in stop:
              doc = doc + ' ' + word.lower()
        train_txt.append(doc)


for test_name in test_files:
    with open(abs_dir + test_name) as file:
        query = ""
        for line in file.readlines():
          for word in line.split()[:-1]:
            if word not in stop:
              query = query + ' '+ word.lower()
        test_txt.append(query.strip())

# Computing TF and TF-IDF factors 

In [None]:
# vectrorization
vectorizer = TfidfVectorizer()

# calculating tf-idf values
doc_tfidfs = vectorizer.fit_transform(train_txt).toarray()
query_vecs = vectorizer.transform(test_txt).toarray()

# Rank documents based on cosine similarity
cos_sim = cosine_similarity(query_vecs, doc_tfidfs)
rankings = np.flip(cos_sim.argsort(), axis=1)

print(cos_sim)
print(rankings)

[[0.05133055 0.06431965 0.04127186 ... 0.02359016 0.02970632 0.02256956]
 [0.08876582 0.08910752 0.01787912 ... 0.04171894 0.06443052 0.01945204]
 [0.06526623 0.03643963 0.02032667 ... 0.02998894 0.02173858 0.01973819]
 ...
 [0.08294365 0.01124988 0.02754354 ... 0.01861755 0.08112373 0.02089505]
 [0.02190253 0.00780802 0.03914401 ... 0.03890883 0.04551046 0.02088216]
 [0.0115889  0.00982963 0.00496501 ... 0.01828689 0.01292194 0.052333  ]]
[[670 459 188 ... 539  52 833]
 [636 640 388 ... 614  75 833]
 [272 460 421 ... 694 258 833]
 ...
 [132 593 448 ... 890 753 833]
 [801 489 238 ...  42  32 833]
 [107  78  47 ...  19 548 833]]


# text classification by using Rocchio algorithm

In [None]:
# Rocchio 
#(Below is a param set)
alpha = 1
beta = 0.75
gamma = 0.15
rel_count = 5   # Use top-5 relevant documents to update query vector.
nrel_count = 1  # Use only the most non-relevant document to update query vector.
iters = 5
for _ in range(iters):
    
    # Update query vectors with Rocchio algorithm
    rel_vecs = doc_tfidfs[rankings[:, :rel_count]].mean(axis=1)
    nrel_vecs = doc_tfidfs[rankings[:, -nrel_count:]].mean(axis=1)
    query_vecs = alpha * query_vecs + beta * rel_vecs - gamma * nrel_vecs
    
    # Rerank documents based on cosine similarity
    cos_sim = cosine_similarity(query_vecs, doc_tfidfs)
    rankings = np.flip(cos_sim.argsort(axis=1), axis=1)

print(cos_sim)
print(rankings)

[[0.10115727 0.08675012 0.06462676 ... 0.0507675  0.05787577 0.04101204]
 [0.07681489 0.09306108 0.04355687 ... 0.045645   0.06219576 0.0352161 ]
 [0.07943071 0.03727308 0.03310668 ... 0.04253915 0.03153237 0.0200157 ]
 ...
 [0.15226483 0.03140234 0.04937466 ... 0.03947287 0.10457395 0.03472491]
 [0.08983933 0.04418926 0.07559159 ... 0.05230113 0.08816972 0.04721661]
 [0.04235981 0.03230935 0.01980204 ... 0.0334429  0.05017432 0.0354665 ]]
[[188 670 359 ... 352 828 833]
 [640 636 946 ... 588 234 833]
 [272 238 460 ... 443 917 833]
 ...
 [132 593 229 ... 783 149 833]
 [801 713 994 ...  32 757 833]
 [107  78 372 ... 605 234 833]]


In [None]:
# retrieving the list of related documents from train for given test query
query_result_df = pd.DataFrame(columns=['Query','RetrievedDocuments'])
for query_name, ranking in zip(test_files, rankings):
  ranked_docs = ' '.join([train_files[idx] for idx in ranking])
  query_result_df.loc[len(query_result_df.index)]= [query_name, ranked_docs]

print(query_result_df.head())

                                               Query                                 RetrievedDocuments
0  Bapat_The_Domain_Transform_Solver_CVPR_2019_pa...  He_ODE-Inspired_Network_Design_for_Single_Imag...
1  Porzi_Seamless_Scene_Segmentation_CVPR_2019_pa...  Kirillov_Panoptic_Feature_Pyramid_Networks_CVP...
2  Li_Fully_Quantized_Network_for_Object_Detectio...  Yang_Quantization_Networks_CVPR_2019_paper.txt...
3  Tonioni_Real-Time_Self-Adaptive_Deep_Stereo_CV...  Tonioni_Learning_to_Adapt_for_Stereo_CVPR_2019...
4  Hou_3D-SIS_3D_Semantic_Instance_Segmentation_o...  Chen_Unsupervised_3D_Pose_Estimation_With_Geom...


# Performing the stemming process on the data set

In [None]:
# Porter stemmer function
porterStemmer = PorterStemmer()
def getStemmerSentnce(sentence):
  wordList = nltk.word_tokenize(sentence)
  stemWords = [porterStemmer.stem(word) for word in wordList]
  return ' '.join(stemWords)

In [None]:
# performing stemming on the data set
stemm_train_txt, stemm_test_txt = [], []

for doc_name in train_files:
    with open(abs_dir + doc_name) as file:
        doc = ' '.join([word for line in file.readlines() for word in line.split()[:-1]])
        stemmed_doc = getStemmerSentnce(doc)
        # print (stemmed_doc)
        stemm_train_txt.append(stemmed_doc)

for test_name in test_files:
    with open(abs_dir + test_name) as file:
        query = ' '.join([word for line in file.readlines() for word in line.split()[:-1]])
        stemmed_query = getStemmerSentnce(query)
        stemm_test_txt.append(stemmed_query)

In [None]:
# calculationg TF-IDF values for stemmed text
stemm_doc_tfidfs = vectorizer.fit_transform(stemm_train_txt).toarray()
stemm_query_vecs = vectorizer.transform(stemm_test_txt).toarray()

# Rank documents based on cosine similarity
stemm_cos_sim = cosine_similarity(stemm_query_vecs, stemm_doc_tfidfs)
stemm_rankings = np.flip(cos_sim.argsort(), axis=1)


print(stemm_cos_sim)
print(stemm_rankings)
print(stemm_cos_sim.shape)
print(stemm_rankings.shape)

[[0.16052659 0.13453684 0.15129964 ... 0.10705927 0.08970227 0.1104427 ]
 [0.16481207 0.14620062 0.07664601 ... 0.09347423 0.12447309 0.08357977]
 [0.13145318 0.07648278 0.0733354  ... 0.10526678 0.07793651 0.07963164]
 ...
 [0.17375719 0.08579837 0.09548839 ... 0.08326577 0.12795391 0.08188284]
 [0.1690227  0.07629702 0.18487406 ... 0.15675256 0.13781531 0.12624728]
 [0.04340912 0.03636528 0.0423178  ... 0.05128383 0.04140397 0.07638388]]
[[188 670 359 ... 352 828 833]
 [640 636 946 ... 588 234 833]
 [272 238 460 ... 443 917 833]
 ...
 [132 593 229 ... 783 149 833]
 [801 713 994 ...  32 757 833]
 [107  78 372 ... 605 234 833]]
(259, 1034)
(259, 1034)


In [None]:
## Rocchio algorithm on stemmed data
for _ in range(iters):    
    # Update query vectors with Rocchio algorithm
    rel_vecs = stemm_doc_tfidfs[rankings[:, :rel_count]].mean(axis=1)
    nrel_vecs = stemm_doc_tfidfs[rankings[:, -nrel_count:]].mean(axis=1)
    stemm_query_vecs = alpha * stemm_query_vecs + beta * rel_vecs - gamma * nrel_vecs
    
    # Rerank documents based on cosine similarity
    stemm_cos_sim = cosine_similarity(stemm_query_vecs, stemm_doc_tfidfs)
    stemm_rankings = np.flip(cos_sim.argsort(axis=1), axis=1)

print(stemm_cos_sim)
print(stemm_rankings)

[[0.23966885 0.18155382 0.21478273 ... 0.1729929  0.15181651 0.15968309]
 [0.18486575 0.17742176 0.14930197 ... 0.13598976 0.15594939 0.12937837]
 [0.1762827  0.10821883 0.12593733 ... 0.14113646 0.10310372 0.10609686]
 ...
 [0.26361559 0.14243541 0.17866429 ... 0.15226703 0.20476739 0.14971517]
 [0.22796745 0.13987281 0.21959964 ... 0.1841974  0.17819565 0.17289597]
 [0.14169326 0.09795802 0.11704202 ... 0.11502973 0.13039205 0.11320644]]
[[188 670 359 ... 352 828 833]
 [640 636 946 ... 588 234 833]
 [272 238 460 ... 443 917 833]
 ...
 [132 593 229 ... 783 149 833]
 [801 713 994 ...  32 757 833]
 [107  78 372 ... 605 234 833]]


# Comparing the performance of Rocchio algorithm with KMeans algorithm

In [None]:
# vectorization
vectorizer = TfidfVectorizer()
doc_tfidfs = vectorizer.fit_transform(train_txt).toarray()
query_vecs = vectorizer.transform(test_txt).toarray()

# KMeans algorithm
true_k = 10
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(doc_tfidfs)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=10, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [None]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

In [None]:
# printing clusters and its features
for i in range(true_k):
 print("\nCluster %d:" % i),
 for ind in order_centroids[i, :10]:
  print('%s'% terms[ind])


Cluster 0:
object
detection
segmentation
network
the
we
image
learning
networks
accuracy

Cluster 1:
context
features
graph
feature
network
convolution
channel
information
semantic
module

Cluster 2:
domain
learning
classes
data
adaptation
image
target
semantic
training
shot

Cluster 3:
adversarial
attacks
attack
defense
image
perturbations
examples
gradient
method
norm

Cluster 4:
video
temporal
action
frame
videos
motion
frames
model
flow
the

Cluster 5:
resolution
imaging
light
blur
image
high
images
low
nlos
super

Cluster 6:
3d
pose
2d
shape
we
face
object
estimation
model
the

Cluster 7:
visual
image
attention
question
task
vqa
dialog
we
language
models

Cluster 8:
depth
stereo
camera
view
method
image
estimation
scene
data
we

Cluster 9:
point
local
cloud
3d
registration
clouds
density
the
matching
points


In [None]:
print("Prediction")
predicted = model.predict(query_vecs)
print(predicted)

Prediction
[8 0 0 8 6 4 4 6 2 2 2 8 7 1 7 0 5 5 0 9 9 2 0 0 0 8 2 7 0 0 1 6 0 0 0 4 4
 3 6 7 0 2 1 3 0 2 7 6 2 4 4 1 4 0 4 0 2 8 1 2 2 5 6 1 0 8 0 0 0 6 9 4 0 4
 2 3 2 6 2 2 2 9 6 9 7 9 0 4 1 0 4 0 7 8 4 0 7 0 7 0 9 0 0 2 4 0 6 0 2 0 2
 6 7 1 1 4 9 0 0 1 8 9 8 0 2 0 2 0 0 0 0 5 0 0 4 8 5 2 2 6 0 2 0 4 0 0 9 0
 1 7 5 7 7 2 7 0 9 4 1 6 7 2 0 7 0 3 6 0 0 2 0 7 0 0 7 7 1 0 4 7 6 7 4 9 0
 4 0 2 2 6 6 0 1 0 2 0 4 2 0 0 8 2 4 6 0 4 0 8 6 2 0 4 4 3 0 2 0 1 3 0 1 0
 0 0 4 2 5 8 4 1 0 5 8 8 0 0 7 7 2 2 4 0 4 4 2 0 0 0 0 0 8 6 0 0 2 7 7 0 7]
