In [1]:
import re
import collections
import logging
import multiprocessing

import faiss
import gensim
import gensim.downloader
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier
from datasets import load_dataset
from gensim.models import Word2Vec,KeyedVectors
from sklearn.decomposition import TruncatedSVD,NMF
from sklearn.metrics import classification_report
from tqdm.auto import tqdm

tqdm.pandas()

In [2]:
dataset=pd.read_csv("arxiv_reduced.csv")

In [3]:
dataset.head()

Unnamed: 0,id,title,abstract,categories,desired_categories
0,704.0086,Clustering in a stochastic model of one-dimens...,We give a quantitative analysis of clusterin...,['math.PR'],True
1,704.0215,The exact asymptotic of the collision time tai...,In this note we consider the time of the col...,['math.PR'],True
2,704.038,Exponential growth rates in a typed branching ...,We study the high temperature phase of a fam...,['math.PR'],True
3,704.0398,Renewals for exponentially increasing lifetime...,We show that the number of renewals up to ti...,['math.PR'],True
4,704.0405,An invariance principle for semimartingale ref...,Semimartingale reflecting Brownian motions (...,['math.PR'],True


In [4]:
dataset.shape

(23025, 5)

### Preprocessing

In [5]:
def tokenize_punc(text):
    text=re.findall(r"([,.!;\-\(\)]|\b[\w'.]+\b)",text)
    return text
print(tokenize_punc("My name is Ismail, I love data science."))

['My', 'name', 'is', 'Ismail', ',', 'I', 'love', 'data', 'science', '.']


In [6]:
def clean_apostrophes(text):
    text = text.replace("'", '')
    text = text.replace("`", '')
    text = text.replace("“", '')
    text = text.replace("”", '')
    return text
print(clean_apostrophes("My favorite food in this cook book is 'Rissotto'. "))

My favorite food in this cook book is Rissotto. 


In [10]:
def clean_punc(text,keep_stop=True):
    
    if not keep_stop:
        text=re.sub(r"[^\w\s]","",text)
    else:
        text=re.sub(r"[^\w\s!?.]","",text)
    return text.strip()
print(clean_punc("My name is Ismail, I love data science."))

My name is Ismail I love data science.


In [14]:
def preprocess(text):
    
    text=tokenize_punc(text)
    text=list(map(lambda x:clean_apostrophes(x),text))
    text=list(map(lambda x:clean_punc(x),text))
    text=list(map(lambda x:x.lower(),text))
    
    text=" ".join(text)
    text=text.strip()
    return text

In [15]:
print(preprocess("My name is Ismail, I love data science."))

my name is ismail  i love data science .


In [17]:
dataset["title_pp"]=dataset["title"].progress_apply(lambda x: preprocess(x))
dataset["abstract_pp"]=dataset["abstract"].progress_apply(lambda x: preprocess(x))

  0%|          | 0/23025 [00:00<?, ?it/s]

  0%|          | 0/23025 [00:00<?, ?it/s]

In [19]:
ncores=multiprocessing.cpu_count()


In [20]:
dataset.head()

Unnamed: 0,id,title,abstract,categories,desired_categories,title_pp,abstract_pp
0,704.0086,Clustering in a stochastic model of one-dimens...,We give a quantitative analysis of clusterin...,['math.PR'],True,clustering in a stochastic model of one dimen...,we give a quantitative analysis of clustering ...
1,704.0215,The exact asymptotic of the collision time tai...,In this note we consider the time of the col...,['math.PR'],True,the exact asymptotic of the collision time tai...,in this note we consider the time of the colli...
2,704.038,Exponential growth rates in a typed branching ...,We study the high temperature phase of a fam...,['math.PR'],True,exponential growth rates in a typed branching ...,we study the high temperature phase of a famil...
3,704.0398,Renewals for exponentially increasing lifetime...,We show that the number of renewals up to ti...,['math.PR'],True,renewals for exponentially increasing lifetime...,we show that the number of renewals up to time...
4,704.0405,An invariance principle for semimartingale ref...,Semimartingale reflecting Brownian motions (...,['math.PR'],True,an invariance principle for semimartingale ref...,semimartingale reflecting brownian motions sr...


### Preparing for word2vec

In [22]:
title_vocab=dataset["title_pp"].tolist()
title_vocab=[token for line in title_vocab for token in line.split()]

abstract_vocab=dataset["title_pp"].tolist()
abstract_vocab=[token for line in abstract_vocab for token in line.split()]

title_vocab=collections.Counter(title_vocab)
abstract_vocab=collections.Counter(abstract_vocab)

vocab=title_vocab+abstract_vocab
vocab=dict(sorted(vocab.items(),key=lambda x:x[1],reverse=True))



In [27]:
vocab

{'of': 25604,
 'the': 16902,
 'for': 16396,
 'and': 12728,
 'a': 11136,
 'on': 9310,
 'random': 8886,
 'with': 8658,
 'in': 8344,
 'processes': 5850,
 'stochastic': 4684,
 'to': 3684,
 'time': 2746,
 'brownian': 2724,
 'equations': 2528,
 'model': 2358,
 'limit': 2332,
 'process': 2288,
 'markov': 2102,
 'by': 1966,
 'large': 1916,
 'gaussian': 1816,
 'motion': 1730,
 'an': 1680,
 'convergence': 1680,
 'l': 1670,
 'walks': 1626,
 'models': 1600,
 'differential': 1596,
 'non': 1564,
 'dimensional': 1490,
 'walk': 1478,
 'branching': 1436,
 'theorem': 1404,
 'fractional': 1344,
 'percolation': 1262,
 'local': 1188,
 'two': 1152,
 'distribution': 1148,
 'equation': 1136,
 'driven': 1116,
 'type': 1112,
 'deviations': 1098,
 'evy': 1098,
 'approximation': 1094,
 'noise': 1064,
 'stable': 1052,
 'times': 1048,
 'asymptotic': 1040,
 'diffusion': 1024,
 'applications': 1020,
 'distributions': 1018,
 'theorems': 1012,
 'graphs': 998,
 'measures': 998,
 'stationary': 974,
 'matrices': 966,
 'po

In [29]:
def convert_data(dataset,path):
    titles=dataset["title_pp"].tolist()
    abstracts=dataset["abstract_pp"].tolist()
    
    titles_and_abstracts=list(zip(titles,abstracts))
    titles_and_abstracts=list(map(lambda x: x[0] + " -- " + x[1],titles_and_abstracts))
    titles_and_abstracts_write="\n\n".join(titles_and_abstracts)
    
    with open(path,"w",encoding="utf-8") as f:
        f.write(titles_and_abstracts_write)
    titles_and_abstracts=[t_a.split() for t_a in titles_and_abstracts]
    return titles_and_abstracts

In [30]:
titles_and_abstracts=convert_data(dataset,"word2vec_train.txt")

In [31]:
model = Word2Vec(
    sentences=titles_and_abstracts,
    vector_size=300,
    window=5,
    min_count=1,
    workers=12
)

### Search

In [41]:
def get_sentence_vector(model,sentence,k=True):
    if k:
        sentence=preprocess(sentence)
    sentence_=sentence.split()
    embedding_list=[]
    for token in sentence_:
        try:
            vec=model.wv[token]
        except KeyError:
            vec=np.zeros(model.vector_size)
        embedding_list.append(vec)
    embedding_pooled=np.mean(embedding_list,axis=0)
    return embedding_pooled



In [44]:
print(get_sentence_vector(model,"We give a quantitative analysis of clusterin"))

[ 1.73760873e-01 -5.20296635e-01 -5.01851305e-01  7.62987988e-02
 -1.91133150e-01 -6.26178490e-01  4.06711838e-01  4.93630586e-01
 -3.13415440e-01 -2.03620845e-02  2.78329653e-01  1.72463433e-01
  7.50194107e-02 -6.36856481e-02  2.60954084e-02  3.43899305e-02
  3.81364284e-01 -1.85809734e-01  2.33950518e-01  6.32488963e-01
 -7.49104257e-02  3.98844076e-01  6.29321169e-01 -9.13951368e-02
  4.10113147e-01 -5.49959847e-01 -1.03013090e-02  6.50639193e-02
  1.73342230e-01 -9.80595840e-02 -8.56563284e-02  2.73359822e-01
 -1.31905183e-01  2.55335621e-01  1.13643527e-01  1.52305122e-01
 -5.41629206e-02  1.61376899e-01  4.24914803e-01 -2.70142561e-01
 -1.05499833e-01  6.80382601e-01  3.79985375e-01  1.98395134e-01
 -4.91785645e-01  1.16895034e-01 -1.01073282e+00  1.91197772e-01
 -2.30763623e-01  1.01822323e-01 -3.85627616e-01 -2.16322580e-01
  6.42318257e-02 -5.73720455e-01  2.73620107e-02  2.84298633e-01
  6.76350168e-02 -7.50132799e-02  6.60671858e-01 -1.63650498e-01
  3.01675413e-02  4.77816

In [38]:
def vectorized_dotproduct(query_vector,embeddins,k):
    similarity=(title_and_abstracs_embedding @ query_vector.T).squeeze(1)
    topk=np.argpartition(similarity,-k)[-k:]
    return topk

In [49]:
def faiss_search(embeddings,query,k):
    
    index=faiss.IndexFlatL2(embeddings.shape[1])
    index.add(titles_and_abstracts_embedding)
    query_vector=get_sentence_vector(model,query,False)[:,None].T
    D,I=index.search(query_vector,k)
    
    return I
titles_and_abstracts_embedding=[
    get_sentence_vector(model," ".join(t_a),False)
    for t_a in tqdm(titles_and_abstracts)
]

titles_and_abstracts_embedding=np.array(titles_and_abstracts_embedding)

  0%|          | 0/23025 [00:00<?, ?it/s]

In [50]:
titles_and_abstracts_embedding.shape

(23025, 300)

In [52]:
retrieved_index=faiss_search(titles_and_abstracts_embedding,"multiple input cnn",10)

for idx in retrieved_index[0]:
    print(" ".join(titles_and_abstracts[idx]))
    print("\n\n")

tencent video dataset tvd a video dataset for learning based visual data compression and analysis -- learning based visual data compression and analysis have attracted great interest from both academia and industry recently . more training as well as testing datasets especially good quality video datasets are highly desirable for related research and standardization activities . tencent video dataset tvd is established to serve various purposes such as training neural network based coding tools and testing machine vision tasks including object detection and tracking . tvd contains 86 video sequences with a variety of content coverage . each video sequence consists of 65 frames at 4k 3840x2160 spatial resolution . in this paper the details of this dataset as well as its performance when compressed by vvc and hevc video codecs are introduced .



synthesizing dynamic mri using long term recurrent convolutional networks -- a method is proposed for converting raw ultrasound signals of resp

In [57]:
retrieved_index[0]

array([20029, 15157, 20312, 18535, 16934, 14520, 17536, 20980, 16173,
       18180], dtype=int64)

### Analogy

In [58]:
glove_vectors=gensim.downloader.load("glove-wiki-gigaword-300")



In [59]:
def analogy(glove_vectors,ps1,p1,p2):
    v=glove_vectors[ps1]-glove_vectors[p1] + glove_vectors[p2]
    
    return glove_vectors.most_similar(v)
analogy(glove_vectors,"king","man","queen")

[('queen', 0.8265538811683655),
 ('king', 0.7757720947265625),
 ('monarch', 0.5765542387962341),
 ('elizabeth', 0.549494743347168),
 ('throne', 0.5387357473373413),
 ('princess', 0.5265572667121887),
 ('majesty', 0.5183771252632141),
 ('royal', 0.5162538290023804),
 ('coronation', 0.5072318911552429),
 ('vi', 0.4976198375225067)]

In [62]:
analogy(glove_vectors,"software","developer","food")

[('food', 0.7458261847496033),
 ('products', 0.579909086227417),
 ('software', 0.5337928533554077),
 ('supplies', 0.5279377102851868),
 ('goods', 0.5018188953399658),
 ('computers', 0.4913240373134613),
 ('product', 0.4814090430736542),
 ('equipment', 0.4810752868652344),
 ('programs', 0.4797922968864441),
 ('supply', 0.47966468334198)]

### Classification

In [63]:
df=load_dataset("rotten_tomatoes")

Downloading builder script:   0%|          | 0.00/5.03k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.25k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/488k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [66]:
train=[]
for sample in df["train"]:
    train.append([preprocess(sample["text"]), sample["label"]])

In [77]:
test=[]
for sample in df["test"]:
    test.append([preprocess(sample["text"]), sample["label"]])

In [68]:
length=max([len(sample[0].split()) for sample in tqdm(train)])

  0%|          | 0/9596 [00:00<?, ?it/s]

In [73]:
def get_sentence_vector(glove_vectors,sentence,flag=False,pool="mean",max_length=64):
    if flag:
        sentence=preprocess(sentence)
    sentence_=sentence.split()
    embedding_list=[]
    for token in sentence_:
        try:
            vec=glove_vectors.get_vector(token)
        except KeyError:
            vec=np.zeros(glove_vectors.vector_size)
        embedding_list.append(vec)
    
    if pool=="mean":
        embedding_pooled=np.mean(embedding_list,axis=0)
    elif pool=="low_rank":
        if len(embedding_list)>max_length:
            embedding_list=embedding_list[:max_length]
        
        elif len(embedding_list)<max_length:
            for _ in range(max_length-len(embedding_list)):
                vec=np.zeros(glove_vectors.vector_size)
                embedding_list=append(vec)
        embedding_pooled=np.concatenate(embedding_list,axis=0)
    return embedding_pooled

In [81]:
pool="mean"
X_train = [get_sentence_vector(glove_vectors, sample[0], pool=pool) for sample in tqdm(train)]
X_train = np.stack(X_train, axis=0)

X_test = [get_sentence_vector(glove_vectors, sample[0], pool=pool) for sample in tqdm(test)]
X_test = np.stack(X_test, axis=0)

y_train = np.array([sample[1] for sample in tqdm(train)])
y_test = np.array([sample[1] for sample in tqdm(test)])

if pool == "mean":
    svd = TruncatedSVD(n_components=50)
    svd.fit(X_train)
    
    X_train = svd.transform(X_train)
    X_test = svd.transform(X_test)

  0%|          | 0/9596 [00:00<?, ?it/s]

  0%|          | 0/1066 [00:00<?, ?it/s]

  0%|          | 0/9596 [00:00<?, ?it/s]

  0%|          | 0/1066 [00:00<?, ?it/s]

In [83]:
model=CatBoostClassifier(verbose=0)
model.fit(X_train,y_train)
preds=model.predict(X_test)

In [84]:
print(classification_report(y_test,preds))

              precision    recall  f1-score   support

           0       0.90      0.93      0.91       533
           1       0.93      0.89      0.91       533

    accuracy                           0.91      1066
   macro avg       0.91      0.91      0.91      1066
weighted avg       0.91      0.91      0.91      1066

