In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs
from tensorflow.keras import layers
import ast
from collections import Counter

In [2]:
df = pd.read_csv("E:/NITD/ML_Project/xtract-api/DataSet/arxiv_processed.csv")
df.head()

Unnamed: 0,id,title,abstract,authors,category_code,update_date,clean_title,clean_abstract,category
0,acc-phys/9607001,An Investigation of Stochastic Cooling in the ...,This report provides a description of unbunc...,O. Meincke,acc-phys physics.acc-ph,2008-02-03,an investigation of stochastic cooling in the ...,this report provides a description of unbunche...,"Accelerator Physics, Physics – Accelerator Phy..."
1,acc-phys/9601001,Particle Motion in the Stable Region Near an E...,This paper studies the particle motion when ...,G. Parzen (Brookhaven National Laboratory),acc-phys physics.acc-ph,2008-02-03,particle motion in the stable region near an e...,this paper studies the particle motion when th...,"Accelerator Physics, Physics – Accelerator Phy..."
2,acc-phys/9602001,Muon Colliders,Muon Colliders have unique technical and phy...,"R. B. Palmer(BNL), A. Sessler(LBNL), A. Skrins...",acc-phys physics.acc-ph,2012-08-29,muon colliders,muon colliders have unique technical and physi...,"Accelerator Physics, Physics – Accelerator Phy..."
3,adap-org/9306005,Prediction and Adaptation in an Evolving Chaot...,We describe the results of analytic calculat...,"Alfred H\""ubler and David Pines (Santa Fe Inst...",adap-org chao-dyn nlin.AO nlin.CD,2008-02-03,prediction and adaptation in an evolving chaot...,we describe the results of analytic calculatio...,"Adaptation, Noise, and Self-Organizing Systems..."
4,chao-dyn/9407001,Pattern Dynamics of a Coupled Map Lattice for ...,The pattern dynamics of the one-way coupled ...,Frederick H. Willeboordse (University of Tokyo...,chao-dyn adap-org nlin.AO nlin.CD nlin.PS patt...,2015-06-24,pattern dynamics of a coupled map lattice for ...,the pattern dynamics of the one way coupled lo...,"Adaptation, Noise, and Self-Organizing Systems..."


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194065 entries, 0 to 194064
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   id              194065 non-null  object
 1   title           194065 non-null  object
 2   abstract        194065 non-null  object
 3   authors         194065 non-null  object
 4   category_code   194065 non-null  object
 5   update_date     194065 non-null  object
 6   clean_title     194065 non-null  object
 7   clean_abstract  194065 non-null  object
 8   category        194065 non-null  object
 9   text            194065 non-null  object
dtypes: object(10)
memory usage: 14.8+ MB


In [3]:
df['clean_title'] = df['clean_title'].fillna('').astype(str)
df['clean_abstract'] = df['clean_abstract'].fillna('').astype(str)
df['title'] = df['title'].fillna('').astype(str)

df['text'] = df['clean_title'] + " " + df['clean_abstract']
df.head()

Unnamed: 0,id,title,abstract,authors,category_code,update_date,clean_title,clean_abstract,category,text
0,acc-phys/9607001,An Investigation of Stochastic Cooling in the ...,This report provides a description of unbunc...,O. Meincke,acc-phys physics.acc-ph,2008-02-03,an investigation of stochastic cooling in the ...,this report provides a description of unbunche...,"Accelerator Physics, Physics – Accelerator Phy...",an investigation of stochastic cooling in the ...
1,acc-phys/9601001,Particle Motion in the Stable Region Near an E...,This paper studies the particle motion when ...,G. Parzen (Brookhaven National Laboratory),acc-phys physics.acc-ph,2008-02-03,particle motion in the stable region near an e...,this paper studies the particle motion when th...,"Accelerator Physics, Physics – Accelerator Phy...",particle motion in the stable region near an e...
2,acc-phys/9602001,Muon Colliders,Muon Colliders have unique technical and phy...,"R. B. Palmer(BNL), A. Sessler(LBNL), A. Skrins...",acc-phys physics.acc-ph,2012-08-29,muon colliders,muon colliders have unique technical and physi...,"Accelerator Physics, Physics – Accelerator Phy...",muon colliders muon colliders have unique tech...
3,adap-org/9306005,Prediction and Adaptation in an Evolving Chaot...,We describe the results of analytic calculat...,"Alfred H\""ubler and David Pines (Santa Fe Inst...",adap-org chao-dyn nlin.AO nlin.CD,2008-02-03,prediction and adaptation in an evolving chaot...,we describe the results of analytic calculatio...,"Adaptation, Noise, and Self-Organizing Systems...",prediction and adaptation in an evolving chaot...
4,chao-dyn/9407001,Pattern Dynamics of a Coupled Map Lattice for ...,The pattern dynamics of the one-way coupled ...,Frederick H. Willeboordse (University of Tokyo...,chao-dyn adap-org nlin.AO nlin.CD nlin.PS patt...,2015-06-24,pattern dynamics of a coupled map lattice for ...,the pattern dynamics of the one way coupled lo...,"Adaptation, Noise, and Self-Organizing Systems...",pattern dynamics of a coupled map lattice for ...


In [53]:
df['text'] = df['text'].apply(lambda x: str(x))
df['title'] = df['title'].apply(lambda x: str(x))

In [54]:
data = tf.data.Dataset.from_tensor_slices({
    "text": df['text'].values,
    "title": df['title'].values
})

data = data.shuffle(10000).batch(128)

In [55]:
max_tokens = 8500
embedding_dim = 64
vectorize_layer = layers.TextVectorization(max_tokens=max_tokens, output_sequence_length=270)
vectorize_layer.adapt(df['text'].values)

def build_text_encoder():
    return tf.keras.Sequential([
        vectorize_layer,
        layers.Embedding(max_tokens, embedding_dim),
        layers.GlobalAveragePooling1D(),
        layers.Dense(64)
    ])


In [56]:
class PaperModel(tfrs.Model):
    def __init__(self):
        super().__init__()
        self.query_model = build_text_encoder()
        self.candidate_model = build_text_encoder()

        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=data.map(lambda x: x["text"])
            )
        )

    def compute_loss(self, features, training=False):
        query_embeddings = self.query_model(features["text"])
        candidate_embeddings = self.candidate_model(features["text"])
        return self.task(query_embeddings, candidate_embeddings)

In [57]:
model = PaperModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
model.fit(data, epochs=3)

ValueError: Cannot convert '('c', 'o', 'u', 'n', 't', 'e', 'r')' to a shape. Found invalid entry 'c' of type '<class 'str'>'. 