In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import re 

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Papers.csv')
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [4]:
from nltk.corpus import stopwords
from string import punctuation
import nltk
from nltk.stem import WordNetLemmatizer,SnowballStemmer
from gensim.models import LdaMulticore,TfidfModel,CoherenceModel
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [5]:
# Keeping just the necessary columns

df = df[['id','title','paper_text']]
df.head()

Unnamed: 0,id,title,paper_text
0,1,Self-Organization of Associative Database and ...,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,A Mean Field Theory of Layer IV of Visual Cort...,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,Storing Covariance by the Associative Long-Ter...,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,Bayesian Query Construction for Neural Network...,Bayesian Query Construction for Neural\nNetwor...
4,1001,"Neural Network Ensembles, Cross Validation, an...","Neural Network Ensembles, Cross\nValidation, a..."


### Preprocessing

In [6]:
df.isnull().sum()

id            0
title         0
paper_text    0
dtype: int64

In [7]:
# Stopwords
stuff_to_be_removed = list(stopwords.words('english')) + list(punctuation)

In [8]:
def preprocess(text):
  text = text.lower()
    
  #Remove links 
  text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                    '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', text)
  # Remove extra spaces 
  text = re.sub(' +', ' ', text)

  # Remove mentions 
  text = re.sub("(@[A-Za-z0-9_]+)","", text)
    
  # Remove all non alphanumeric characters 
  text = re.sub("^[A-Za-z0-9_-]*$", "", text)
  
  # Remove stopwords and lemmatize
  lemmatizer = WordNetLemmatizer()
  text = ' '.join([lemmatizer.lemmatize(token) for token in text.split() if token not in stuff_to_be_removed and len(token)>=4])
  return text.split()

df_clean = df['paper_text'].apply(preprocess)

In [9]:
df_clean.head()

0    [self-organization, associative, database, app...
1    [mean, field, theory, layer, visual, cortex, a...
2    [storing, covariance, associative, long?term, ...
3    [bayesian, query, construction, neural, networ...
4    [neural, network, ensembles,, cross, validatio...
Name: paper_text, dtype: object

In [10]:
dictionary = Dictionary(df_clean)

In [11]:
# Bag of Words
bow_corpus = [dictionary.doc2bow(i) for i in df_clean]

In [12]:
lda_model = LdaMulticore(bow_corpus,num_topics=10,id2word=dictionary,passes=2,workers=2)

In [13]:
for id, score in sorted(lda_model[bow_corpus[4310]],key=lambda tup:-1*tup[1]):
    print(score,'\n',lda_model.print_topic(id,5))

0.85522294 
 0.005*"model" + 0.005*"network" + 0.005*"function" + 0.004*"learning" + 0.004*"result"
0.105095915 
 0.007*"algorithm" + 0.005*"learning" + 0.004*"function" + 0.004*"bound" + 0.004*"policy"
0.033080705 
 0.007*"algorithm" + 0.006*"model" + 0.004*"matrix" + 0.004*"method" + 0.004*"problem"


In [14]:
for id,topic in lda_model.print_topics(-1):
    print(id,'\n', topic)

0 
 0.007*"algorithm" + 0.005*"function" + 0.005*"learning" + 0.004*"data" + 0.004*"model" + 0.004*"using" + 0.004*"number" + 0.003*"result" + 0.003*"error" + 0.003*"network"
1 
 0.006*"model" + 0.005*"algorithm" + 0.004*"function" + 0.004*"method" + 0.004*"learning" + 0.004*"problem" + 0.004*"result" + 0.004*"using" + 0.003*"number" + 0.003*"also"
2 
 0.006*"model" + 0.004*"network" + 0.004*"learning" + 0.004*"using" + 0.004*"algorithm" + 0.004*"neural" + 0.003*"number" + 0.003*"feature" + 0.003*"method" + 0.003*"training"
3 
 0.005*"model" + 0.005*"network" + 0.005*"function" + 0.004*"learning" + 0.004*"result" + 0.004*"method" + 0.004*"neural" + 0.003*"using" + 0.003*"data" + 0.003*"figure"
4 
 0.007*"algorithm" + 0.006*"model" + 0.004*"matrix" + 0.004*"method" + 0.004*"problem" + 0.004*"data" + 0.004*"function" + 0.004*"learning" + 0.003*"using" + 0.003*"distribution"
5 
 0.008*"model" + 0.007*"image" + 0.006*"learning" + 0.005*"algorithm" + 0.004*"using" + 0.004*"method" + 0.004*"

In [15]:
# TF-IDF
tfidf = TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [16]:
lda_model_tfidf = LdaMulticore(corpus_tfidf,num_topics=10,id2word=dictionary,passes=2,workers=2)

In [17]:
for id, topic in lda_model_tfidf.print_topics(-1):
    print(id,'\n', topic)

0 
 0.000*"neuron" + 0.000*"regret" + 0.000*"latent" + 0.000*"bound" + 0.000*"image" + 0.000*"word" + 0.000*"spike" + 0.000*"loss" + 0.000*"network" + 0.000*"variational"
1 
 0.000*"kernel" + 0.000*"neuron" + 0.000*"image" + 0.000*"spike" + 0.000*"network" + 0.000*"node" + 0.000*"graph" + 0.000*"latent" + 0.000*"cluster" + 0.000*"bound"
2 
 0.000*"kernel" + 0.000*"bound" + 0.000*"graph" + 0.000*"node" + 0.000*"network" + 0.000*"theorem" + 0.000*"matrix" + 0.000*"convex" + 0.000*"neuron" + 0.000*"image"
3 
 0.000*"network" + 0.000*"image" + 0.000*"matrix" + 0.000*"bound" + 0.000*"node" + 0.000*"word" + 0.000*"loss" + 0.000*"theorem" + 0.000*"training" + 0.000*"hidden"
4 
 0.000*"policy" + 0.000*"kernel" + 0.000*"matrix" + 0.000*"image" + 0.000*"label" + 0.000*"bound" + 0.000*"node" + 0.000*"agent" + 0.000*"word" + 0.000*"network"
5 
 0.000*"tensor" + 0.000*"submodular" + 0.000*"image" + 0.000*"policy" + 0.000*"kernel" + 0.000*"feature" + 0.000*"neuron" + 0.000*"object" + 0.000*"matrix" 

In [18]:
for id,score in sorted(lda_model_tfidf[corpus_tfidf[4310]],key=lambda tup: -1*tup[1]):
    print(score, '\n', lda_model_tfidf.print_topic(id,5))

0.4892348 
 0.000*"policy" + 0.000*"kernel" + 0.000*"matrix" + 0.000*"image" + 0.000*"label"
0.4718335 
 0.000*"network" + 0.000*"image" + 0.000*"policy" + 0.000*"neuron" + 0.000*"layer"


In [19]:
coherence_model_lda = CoherenceModel(model=lda_model_tfidf,texts=df_clean,dictionary=dictionary,coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')

Coherence Score: 0.17965558115372982
