In [None]:
import pandas as pd 
import numpy as np 
from datetime import datetime
import sys
import ast

import plotly_express as px

import re

import matplotlib.pyplot as plt
import seaborn as sns

import json
import dask.bag as db

### Extract Data 

In [None]:
# arXiv category tags that identify an astrophysics paper. The filter below is a substring
# test on the raw `categories` string, so any listed tag appearing anywhere in it is a hit.
ai_category_list=['astro-ph','astro-ph.GA','astro-ph.CO','astro-ph.EP','astro-ph.HE','astro-ph.IM','astro-ph.SR']
# One JSON document per line of the snapshot file.
records=db.read_text("../input/arxiv/arxiv-metadata-oai-snapshot.json").map(json.loads)
ai_docs = records.filter(lambda x: any(ele in x['categories'] for ele in ai_category_list))


def get_metadata(x):
    """Project one raw arXiv record onto the fields used by the rest of the notebook."""
    return {'id': x['id'],
            'title': x['title'],
            'category': x['categories'],
            'abstract': x['abstract'],
            # creation timestamp of the last entry in the record's version list
            'version': x['versions'][-1]['created'],
            'doi': x["doi"],
            'authors_parsed': x['authors_parsed']}


data=ai_docs.map(get_metadata).to_dataframe().compute()

# `encoding` was an unused keyword of to_excel and is no longer accepted by pandas 2.x.
data.to_excel("Astro_ArXiv_Papers.xlsx",index=False)

In [None]:
print("Number of Papers Related to Astrophysics is ",data.shape[0])

### Data Wrangling

In [None]:
data['DateTime']=pd.to_datetime(data['version'])
data.head()

In [None]:
data['Day'] = data['DateTime'].dt.day
data['Month'] = data['DateTime'].dt.month
data['Year'] = data['DateTime'].dt.year

In [None]:
data = data[data['Year'].between(1992,2021)]

In [None]:
# Concatenate the author first and last names
data['num_authors']=data['authors_parsed'].apply(lambda x:len(x))

data['authors']=data['authors_parsed'].apply(lambda authors:[(" ".join(author)).strip() for author in authors])
data.head()

In [None]:
papers_over_years=data.groupby(['Year']).size().reset_index().rename(columns={0:'Number Of Papers Published'})
px.line(x="Year",y="Number Of Papers Published",data_frame=papers_over_years,title="Growth of Astrophysics Articles over the Years")

In [None]:
print("Number of Papers Related to Astrophysics within 1992 - 2021 is ",data.shape[0])

## The Growth in Machine Learning in Astrophysics

#### Filter the dataset for any machine learning techniques applied in the astrophysics articles.

In [None]:
data['adaptive resonance theory network'] = data['abstract'].str.contains('adaptive resonance theory network', flags=re.I)
data['adaptive resonance theory network'].aggregate('sum')

In [None]:
data['association rule learning'] = data['abstract'].str.contains('association rule learning', flags=re.I)
data['association rule learning'].aggregate('sum')

In [None]:
# AdaBoost
data['AdaBoost'] = data['abstract'].str.contains('AdaBoost', flags=re.I)
data['AdaBoost'].aggregate('sum')

In [None]:
data['auto critic'] = data['abstract'].str.contains('auto critic', flags=re.I)
data['auto critic'].aggregate('sum')

In [None]:
data['autoencoder'] = data['abstract'].str.contains('autoencoder', flags=re.I)
data['autoencoder'].aggregate('sum')

In [None]:
data['averaged one-dependence estimators'] = data['abstract'].str.contains('averaged one-dependence estimators', flags=re.I)
data['averaged one-dependence estimators'].aggregate('sum')

In [None]:
data['Bayesian network'] = data['abstract'].str.contains('Bayesian network', flags=re.I)
data['Bayesian network'].aggregate('sum')

In [None]:
data['convolutional neural network'] = data['abstract'].str.contains('convolutional neural network', flags=re.I)
data['convolutional neural network'].aggregate('sum')

In [None]:
# Density-based spatial clustering of applications with noise
data['DBSCAN'] = data['abstract'].str.contains('DBSCAN', flags=re.I)
data['DBSCAN'].aggregate('sum')

In [None]:
data['decision trees'] = data['abstract'].str.contains('decision trees', flags=re.I)
data['decision trees'].aggregate('sum')

In [None]:
# deep q-network
data['DQN'] = data['abstract'].str.contains('DQN', flags=re.I)
data['DQN'].aggregate('sum')

In [None]:
data['discriminant analysis'] = data['abstract'].str.contains('discriminant analysis', flags=re.I)
data['discriminant analysis'].aggregate('sum')

In [None]:
data['expectation maximization'] = data['abstract'].str.contains('expectation maximization', flags=re.I)
data['expectation maximization'].aggregate('sum')

In [None]:
data['factor analysis'] = data['abstract'].str.contains('factor analysis', flags=re.I)
data['factor analysis'].aggregate('sum')

In [None]:
# Generative adversarial networks
# Word boundaries are required here: a bare case-insensitive 'GANs' substring also matches
# ordinary words such as "organs" or "slogans" and inflates the count.
data['GANs'] = data['abstract'].str.contains(r'\bGANs\b', flags=re.I)
data['GANs'].aggregate('sum')

In [None]:
data['hierarchical clustering'] = data['abstract'].str.contains('hierarchical clustering', flags=re.I)
data['hierarchical clustering'].aggregate('sum')

In [None]:
data['hopfield network'] = data['abstract'].str.contains('hopfield network', flags=re.I)
data['hopfield network'].aggregate('sum')

In [None]:
data['k-means'] = data['abstract'].str.contains('k-means', flags=re.I)
data['k-means'].aggregate('sum')

In [None]:
data['k-medians'] = data['abstract'].str.contains('k-medians', flags=re.I)
data['k-medians'].aggregate('sum')

In [None]:
# The search term now matches the column name; it was previously misspelled ('k-mediods'),
# so abstracts mentioning k-medoids were never counted.
data['k-medoids'] = data['abstract'].str.contains('k-medoids', flags=re.I)
data['k-medoids'].aggregate('sum')

In [None]:
data['latent semantic indexing'] = data['abstract'].str.contains('latent semantic indexing', flags=re.I)
data['latent semantic indexing'].aggregate('sum')

In [None]:
# learning vector quantization
data['LVQ'] = data['abstract'].str.contains('LVQ', flags=re.I)
data['LVQ'].aggregate('sum')

In [None]:
data['local outlier factor'] = data['abstract'].str.contains('local outlier factor', flags=re.I)
data['local outlier factor'].aggregate('sum')

In [None]:
data['local regression'] = data['abstract'].str.contains('local regression', flags=re.I)
data['local regression'].aggregate('sum')

In [None]:
data['logistic regression'] = data['abstract'].str.contains('logistic regression', flags=re.I)
data['logistic regression'].aggregate('sum')

In [None]:
# long short-term memory / LSTM network
data['LSTM'] = data['abstract'].str.contains('LSTM', flags=re.I)
data['LSTM'].aggregate('sum')

In [None]:
data['Markov random field'] = data['abstract'].str.contains('Markov random field', flags=re.I)
data['Markov random field'].aggregate('sum')

In [None]:
data['Monte-Carlo tree search'] = data['abstract'].str.contains('Monte-Carlo tree search', flags=re.I)
data['Monte-Carlo tree search'].aggregate('sum')

In [None]:
data['multidimensional scaling'] = data['abstract'].str.contains('multidimensional scaling', flags=re.I)
data['multidimensional scaling'].aggregate('sum')

In [None]:
data['multivariant adaptive regression splines'] = data['abstract'].str.contains('multivariant adaptive regression splines', flags=re.I)
data['multivariant adaptive regression splines'].aggregate('sum')

In [None]:
data['Naive Bayes'] = data['abstract'].str.contains('Naive Bayes', flags=re.I)
data['Naive Bayes'].aggregate('sum')

In [None]:
# K-nearest neighbour
data['KNN'] = data['abstract'].str.contains('KNN', flags=re.I)
data['KNN'].aggregate('sum')

In [None]:
data['neural actor-critic'] = data['abstract'].str.contains('neural actor-critic', flags=re.I)
data['neural actor-critic'].aggregate('sum')

In [None]:
data['decision stump'] = data['abstract'].str.contains('decision stump', flags=re.I)
data['decision stump'].aggregate('sum')

In [None]:
data['perceptron'] = data['abstract'].str.contains('perceptron', flags=re.I)
data['perceptron'].aggregate('sum')

In [None]:
data['policy gradient estimation'] = data['abstract'].str.contains('policy gradient estimation', flags=re.I)
data['policy gradient estimation'].aggregate('sum')

In [None]:
# Principal component analysis
data['PCA'] = data['abstract'].str.contains('PCA', flags=re.I)
data['PCA'].aggregate('sum')

In [None]:
# probabilistic latent semantic indexing
data['PLSI'] = data['abstract'].str.contains('PLSI', flags=re.I)
data['PLSI'].aggregate('sum')

In [None]:
data['projection pursuit'] = data['abstract'].str.contains('projection pursuit', flags=re.I)
data['projection pursuit'].aggregate('sum')

In [None]:
data['q-learning'] = data['abstract'].str.contains('q-learning', flags=re.I)
data['q-learning'].aggregate('sum')

In [None]:
# radial basis function
data['RBF'] = data['abstract'].str.contains('RBF', flags=re.I)
data['RBF'].aggregate('sum')

In [None]:
data['random forest'] = data['abstract'].str.contains('random forest', flags=re.I)
data['random forest'].aggregate('sum')

In [None]:
## recurrent neural networks
data['recurrent neural networks'] = data['abstract'].str.contains('recurrent neural networks', flags=re.I)
data['recurrent neural networks'].aggregate('sum')

In [None]:
data['Boltzmann machine'] = data['abstract'].str.contains('Boltzmann machine', flags=re.I)
data['Boltzmann machine'].aggregate('sum')

In [None]:
# State-Action-Reward-State-Action SARSA
data['SARSA'] = data['abstract'].str.contains('SARSA', flags=re.I)
data['SARSA'].aggregate('sum')

In [None]:
data['spherical k-means'] = data['abstract'].str.contains('spherical k-means', flags=re.I)
data['spherical k-means'].aggregate('sum')

In [None]:
data['stepwise regression'] = data['abstract'].str.contains('stepwise regression', flags=re.I)
data['stepwise regression'].aggregate('sum')

In [None]:
# support vector machine
data['SVM'] = data['abstract'].str.contains('SVM', flags=re.I)
data['SVM'].aggregate('sum')

In [None]:
# artificial neural network 
data['artificial neural network'] = data['abstract'].str.contains('artificial neural network', flags=re.I)
data['artificial neural network'].aggregate('sum')

In [None]:
data['temporal difference learning'] = data['abstract'].str.contains('temporal difference learning', flags=re.I)
data['temporal difference learning'].aggregate('sum')

In [None]:
data['ZeroR'] = data['abstract'].str.contains('ZeroR', flags=re.I)
data['ZeroR'].aggregate('sum')

In [None]:
# Yearly number of abstracts mentioning each technique (sum of the boolean flag columns).
# The columns are selected with a list (double brackets): indexing a GroupBy with a bare
# tuple of keys is deprecated and rejected by pandas 2.x.
# figsize is passed to the plot call because rcParams only affects figures created afterwards.
data.groupby('Year')[['AdaBoost','artificial neural network','adaptive resonance theory network','association rule learning','auto critic','autoencoder',
                      'averaged one-dependence estimators','Bayesian network','convolutional neural network','DBSCAN',
'decision trees','DQN','discriminant analysis','expectation maximization','factor analysis','hierarchical clustering',
'hopfield network','k-means','k-medians','k-medoids','latent semantic indexing','LVQ','local outlier factor',
                      'local regression','logistic regression','LSTM','Markov random field','Monte-Carlo tree search',
                      'multidimensional scaling','multivariant adaptive regression splines','Naive Bayes','KNN','GANs',
                      'neural actor-critic','decision stump','perceptron','policy gradient estimation','PCA','PLSI',
                      'projection pursuit','q-learning','RBF','random forest','Boltzmann machine','SARSA','recurrent neural networks',
                      'spherical k-means','stepwise regression','SVM','temporal difference learning','ZeroR']].aggregate('sum').plot.line(figsize=(20,15))

plt.legend(loc='upper left')
plt.xlabel('Year')
plt.ylabel('ML Techniques Over Time')
# Vertical markers at the end of each decade analysed below.
plt.axvline(x = 2001, color = 'red',ls = "--")
plt.axvline(x = 2011, color = 'blue',ls = "--")
plt.axvline(x = 2021, color = 'green',ls = "--")

# Kept so that figures created by later cells still pick up this default size.
plt.rcParams["figure.figsize"] = (20,15)

#### The graph above shows how the use of machine learning techniques has changed over time: convolutional neural networks have spiked in recent years, followed by PCA.

In [None]:
# Papers from the first decade. `.copy()` makes data1 an independent frame so that columns
# added to it later do not trigger SettingWithCopyWarning.
data1 = data[data['Year'].between(1992,2001)].copy()

# Columns are selected with a list; tuple-style GroupBy indexing is rejected by pandas 2.x.
# figsize is given to the plot call because rcParams only affects figures created afterwards.
data1.groupby('Year')[['hierarchical clustering','PCA','artificial neural network','GANs',
                      'expectation maximization','discriminant analysis',
                     'decision trees']].aggregate('sum').plot.line(figsize=(10,10))

plt.legend(loc='upper left')
plt.xlabel('Year')
plt.ylabel('ML Techniques')
plt.title('Top 7 ML Techniques in Astrophysics from 1992 to 2001')
# Kept so that figures created by later cells still pick up this default size.
plt.rcParams["figure.figsize"] = (10,10)

##### The plot shows the top seven machine learning techniques during the period 1992 to 2001. Hierarchical clustering was popular from 1992 to 1996, whereas PCA skyrocketed from 1996 and ranked among the top machine learning techniques over these 10 years. PCA is among the top, with usage close to that of hierarchical clustering, followed by artificial neural networks.

In [None]:
# Papers from the second decade. `.copy()` makes data2 an independent frame so that columns
# added to it later do not trigger SettingWithCopyWarning.
data2 = data[data['Year'].between(2002,2011)].copy()

# Columns are selected with a list; tuple-style GroupBy indexing is rejected by pandas 2.x.
# figsize is given to the plot call because rcParams only affects figures created afterwards.
data2.groupby('Year')[['PCA','hierarchical clustering','artificial neural network','k-means','perceptron',
                     'decision trees','GANs','discriminant analysis','factor analysis','local regression']].aggregate('sum').plot.line(figsize=(10,10))

plt.legend(loc='upper left')
plt.xlabel('Year')
plt.ylabel('ML Techniques')
plt.title('Top 10 ML Techniques in Astrophysics from 2002 to 2011')
# Kept so that figures created by later cells still pick up this default size.
plt.rcParams["figure.figsize"] = (10,10)

##### The graph above demonstrates constant growth in PCA, while the application of hierarchical clustering declines from 2002. Another point to note is that the usage of artificial neural networks and k-means has increased gradually, as can be seen from the graph (green line and red line respectively).


In [None]:
# Papers from the third decade. `.copy()` makes data3 an independent frame rather than a
# view-like slice of `data`.
data3 = data[data['Year'].between(2012,2021)].copy()

# Columns are selected with a list; tuple-style GroupBy indexing is rejected by pandas 2.x.
# figsize is given to the plot call because rcParams only affects figures created afterwards.
data3.groupby('Year')[['PCA','convolutional neural network','artificial neural network','autoencoder',
                      'hierarchical clustering','k-means','perceptron','DBSCAN',
                      'decision trees','GANs']].aggregate('sum').plot.line(figsize=(10,10))

plt.legend(loc='upper left')
plt.xlabel('Year')
plt.ylabel('ML Techniques')
plt.title('Top 10 ML Techniques in Astrophysics from 2012 to 2021')
# Kept so that figures created by later cells still pick up this default size.
plt.rcParams["figure.figsize"] = (10,10)

##### There is a huge change from 2012 to 2021: PCA and artificial neural networks grow steadily, while convolutional neural networks spike from 2016 and top the graph over this 10-year period. 

## Machine Learning Techniques' Topic Modeling - LDA

#### To further investigate how the topics change for the top machine learning technique of each decade.

### Topics applied Hierarchical Clustering (1992 - 2001)

In [None]:
# Flag abstracts mentioning hierarchical clustering. The match is case-insensitive so that it
# agrees with the 'hierarchical clustering' column used for the yearly plots (a plain `in`
# test would miss e.g. "Hierarchical clustering" at the start of a sentence).
data1['is_HC']=data1['abstract'].str.contains('hierarchical clustering', flags=re.I, na=False).astype(int)
HC_papers=data1[data1['is_HC']==1]
HC_papers=HC_papers.reset_index(drop=True)

print("Number of Papers with Hierarchical Clustering on Arxiv is ",HC_papers.shape[0])

In [None]:
print("The First Paper applied Hierarchical Clustering in ",min(HC_papers['DateTime']))
print("The Recent Paper applied Hierarchical Clustering in ",max(HC_papers['DateTime']))

In [None]:
import nltk
from nltk.corpus import stopwords
import spacy
from nltk.util import ngrams

In [None]:
def removeStopWords(text,stop_words):
    """Return `text` with every whitespace-separated token found in `stop_words` removed.

    The surviving tokens are re-joined with single spaces. Matching is exact
    (case-sensitive, no punctuation stripping).
    """
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

In [None]:
titles=HC_papers['title'].tolist()
stop_words = set(stopwords.words('english')) 
titles=[title.lower() for title in titles] ### Lower Casing the Title
titles=[removeStopWords(title,stop_words) for title in titles]

In [None]:
def generateNGram(text,n=2):
    """Return the n-grams of `text` as strings whose tokens are joined by "_".

    `text` is split on single spaces, so consecutive spaces produce empty tokens;
    any n-gram that would begin or end with "_" is discarded.
    """
    words = text.split(" ")
    # n copies of the token list, each shifted one position further, zipped into windows
    shifted = [words[offset:] for offset in range(n)]
    result = []
    for window in zip(*shifted):
        candidate = "_".join(window)
        if candidate.startswith("_") or candidate.endswith("_"):
            continue
        result.append(candidate)
    return result

def getMostCommon(reviews_list,topn=20):
    """Return the `topn` most frequent space-separated tokens across `reviews_list`.

    Args:
        reviews_list: iterable of strings; each is split on single spaces.
        topn: number of (token, count) pairs to return.

    Returns:
        A list of (token, count) tuples, most frequent first.

    Empty tokens (from empty strings or repeated spaces) are ignored, so they can
    no longer take up one of the `topn` slots.
    """
    # Imported here so the function does not depend on a later cell having been run.
    from collections import Counter

    tokens = (
        token
        for review in reviews_list
        for token in review.split(" ")
        if token
    )
    return Counter(tokens).most_common(topn)

In [None]:
from collections import Counter

In [None]:
bigrams_list=[" ".join(generateNGram(title,2)) for title in titles]
topn=10
top_bigrams=getMostCommon(bigrams_list,topn=topn)
top_bigrams_df=pd.DataFrame()
top_bigrams_df['words']=[val[0] for val in top_bigrams]
top_bigrams_df['Frequency']=[val[1] for val in top_bigrams]
px.bar(data_frame=top_bigrams_df.sort_values("Frequency",ascending=True),x="Frequency",y="words",orientation="h",title="Top "+str(topn)+" Bigrams in Papers applied Hierarchical Clustering")

In [None]:
trigrams_list=[" ".join(generateNGram(title.replace(":",""),3)) for title in titles]
topn=10
top_trigrams=getMostCommon(trigrams_list,topn=topn)
top_trigrams_df=pd.DataFrame()
top_trigrams_df['words']=[val[0] for val in top_trigrams]
top_trigrams_df['Frequency']=[val[1] for val in top_trigrams]
top_trigrams_df=top_trigrams_df[top_trigrams_df["words"]!=""]
px.bar(data_frame=top_trigrams_df.sort_values("Frequency",ascending=True),x="Frequency",y="words",orientation="h",title="Top "+str(topn)+" Trigrams in Papers applied Hierarchical Clustering")

In [None]:
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis

In [None]:
def tokenise(sentences):
    """Tokenise each sentence with gensim's `simple_preprocess`.

    Every sentence is processed with deacc=True and max_len=50; the result is one
    token list per input sentence, in input order.
    """
    tokenised = []
    for sentence in sentences:
        tokenised.append(gensim.utils.simple_preprocess(sentence, deacc=True, max_len=50))
    return tokenised

In [None]:
tokenised_sentences=tokenise(HC_papers['title'].tolist())
tokenised_sentences[0]

In [None]:
import en_core_web_sm
nlp = spacy.load("en_core_web_sm")

In [None]:
def lemmatise(sentence,stop_words,allowed_postags=None):
    """Return the lemmas of `sentence`, skipping stop words.

    Args:
        sentence: text handed to the module-level `nlp` pipeline.
        stop_words: container of token texts to drop (compared against `token.text`).
        allowed_postags: optional container of part-of-speech tags; when given, only
            tokens whose `token.pos_` is in it are kept. None keeps every tag.

    Returns:
        A list with `token.lemma_` for each retained token.
    """
    doc=nlp(sentence)
    # Identity test against None: `!= None` misbehaves for array-like arguments.
    if allowed_postags is None:
        return [token.lemma_ for token in doc if token.text not in stop_words]
    return [
        token.lemma_
        for token in doc
        if token.pos_ in allowed_postags and token.text not in stop_words
    ]

In [None]:
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [None]:
sentences=[" ".join(tokenised_sentence) for tokenised_sentence in tokenised_sentences]
lemmatised_sentences=[lemmatise(sentence,stop_words) for sentence in sentences]
lemmatised_sentences[0]

In [None]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(lemmatised_sentences,min_count=2) 
trigram = gensim.models.Phrases(bigram[lemmatised_sentences],min_count=2)  

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [None]:
bigrams_words=[bigram_mod[sentence] for sentence in lemmatised_sentences]

trigrams_words=[trigram_mod[sentence] for sentence in bigrams_words]

In [None]:
id2word = corpora.Dictionary(trigrams_words)
corpus = [id2word.doc2bow(text) for text in trigrams_words]
[(id2word[id], freq) for id, freq in corpus[0]] 

In [None]:
def compute_coherence_values(id2word, corpus, texts, limit, start=2, step=3):
    """Train one LDA model per candidate topic count and score its c_v coherence.

    Topic counts are taken from range(start, limit, step).

    Args:
        id2word: dictionary passed to both LdaModel and CoherenceModel.
        corpus: corpus the LDA models are trained on.
        texts: tokenised texts handed to CoherenceModel.
        limit: exclusive upper bound on the number of topics.
        start: first number of topics to try.
        step: increment between candidate topic counts.

    Returns:
        (model_list, coherence_values): two parallel lists, one entry per topic count.
    """
    fitted_models = []
    scores = []
    for topic_count in range(start, limit, step):
        lda = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=id2word,
            num_topics=topic_count,
            random_state=100,
            update_every=1,
            chunksize=20,
            passes=10,
            alpha='auto',
            per_word_topics=True,
        )
        fitted_models.append(lda)
        scorer = CoherenceModel(model=lda, texts=texts, dictionary=id2word, coherence='c_v')
        scores.append(scorer.get_coherence())

    return fitted_models, scores

In [None]:
# Coherence score for each candidate number of topics (2, 4, ..., 18).
models,coherence=compute_coherence_values(id2word,corpus,trigrams_words,limit=20,start=2,step=2)
x = range(2, 20, 2)  # must mirror the start/limit/step used above
plt.plot(x, coherence)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# One-element tuple: a bare string would be iterated character by character and the
# legend would show only "c".
plt.legend(("coherence_values",), loc='best')
plt.show()

In [None]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=20,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
print(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [None]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=trigrams_words, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
def format_topics_sentences(texts,ldamodel=lda_model, corpus=corpus):
    """Build a table with the dominant topic of every document in `corpus`.

    Args:
        texts: original document texts, in the same order as `corpus`.
        ldamodel: trained LDA model. NOTE: the default is bound once, when this
            function is defined, so pass the model explicitly after retraining.
        corpus: bag-of-words corpus. The default is bound at definition time as well.

    Returns:
        DataFrame with columns 'Dominant_Topic', 'Perc_Contribution' and
        'Topic_Keywords', plus one unnamed column (label 0) holding `texts`.
    """
    rows = []
    for doc_result in ldamodel[corpus]:
        # assumes the model was trained with per_word_topics=True, so each result is a
        # tuple whose first element is the list of (topic_id, probability) pairs
        topic_dist = doc_result[0]
        if not topic_dist:
            # Keep a placeholder so that row i still lines up with texts[i].
            rows.append((float("nan"), float("nan"), ""))
            continue
        # max() returns the first pair with the highest probability, which is the same
        # pair a stable descending sort would put first.
        topic_num, prop_topic = max(topic_dist, key=lambda pair: pair[1])
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join(word for word, _ in wp)
        rows.append((int(topic_num), round(prop_topic,4), topic_keywords))

    # Build the frame once: DataFrame.append is gone in pandas 2.x and copied the whole
    # frame on every call.
    sent_topics_df = pd.DataFrame(rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [None]:
df_topic_sents_keywords = format_topics_sentences(HC_papers['title'].tolist(),ldamodel=lda_model, corpus=corpus)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

In [None]:
# Number of documents per dominant topic. The index and value column are named explicitly
# because the labels produced by value_counts().reset_index() differ between pandas 1.x
# and 2.x, and renaming the old labels would silently mislabel the columns on 2.x.
topic_counts=df_dominant_topic['Dominant_Topic'].value_counts().rename_axis('Topic').reset_index(name='Number of Documents')
topic_counts['percentage_contribution']=(topic_counts['Number of Documents']/topic_counts['Number of Documents'].sum())*100
topic_counts

In [None]:
def explore_topic(lda_model, topic_number, topn, output=True):
    """
    Collect the top `topn` terms of topic `topic_number` from `lda_model`.

    When `output` is true, each term is also printed next to its weight
    (rounded to three decimals). Returns the terms as a list, in the order
    given by `lda_model.show_topic`.
    """
    collected = []
    for word, weight in lda_model.show_topic(topic_number, topn=topn):
        if output:
            print(u'{:20} {:.3f}'.format(word, round(weight, 3)))
        collected.append(word)

    return collected

In [None]:
# Must match the num_topics the LDA model above was trained with.
num_topics=4

# Print the ten strongest terms of every topic and keep the first five of each as a summary.
topic_summaries = []
print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
for i in range(num_topics):
    print('Topic '+str(i)+' |---------------------\n')
    tmp = explore_topic(lda_model,topic_number=i, topn=10, output=True )
    topic_summaries.append(tmp[:5])
    # Blank separator line; a bare `print` is a no-op in Python 3.
    print()

In [None]:
from sklearn.manifold import TSNE

# One dense weight vector per document, indexed by topic id. The model omits low-probability
# topics from its output, so storing weights positionally would shift the columns and make
# argmax report the wrong topic.
topic_weights = []
for row_list in lda_model[corpus]:
    dense_row = [0.0] * lda_model.num_topics
    # row_list[0] holds the (topic_id, weight) pairs of the document
    for topic_id, weight in row_list[0]:
        dense_row[topic_id] = weight
    topic_weights.append(dense_row)

arr = pd.DataFrame(topic_weights).fillna(0).values

# Dominant topic id of each document
topic_num = np.argmax(arr, axis=1)

# Two-dimensional t-SNE embedding of the topic-weight vectors
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)

In [None]:
sent_topics_df=pd.DataFrame()
sent_topics_df['Text']=HC_papers['title'].tolist()
sent_topics_df['tsne_x']=tsne_lda[:,0]
sent_topics_df['tsne_y']=tsne_lda[:,1]
sent_topics_df['Topic_No']=topic_num
sent_topics_df=pd.merge(sent_topics_df,df_dominant_topic,on="Text")
sent_topics_df.head()

### Topic applied PCA (1992 - 2001)

In [None]:
data1['is_PCA']=data1['abstract'].apply(lambda x:1 if "PCA" in x else 0)
PCA_papers=data1[data1['is_PCA']==1]
PCA_papers=PCA_papers.reset_index(drop=True)

print("Number of Papers with PCA on Arxiv is ",PCA_papers.shape[0])

In [None]:
print("The First Paper applied PCA in ",min(PCA_papers['DateTime']))
print("The Recent Paper applied PCA in ",max(PCA_papers['DateTime']))

In [None]:
titles=PCA_papers['title'].tolist()
stop_words = set(stopwords.words('english')) 
titles=[title.lower() for title in titles]
titles=[removeStopWords(title,stop_words) for title in titles]

In [None]:
bigrams_list=[" ".join(generateNGram(title,2)) for title in titles]
topn=10
top_bigrams=getMostCommon(bigrams_list,topn=topn)
top_bigrams_df=pd.DataFrame()
top_bigrams_df['words']=[val[0] for val in top_bigrams]
top_bigrams_df['Frequency']=[val[1] for val in top_bigrams]
px.bar(data_frame=top_bigrams_df.sort_values("Frequency",ascending=True),x="Frequency",y="words",orientation="h",title="Top "+str(topn)+" Bigrams in Papers applied PCA")

In [None]:
trigrams_list=[" ".join(generateNGram(title.replace(":",""),3)) for title in titles]
topn=10
top_trigrams=getMostCommon(trigrams_list,topn=topn)
top_trigrams_df=pd.DataFrame()
top_trigrams_df['words']=[val[0] for val in top_trigrams]
top_trigrams_df['Frequency']=[val[1] for val in top_trigrams]
top_trigrams_df=top_trigrams_df[top_trigrams_df["words"]!=""]
px.bar(data_frame=top_trigrams_df.sort_values("Frequency",ascending=True),x="Frequency",y="words",orientation="h",title="Top "+str(topn)+" Trigrams in Papers applied PCA")

In [None]:
tokenised_sentences=tokenise(PCA_papers['title'].tolist())
tokenised_sentences[0]

In [None]:
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [None]:
sentences=[" ".join(tokenised_sentence) for tokenised_sentence in tokenised_sentences]
lemmatised_sentences=[lemmatise(sentence,stop_words) for sentence in sentences]
lemmatised_sentences[0]

In [None]:
bigram = gensim.models.Phrases(lemmatised_sentences,min_count=2) 
trigram = gensim.models.Phrases(bigram[lemmatised_sentences],min_count=2)  

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [None]:
bigrams_words=[bigram_mod[sentence] for sentence in lemmatised_sentences]

trigrams_words=[trigram_mod[sentence] for sentence in bigrams_words]

In [None]:
id2word = corpora.Dictionary(trigrams_words)
corpus = [id2word.doc2bow(text) for text in trigrams_words]
[(id2word[id], freq) for id, freq in corpus[0]] 

In [None]:
# Coherence score for each candidate number of topics (2, 4, ..., 18).
models,coherence=compute_coherence_values(id2word,corpus,trigrams_words,limit=20,start=2,step=2)
x = range(2, 20, 2)  # must mirror the start/limit/step used above
plt.plot(x, coherence)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# One-element tuple: a bare string would be iterated character by character and the
# legend would show only "c".
plt.legend(("coherence_values",), loc='best')
plt.show()

In [None]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=20,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
print(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [None]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=trigrams_words, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
df_topic_sents_keywords = format_topics_sentences(PCA_papers['title'].tolist(),ldamodel=lda_model, corpus=corpus)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

In [None]:
# Number of documents per dominant topic. The index and value column are named explicitly
# because the labels produced by value_counts().reset_index() differ between pandas 1.x
# and 2.x, and renaming the old labels would silently mislabel the columns on 2.x.
topic_counts=df_dominant_topic['Dominant_Topic'].value_counts().rename_axis('Topic').reset_index(name='Number of Documents')
topic_counts['percentage_contribution']=(topic_counts['Number of Documents']/topic_counts['Number of Documents'].sum())*100
topic_counts

In [None]:
# Must match the num_topics the LDA model above was trained with.
num_topics=4

# Print the ten strongest terms of every topic and keep the first five of each as a summary.
topic_summaries = []
print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
for i in range(num_topics):
    print('Topic '+str(i)+' |---------------------\n')
    tmp = explore_topic(lda_model,topic_number=i, topn=10, output=True )
    topic_summaries.append(tmp[:5])
    # Blank separator line; a bare `print` is a no-op in Python 3.
    print()

In [None]:
# Get topic weights: one dense vector per document, indexed by topic id. The model omits
# low-probability topics from its output, so storing weights positionally would shift the
# columns and make argmax report the wrong topic.
topic_weights = []
for row_list in lda_model[corpus]:
    dense_row = [0.0] * lda_model.num_topics
    # row_list[0] holds the (topic_id, weight) pairs of the document
    for topic_id, weight in row_list[0]:
        dense_row[topic_id] = weight
    topic_weights.append(dense_row)

# Array of topic weights    
arr = pd.DataFrame(topic_weights).fillna(0).values


# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)

In [None]:
sent_topics_df=pd.DataFrame()
sent_topics_df['Text']=PCA_papers['title'].tolist()
sent_topics_df['tsne_x']=tsne_lda[:,0]
sent_topics_df['tsne_y']=tsne_lda[:,1]
sent_topics_df['Topic_No']=topic_num
sent_topics_df=pd.merge(sent_topics_df,df_dominant_topic,on="Text")
sent_topics_df.head()

### Topics applied PCA (2002 - 2011)

In [None]:
data2['is_PCA']=data2['abstract'].apply(lambda x:1 if "PCA" in x else 0)
PCA_papers=data2[data2['is_PCA']==1]
PCA_papers=PCA_papers.reset_index(drop=True)

print("Number of Papers with PCA on Arxiv is ",PCA_papers.shape[0])

In [None]:
print("The First Paper applied PCA in ",min(PCA_papers['DateTime']))
print("The Recent Paper applied PCA in ",max(PCA_papers['DateTime']))

In [None]:
titles=PCA_papers['title'].tolist()
stop_words = set(stopwords.words('english')) 
titles=[title.lower() for title in titles]
titles=[removeStopWords(title,stop_words) for title in titles]

In [None]:
bigrams_list=[" ".join(generateNGram(title,2)) for title in titles]
topn=10
top_bigrams=getMostCommon(bigrams_list,topn=topn)
top_bigrams_df=pd.DataFrame()
top_bigrams_df['words']=[val[0] for val in top_bigrams]
top_bigrams_df['Frequency']=[val[1] for val in top_bigrams]
px.bar(data_frame=top_bigrams_df.sort_values("Frequency",ascending=True),x="Frequency",y="words",orientation="h",title="Top "+str(topn)+" Bigrams in Papers applied PCA")

In [None]:
trigrams_list=[" ".join(generateNGram(title.replace(":",""),3)) for title in titles]
topn=10
top_trigrams=getMostCommon(trigrams_list,topn=topn)
top_trigrams_df=pd.DataFrame()
top_trigrams_df['words']=[val[0] for val in top_trigrams]
top_trigrams_df['Frequency']=[val[1] for val in top_trigrams]
top_trigrams_df=top_trigrams_df[top_trigrams_df["words"]!=""]
px.bar(data_frame=top_trigrams_df.sort_values("Frequency",ascending=True),x="Frequency",y="words",orientation="h",title="Top "+str(topn)+" Trigrams in Papers applied PCA")

In [None]:
tokenised_sentences=tokenise(PCA_papers['title'].tolist())
tokenised_sentences[0]

In [None]:
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [None]:
sentences=[" ".join(tokenised_sentence) for tokenised_sentence in tokenised_sentences]
lemmatised_sentences=[lemmatise(sentence,stop_words) for sentence in sentences]
lemmatised_sentences[0]

In [None]:
bigram = gensim.models.Phrases(lemmatised_sentences,min_count=2) 
trigram = gensim.models.Phrases(bigram[lemmatised_sentences],min_count=2)  

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [None]:
bigrams_words=[bigram_mod[sentence] for sentence in lemmatised_sentences]

trigrams_words=[trigram_mod[sentence] for sentence in bigrams_words]

In [None]:
id2word = corpora.Dictionary(trigrams_words)
corpus = [id2word.doc2bow(text) for text in trigrams_words]
[(id2word[id], freq) for id, freq in corpus[0]] 

In [None]:
# Coherence score for each candidate number of topics (2, 4, ..., 18).
models,coherence=compute_coherence_values(id2word,corpus,trigrams_words,limit=20,start=2,step=2)
x = range(2, 20, 2)  # must mirror the start/limit/step used above
plt.plot(x, coherence)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# One-element tuple: a bare string would be iterated character by character and the
# legend would show only "c".
plt.legend(("coherence_values",), loc='best')
plt.show()

In [None]:
# Fit the LDA topic model on the PCA-paper title corpus.
# NOTE(review): num_topics is fixed at 4 here rather than derived from the
# coherence sweep -- confirm this is the intended choice.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    passes=10,            # full passes over the corpus during training
    chunksize=20,         # documents per training chunk
    update_every=1,       # online updates (0 would mean batch learning)
    alpha='auto',         # learn the document-topic prior from the data
    per_word_topics=True, # indexing the model also returns per-word topics
    random_state=100,     # fixed seed for reproducible topics
)

In [None]:
# Print the keyword/weight summary of each topic.
print(lda_model.print_topics())
# Topic assignment of every document in the corpus (the model applied to the
# bag-of-words corpus).
doc_lda = lda_model[corpus]

In [None]:
# gensim's log_perplexity returns the per-word likelihood *bound* (a negative
# log2 value), not the perplexity itself; perplexity is 2 ** (-bound).
# Both are printed so the "Perplexity" label is no longer attached to the bound.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('\nPerplexity: ', np.exp2(-per_word_bound))

# Compute Coherence Score (c_v) of the fitted model on the phrase-merged titles.
coherence_model_lda = CoherenceModel(model=lda_model, texts=trigrams_words, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Build the per-document topic table with the notebook's
# format_topics_sentences helper (defined elsewhere).
df_topic_sents_keywords = format_topics_sentences(
    PCA_papers['title'].tolist(),
    ldamodel=lda_model,
    corpus=corpus,
)

# Expose the row index as a column and give every column a readable name.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]

# Preview the first ten documents.
df_dominant_topic.head(10)

In [None]:
# Number of documents per dominant topic, plus each topic's share in percent.
# The columns are named explicitly via rename_axis / reset_index(name=...)
# instead of renaming 'index' and 'Dominant_Topic' after reset_index():
# that pattern depends on the pandas version (from pandas 2.0 value_counts
# names its result 'count' and keeps the column name on the index, which would
# label the topic ids as "Number of Documents").
topic_counts = (
    df_dominant_topic['Dominant_Topic']
    .value_counts()
    .rename_axis('Topic')
    .reset_index(name='Number of Documents')
)
topic_counts['percentage_contribution']=(topic_counts['Number of Documents']/topic_counts['Number of Documents'].sum())*100
topic_counts

In [None]:
# Print the top terms of every topic and keep the first five of each in
# topic_summaries.
# Take the topic count from the fitted model so this loop cannot go out of
# sync with the num_topics the model was trained with.
num_topics=lda_model.num_topics

topic_summaries = []
print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
for i in range(num_topics):
    print('Topic '+str(i)+' |---------------------\n')
    tmp = explore_topic(lda_model,topic_number=i, topn=10, output=True )
    topic_summaries.append(tmp[:5])
    # Blank separator line between topics. A bare `print` (Python 2 leftover)
    # is a no-op expression in Python 3, so it must be called.
    print()

In [None]:
# Get topic weights as one dense row per document.
# With per_word_topics=True each item of lda_model[corpus] is a tuple whose
# first element is the document's list of (topic_id, weight) pairs. That list
# is sparse: low-probability topics are omitted. Weights are therefore written
# into the column given by topic_id; placing them positionally would shift
# columns whenever a topic is missing and make argmax report the wrong topic.
n_topics = lda_model.num_topics
topic_weights = []
for row_list in lda_model[corpus]:
    dense_row = [0.0] * n_topics
    for topic_id, weight in row_list[0]:
        dense_row[topic_id] = weight
    topic_weights.append(dense_row)

# Array of topic weights, shape (number of documents, number of topics);
# omitted topics are 0.
arr = np.array(topic_weights)


# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction of the topic-weight vectors to 2-D.
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)

In [None]:
# One row per PCA paper: title, 2-D t-SNE coordinates and argmax topic number.
sent_topics_df = pd.DataFrame(
    {
        'Text': PCA_papers['title'].tolist(),
        'tsne_x': tsne_lda[:, 0],
        'tsne_y': tsne_lda[:, 1],
        'Topic_No': topic_num,
    }
)

# Attach the dominant-topic details by joining on the title text.
# NOTE(review): titles are not guaranteed unique; duplicated titles would
# multiply rows in this join -- consider joining on document position instead.
sent_topics_df = sent_topics_df.merge(df_dominant_topic, on="Text")
sent_topics_df.head()

### Topics applied CNN (2012 - 2021)

In [None]:
# Flag abstracts that mention "convolutional neural network".
# The abstract is lower-cased and its whitespace collapsed before matching, so
# capitalised mentions ("Convolutional Neural Network") and mentions broken
# across a line wrap inside the abstract text are no longer missed.
data3['is_CNN']=data3['abstract'].apply(
    lambda x:1 if "convolutional neural network" in " ".join(x.lower().split()) else 0
)

# Keep only the flagged papers, renumbered from 0.
CNN_papers=data3[data3['is_CNN']==1].reset_index(drop=True)

print("Number of Papers with CNN on Arxiv is ",CNN_papers.shape[0])

In [None]:
# Earliest and latest timestamps among the CNN papers.
# Series.min()/max() are used instead of the builtins: they skip missing
# timestamps (NaT) and return NaT for an empty frame, whereas builtin
# min()/max() raise ValueError on empty input and compare unreliably with NaT.
print("The First Paper applied CNN in ",CNN_papers['DateTime'].min())
print("The Recent Paper applied CNN in ",CNN_papers['DateTime'].max())

In [None]:
# NLTK English stop words, as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Lower-case each CNN paper title, then strip stop words with the notebook's
# removeStopWords helper (defined elsewhere).
titles = [
    removeStopWords(raw_title.lower(), stop_words)
    for raw_title in CNN_papers['title'].tolist()
]

In [None]:
# Number of bigrams to chart.
topn=10

# One space-joined string of bigrams per cleaned title (generateNGram and
# getMostCommon are notebook helpers defined elsewhere).
bigrams_list=[" ".join(generateNGram(title,2)) for title in titles]
top_bigrams=getMostCommon(bigrams_list,topn=topn)

# Item [0] of each result is used as the label, item [1] as its count.
top_bigrams_df=pd.DataFrame(
    {
        'words':[entry[0] for entry in top_bigrams],
        'Frequency':[entry[1] for entry in top_bigrams],
    }
)

# Horizontal bar chart of the rows sorted by ascending Frequency.
px.bar(
    data_frame=top_bigrams_df.sort_values("Frequency",ascending=True),
    x="Frequency",
    y="words",
    orientation="h",
    title="Top "+str(topn)+" Bigrams in Papers applied CNN",
)

In [None]:
# Number of trigrams to chart.
topn=10

# Colons are removed from each title before trigram generation
# (generateNGram and getMostCommon are notebook helpers defined elsewhere).
trigrams_list=[" ".join(generateNGram(title.replace(":",""),3)) for title in titles]
top_trigrams=getMostCommon(trigrams_list,topn=topn)

# Item [0] of each result is used as the label, item [1] as its count.
top_trigrams_df=pd.DataFrame(
    {
        'words':[entry[0] for entry in top_trigrams],
        'Frequency':[entry[1] for entry in top_trigrams],
    }
)

# Discard the empty-string label; this happens after the top-n cut, so the
# chart may show fewer than `topn` bars.
top_trigrams_df=top_trigrams_df.loc[top_trigrams_df["words"].ne("")]

# Horizontal bar chart of the rows sorted by ascending Frequency.
px.bar(
    data_frame=top_trigrams_df.sort_values("Frequency",ascending=True),
    x="Frequency",
    y="words",
    orientation="h",
    title="Top "+str(topn)+" Trigrams in Papers applied CNN",
)

In [None]:
# Tokenise every CNN paper title with the notebook's `tokenise` helper
# (defined elsewhere), then display the first result as a sanity check.
tokenised_sentences=tokenise(CNN_papers['title'].tolist())
tokenised_sentences[0]

In [None]:
# Rebind stop_words from the NLTK English set used for the n-gram charts to
# spaCy's English stop-word set, which the lemmatisation step uses.
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [None]:
# Re-join each token list into a single space-separated string, because
# `lemmatise` is called with one string per title.
sentences = list(map(" ".join, tokenised_sentences))

# Lemmatise every title using the notebook's `lemmatise` helper (defined
# elsewhere) together with the current stop-word collection.
lemmatised_sentences = [lemmatise(text, stop_words) for text in sentences]

# Show the first lemmatised title as a sanity check.
lemmatised_sentences[0]

In [None]:
# Learn bigram collocations from the lemmatised titles; a pair must occur at
# least twice to be considered.
bigram = gensim.models.Phrases(lemmatised_sentences, min_count=2)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Learn a second-level model on the bigram-merged stream so that merged
# bigrams can combine with a neighbouring token.
trigram = gensim.models.Phrases(bigram[lemmatised_sentences], min_count=2)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [None]:
# First pass: merge detected bigrams inside every lemmatised title.
bigrams_words = [bigram_mod[tokens] for tokens in lemmatised_sentences]

# Second pass: run the trigram model over the bigram-merged titles.
trigrams_words = [trigram_mod[tokens] for tokens in bigrams_words]

In [None]:
# Map every distinct token to an integer id.
id2word = corpora.Dictionary(trigrams_words)

# Bag-of-words representation: one list of (token_id, count) pairs per title.
corpus = [id2word.doc2bow(doc_tokens) for doc_tokens in trigrams_words]

# Human-readable view of the first document: (token, count) pairs.
[(id2word[token_id], count) for token_id, count in corpus[0]]

In [None]:
# Sweep the number of topics and plot the coherence score of each model.
# The sweep bounds are named once so the x-axis cannot drift away from the
# values actually passed to compute_coherence_values.
topics_start, topics_limit, topics_step = 2, 20, 2
models,coherence=compute_coherence_values(id2word,corpus,trigrams_words,limit=topics_limit,start=topics_start,step=topics_step)
x = range(topics_start, topics_limit, topics_step)
plt.plot(x, coherence)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The labels must be a sequence of strings: a bare ("coherence_values") is
# just a string, which matplotlib iterates character by character and
# therefore labels the line "c".
plt.legend(["coherence_values"], loc='best')
plt.show()

In [None]:
# Fit the LDA topic model on the CNN-paper title corpus.
# NOTE(review): num_topics is fixed at 4 here rather than derived from the
# coherence sweep -- confirm this is the intended choice.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    passes=10,            # full passes over the corpus during training
    chunksize=20,         # documents per training chunk
    update_every=1,       # online updates (0 would mean batch learning)
    alpha='auto',         # learn the document-topic prior from the data
    per_word_topics=True, # indexing the model also returns per-word topics
    random_state=100,     # fixed seed for reproducible topics
)

In [None]:
# Print the keyword/weight summary of each topic.
print(lda_model.print_topics())
# Topic assignment of every document in the corpus (the model applied to the
# bag-of-words corpus).
doc_lda = lda_model[corpus]

In [None]:
# gensim's log_perplexity returns the per-word likelihood *bound* (a negative
# log2 value), not the perplexity itself; perplexity is 2 ** (-bound).
# Both are printed so the "Perplexity" label is no longer attached to the bound.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('\nPerplexity: ', np.exp2(-per_word_bound))

# Compute Coherence Score (c_v) of the fitted model on the phrase-merged titles.
coherence_model_lda = CoherenceModel(model=lda_model, texts=trigrams_words, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Build the per-document topic table with the notebook's
# format_topics_sentences helper (defined elsewhere).
df_topic_sents_keywords = format_topics_sentences(
    CNN_papers['title'].tolist(),
    ldamodel=lda_model,
    corpus=corpus,
)

# Expose the row index as a column and give every column a readable name.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]

# Preview the first ten documents.
df_dominant_topic.head(10)

In [None]:
# Number of documents per dominant topic, plus each topic's share in percent.
# The columns are named explicitly via rename_axis / reset_index(name=...)
# instead of renaming 'index' and 'Dominant_Topic' after reset_index():
# that pattern depends on the pandas version (from pandas 2.0 value_counts
# names its result 'count' and keeps the column name on the index, which would
# label the topic ids as "Number of Documents").
topic_counts = (
    df_dominant_topic['Dominant_Topic']
    .value_counts()
    .rename_axis('Topic')
    .reset_index(name='Number of Documents')
)
topic_counts['percentage_contribution']=(topic_counts['Number of Documents']/topic_counts['Number of Documents'].sum())*100
topic_counts

In [None]:
# Print the top terms of every topic and keep the first five of each in
# topic_summaries.
# Take the topic count from the fitted model so this loop cannot go out of
# sync with the num_topics the model was trained with.
num_topics=lda_model.num_topics

topic_summaries = []
print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
for i in range(num_topics):
    print('Topic '+str(i)+' |---------------------\n')
    tmp = explore_topic(lda_model,topic_number=i, topn=10, output=True )
    topic_summaries.append(tmp[:5])
    # Blank separator line between topics. A bare `print` (Python 2 leftover)
    # is a no-op expression in Python 3, so it must be called.
    print()

In [None]:
# Get topic weights as one dense row per document.
# With per_word_topics=True each item of lda_model[corpus] is a tuple whose
# first element is the document's list of (topic_id, weight) pairs. That list
# is sparse: low-probability topics are omitted. Weights are therefore written
# into the column given by topic_id; placing them positionally would shift
# columns whenever a topic is missing and make argmax report the wrong topic.
n_topics = lda_model.num_topics
topic_weights = []
for row_list in lda_model[corpus]:
    dense_row = [0.0] * n_topics
    for topic_id, weight in row_list[0]:
        dense_row[topic_id] = weight
    topic_weights.append(dense_row)

# Array of topic weights, shape (number of documents, number of topics);
# omitted topics are 0.
arr = np.array(topic_weights)


# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction of the topic-weight vectors to 2-D.
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)

In [None]:
# One row per CNN paper: title, 2-D t-SNE coordinates and argmax topic number.
sent_topics_df = pd.DataFrame(
    {
        'Text': CNN_papers['title'].tolist(),
        'tsne_x': tsne_lda[:, 0],
        'tsne_y': tsne_lda[:, 1],
        'Topic_No': topic_num,
    }
)

# Attach the dominant-topic details by joining on the title text.
# NOTE(review): titles are not guaranteed unique; duplicated titles would
# multiply rows in this join -- consider joining on document position instead.
sent_topics_df = sent_topics_df.merge(df_dominant_topic, on="Text")
sent_topics_df.head()

#### The topic modelling above shows how the topics of papers using the top machine learning technique change from decade to decade: 1992–2001, 2002–2011, and 2012–2021. Topics around rays and galaxies appear in all three decades. The major differences between decades are due to the change in machine learning techniques and the improvement of the measurement tools applied in the research.