In [41]:
import pandas as pd
# data manipulation, gathering
import numpy as np # for array manipulation# for dataframe manipulation/reading in data
import json # for reading in Data
from itertools import islice # for slicing and dicing JSON records
import os # for getting the filepath information
import re # to identify characters that are to be removed
import nltk # for preprocessing of textual data
from nltk.corpus import stopwords # for removing stopwords
from nltk.tokenize import word_tokenize # for tokenizing text
from nltk.stem import WordNetLemmatizer # for lemmatizing text
from sklearn.feature_extraction.text import TfidfVectorizer # for featurizing text
from sklearn.metrics.pairwise import cosine_similarity # for getting similarity score
from sklearn.decomposition import PCA #for dimensionality reduction
from sklearn.cluster import KMeans #for clustering
from sklearn.manifold import TSNE #For reducing to 2 dimensions for plotting


In [42]:
# Load initial_df.csv with the `id` column read as strings.
# Later cells read the `id`, `title` and `abstract` columns from this frame.
df = pd.read_csv('initial_df.csv',dtype={'id': "str"})

In [43]:
# Load final_df.csv; later cells read its `id` and `final_text` columns.
# NOTE(review): no dtype is given for `id` here (the initial_df.csv load forces
# it to str), so pandas infers the type. The get_recommendations call further
# down passes a float id, which presumably relies on that inference -- confirm
# before changing either side.
final_df = pd.read_csv('final_df.csv')

In [44]:
# Look up one paper by id in the metadata frame and keep its title.
idf = df.reset_index()

dataorg = idf[idf['id']=="0704.0002"]
# DataFrame.append was removed in pandas 2.0. Appending the filtered rows to an
# empty frame with ignore_index=True is the same as renumbering them from 0.
hula = dataorg.reset_index(drop=True)
# Label 0 is the first matching row; raises KeyError if the id was not found.
orgtitle = hula['title'][0]

In [67]:
# TF-IDF vectorizer with default settings (no max_features cap).
tfidf_vectorizer = TfidfVectorizer()

# Generate the tf-idf vectors for the data
# One row per row of final_df; this matrix is what get_recommendations is
# called with further down.
tfidf_matrix = tfidf_vectorizer.fit_transform(final_df['final_text'])

In [68]:
def get_recommendations(paper_id:str,tfidf_matrix,num_rec):
    """Return the ids of the papers most similar to ``paper_id``.

    Similarity is the cosine similarity between the query paper's row of
    ``tfidf_matrix`` and every row of the matrix.

    Parameters
    ----------
    paper_id
        Value to look up in ``final_df['id']`` with ``==``; it must therefore
        match the dtype that column was loaded with.
    tfidf_matrix
        Matrix with one row per row of the global ``final_df``, in the same order.
    num_rec
        Maximum number of ids to return.

    Returns
    -------
    list
        Up to ``num_rec`` ids from ``final_df['id']``, most similar first,
        never including the query paper itself.

    Raises
    ------
    IndexError
        If ``paper_id`` does not occur in ``final_df['id']``.
    """
    # Work with row positions throughout so the result does not depend on
    # final_df carrying a default 0..n-1 index.
    matches = np.flatnonzero((final_df['id'] == paper_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"paper id {paper_id!r} not found in final_df['id']")
    idx = matches[0]

    sim = cosine_similarity(tfidf_matrix, tfidf_matrix[idx])
    sim = sim.reshape(sim.shape[0])

    ranked = np.argsort(-sim)
    # Drop the query row explicitly instead of assuming it sorts first: with
    # duplicate documents or an all-zero query vector it may not be at rank 0.
    ranked = ranked[ranked != idx][:num_rec]
    return [final_df['id'].iloc[pos] for pos in ranked]

In [69]:
# Second TF-IDF vectorizer, capped at max_features=10000.
tfidf_vectorizer2 = TfidfVectorizer(max_features=10000)

# Generate the tf-idf vectors for the data
# Fitted on the same final_df['final_text'] column as the uncapped matrix; rows
# of this matrix are what gets sliced into rec_matrix later.
tfidf_matrix2 = tfidf_vectorizer2.fit_transform(final_df['final_text'])

In [70]:
# Ids of the 1000 papers ranked most similar to the query paper (full TF-IDF matrix).
# NOTE(review): the id is passed as a float although get_recommendations
# annotates paper_id as str -- this only matches if final_df['id'] was loaded
# as a numeric column; confirm.
rec = get_recommendations(704.0001, tfidf_matrix, 1000)

# Index labels of the final_df rows whose id is among the recommendations.
# They come out in final_df order, not in similarity-rank order.
idxs = final_df.index[final_df['id'].isin(rec)].tolist()

# Matching rows of the max_features-capped TF-IDF matrix.
rec_matrix = tfidf_matrix2[idxs]

In [71]:
# A float n_components asks PCA for as many components as are needed to reach
# that fraction of explained variance.
pca = PCA(n_components=0.95, random_state=42) #Keep 95% of the variance
# PCA is given a dense array, so the sparse TF-IDF slice is densified first.
reduced_matrix = pca.fit_transform(rec_matrix.toarray())

In [72]:
k = 10 # selectable
# n_init is pinned to 10, the value this call used implicitly on older
# scikit-learn. Newer releases changed the default to 'auto', which would give
# different clusters for the same random_state.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
# One cluster label per row of reduced_matrix.
y_pred = kmeans.fit_predict(reduced_matrix)

In [73]:
# init and learning_rate are pinned to the values this call used implicitly
# ('random' and 200.0). scikit-learn 1.2 changes both defaults, so pinning them
# keeps the embedding the same across versions and silences the FutureWarnings.
tsne = TSNE(perplexity=100, init='random', learning_rate=200.0, random_state=42)
# 2-D coordinates used only for plotting; one row per row of reduced_matrix.
two_dim_matrix = tsne.fit_transform(reduced_matrix)



The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [74]:
import seaborn as sns
import matplotlib.pyplot as plt



In [75]:
import plotly.express as px


Topic Modelling

In [76]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [77]:
# One independent CountVectorizer per cluster, so each cluster gets its own
# vocabulary when it is fitted later.
# The token pattern is a raw string: "\-" is not a valid escape sequence in an
# ordinary string literal and triggers a warning on recent Python versions.
# The regular expression itself is unchanged.
vectorizers = [
    CountVectorizer(
        min_df=5,
        max_df=0.9,
        stop_words='english',
        lowercase=True,
        token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}',
    )
    for _ in range(k)
]

In [78]:
# Rows of the metadata frame selected by a boolean mask built from final_df.
# This relies on df and final_df sharing the same index.
rec_rows = df[final_df['id'].isin(rec)]

# id, title and "title abstract" text for every recommended paper.
topic_df = pd.DataFrame({
    'id': rec_rows['id'],
    'title': rec_rows['title'],
    'text': rec_rows['title'] + " " + rec_rows['abstract'],
})
# Cluster labels are assigned by position, so y_pred must have one entry per
# selected row, in the same order.
topic_df['cluster'] = y_pred

In [79]:
# Term-count matrix per cluster, or None for clusters that cannot be vectorized.
# The list stays aligned with cluster numbers: entry i belongs to cluster i.
vectorized_data = []

for current_cluster, cvec in enumerate(vectorizers):
    cluster_text = topic_df.loc[topic_df['cluster'] == current_cluster, 'text']
    try:
        vectorized_data.append(cvec.fit_transform(cluster_text))
    except ValueError:
        # CountVectorizer raises ValueError when the min_df/max_df pruning
        # leaves no terms (too few documents in the cluster). Anything else is
        # a real error and is allowed to propagate instead of being reported
        # as a small cluster.
        print("Not enough instances in cluster: " + str(current_cluster))
        vectorized_data.append(None)

In [80]:
NUM_TOPICS_PER_CLUSTER = 5 #choose

# One unfitted Latent Dirichlet Allocation model per cluster, all configured
# identically; they are fitted in a later cell.
lda_models = [
    LatentDirichletAllocation(
        n_components=NUM_TOPICS_PER_CLUSTER,
        max_iter=10,
        learning_method='online',
        verbose=False,
        random_state=42,
    )
    for _ in range(k)
]

In [81]:
# Fit each cluster's LDA model on that cluster's term counts.
# Only clusters that were vectorized get an entry, so when a cluster was
# skipped the positions in this list no longer equal the cluster numbers.
clusters_lda_data = []

for current_cluster, lda in enumerate(lda_models):
    # Identity check: `!= None` on a sparse matrix goes through the matrix's
    # own comparison operator rather than testing for the None placeholder.
    if vectorized_data[current_cluster] is not None:
        clusters_lda_data.append(lda.fit_transform(vectorized_data[current_cluster]))

In [82]:
def selected_topics(model, vectorizer, top_n=3):
    """Summarise a fitted topic model as one space-separated keyword string.

    For every topic (row of ``model.components_``) the ``top_n`` highest-weighted
    terms are taken. A term that appears in several topics is kept only for the
    first topic that lists it, with that topic's weight. The kept terms are then
    ordered by weight, highest first.

    Parameters
    ----------
    model
        Object exposing ``components_``, iterable as one weight array per topic.
    vectorizer
        Object exposing ``get_feature_names_out()``, indexable by term position.
    top_n
        Number of terms taken from each topic.

    Returns
    -------
    str
        The selected terms joined by single spaces.
    """
    # Fetch the vocabulary once instead of rebuilding it for every selected term.
    feature_names = vectorizer.get_feature_names_out()
    seen_words = set()
    keywords = []

    for topic in model.components_:
        # argsort is ascending; this slice walks the last top_n positions backwards.
        for i in topic.argsort()[:-top_n - 1:-1]:
            word = feature_names[i]
            if word not in seen_words:
                seen_words.add(word)
                keywords.append((word, topic[i]))

    # Stable ascending sort followed by a reverse, kept as two steps so that
    # equal-weight terms come out in the same order as before.
    keywords.sort(key=lambda pair: pair[1])
    keywords.reverse()
    return " ".join(word for word, _ in keywords)


In [83]:
# Keyword summary per cluster; entry i always belongs to cluster i.
all_keywords = []
for current_vectorizer, lda in enumerate(lda_models):
    if vectorized_data[current_vectorizer] is not None:
        all_keywords.append(selected_topics(lda, vectorizers[current_vectorizer]))
    else:
        # Clusters that could not be vectorized have no fitted topic model.
        # A placeholder keeps the list at one entry per cluster; without it the
        # per-cluster lookup built from this list shifts to the wrong clusters
        # or runs past the end.
        all_keywords.append("cluster " + str(current_vectorizer) + " (no keywords)")

In [84]:
# Cluster number -> keyword string, for every cluster 0..k-1.
cluster_keyword = {cluster: all_keywords[cluster] for cluster in range(k)}
# Translate each paper's cluster label into that cluster's keyword string.
word_pred = [cluster_keyword.get(label) for label in y_pred]
topic_df['keywords'] = word_pred

In [85]:
# Interactive scatter of the 2-D embedding, one point per paper, coloured by
# the keyword string of its cluster; hovering shows the paper id and title.
fig = px.scatter(
    topic_df,
    x=two_dim_matrix[:, 0],
    y=two_dim_matrix[:, 1],
    color='keywords',
    hover_data=['id', 'title'],
    height=700,
    width=1200,
    title="Clustered Papers",
)

fig.update_layout(
    coloraxis_colorbar={
        'title_font': {'size': 20},  # font size of the colour-bar title
        'tickfont': {'size': 16},    # font size of the colour-bar tick labels
    },
    legend={
        'font': {'size': 11},        # font size of the marker legend
        # Horizontal legend anchored below the plot area, right-aligned.
        'orientation': "h",
        'yanchor': "bottom",
        'y': -0.3,
        'xanchor': "right",
        'x': 1,
    },
)
fig.show()

In [87]:
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Category10
from bokeh.transform import factor_cmap

# Distinct keyword labels, in order of first appearance; one colour each.
keyword_factors = list(topic_df['keywords'].unique())

# Define colors
# Category10 is only defined for a limited range of sizes, so indexing it by
# the number of labels fails for very few or more than ten labels. Take the
# 10-colour palette and cycle through it instead.
base_palette = Category10[10]
colors = tuple(base_palette[i % len(base_palette)] for i in range(len(keyword_factors)))

# Create Bokeh figure
p = figure(title='Clustered Papers', width=1600, height=1200)

# Create ColumnDataSource
source = ColumnDataSource(data={
    'x': two_dim_matrix[:,0],
    'y': two_dim_matrix[:,1],
    'id': topic_df['id'],
    'title': topic_df['title'],
    'keywords': topic_df['keywords'],
})

# Add scatter plot
# A bare color='keywords' makes Bokeh read the keyword strings themselves as
# colour values. factor_cmap maps each keyword label to a palette colour.
p.scatter(
    'x', 'y', source=source,
    color=factor_cmap('keywords', palette=colors, factors=keyword_factors),
    legend_group='keywords', size=10, fill_alpha=0.8, line_alpha=0.8,
)

# Add hover tool
hover = HoverTool(tooltips=[('ID', '@id'), ('Title', '@title')])
p.add_tools(hover)

# Show the plot
show(p)
