In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# Any results you write to the current directory are saved as output.

In [None]:
import numpy as np
import pandas as pd
from glob import glob
import json
import re
from collections import defaultdict

import tensorflow as tf
import tensorflow_hub as hub
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from wordcloud import WordCloud, STOPWORDS

import matplotlib.pyplot as plt
import seaborn as sb


In [None]:
# Load the CORD-19 metadata table and preview its first rows.
metadata_csv = '/kaggle/input/CORD-19-research-challenge/metadata.csv'
df_meta = pd.read_csv(metadata_csv)
df_meta.head()

In [None]:
# Load the JSON schema description, keeping one list entry per line of the file.
with open('/kaggle/input/CORD-19-research-challenge/json_schema.txt') as schema_file:
    json_schema = schema_file.readlines()


In [None]:
# Collect the paths of all study JSON files, one list per data subset.
# glob() returns its matches in arbitrary, filesystem-dependent order, so every
# list is sorted: "the first study" and the resulting row order are then the same
# on every run.
studies_biorxiv = sorted(glob('/kaggle/input/CORD-19-research-challenge/biorxiv_medrxiv/**/*.json', recursive=True))
studies_comm_use_subset = sorted(glob('/kaggle/input/CORD-19-research-challenge/comm_use_subset/**/*.json', recursive=True))
studies_custom_license = sorted(glob('/kaggle/input/CORD-19-research-challenge/custom_license/**/*.json', recursive=True))
studies_noncomm_use_subset = sorted(glob('/kaggle/input/CORD-19-research-challenge/noncomm_use_subset/**/*.json', recursive=True))


In [None]:
# Parse the first bioRxiv/medRxiv study and use it to seed the studies dataframe:
# the JSON's top-level keys form the index of the intermediate frame and become
# the columns once it is transposed.
with open(studies_biorxiv[0]) as study_file:
    first_study = json.load(study_file)

first_study_frame = pd.DataFrame.from_dict(first_study, orient='index')
df_studies = first_study_frame.transpose()
df_studies


In [None]:
# add the remaining studies to the dataframe
# Every file is parsed into its own small frame first and everything is
# concatenated in a single call: calling pd.concat inside the loop copies the
# growing dataframe again on each iteration, which is quadratic in the number
# of studies.
remaining_study_frames = [pd.read_json(study, orient='index').T for study in studies_biorxiv[1:]]
df_studies = pd.concat([df_studies, *remaining_study_frames], ignore_index=True, sort=False)
    

In [None]:
# row count of the studies dataframe after loading
len(df_studies)

In [None]:
# remove the unwanted columns
unwanted_columns = ['back_matter', 'bib_entries', 'ref_entries']
df_studies = df_studies.drop(columns=unwanted_columns)


In [None]:
# preview the first rows of the studies dataframe
df_studies.head()

In [None]:
# create new columns

# abstract paragraphs joined with ','; NaN when the study has no abstract
df_studies['abstract_text'] = df_studies['abstract'].apply(lambda x: ','.join([i['text'] for i in x]) if x != [] else np.nan)

# The previous guard `(x != {} or x['title'] != '')` was always true for non-empty
# metadata and raised KeyError for empty metadata. A missing metadata dict or a
# missing 'title' key now yields NaN. Empty titles are left as '' on purpose:
# the missing-title check further down compares against '' and the titles are
# later passed to the sentence encoder.
df_studies['title'] = df_studies['metadata'].apply(lambda x: x['title'] if isinstance(x, dict) and 'title' in x else np.nan)

# The previous guard compared the metadata dict with [] and therefore never fired.
# Missing authors now fall back to [], which the next step turns into NaN.
df_studies['authors'] = df_studies['metadata'].apply(lambda x: x['authors'] if isinstance(x, dict) and 'authors' in x else [])

# One string per author dict: string values are kept as they are, a non-empty list
# contributes its first element and every other value contributes ''; the pieces
# are joined with spaces and stripped. NaN when the study lists no authors.
df_studies['authors_list'] = df_studies['authors'].apply(lambda x: [' '.join([value if type(value) == str else 
                                                                    (value[0] if (len(value) > 0 and type(value) == list) 
                                                                    else (value+'; ' if key == 'last' else ''))
                                                                    for key, value in i.items()]).strip() for i in x]
                                                                    if x != [] else np.nan)

# Per body paragraph the 'text' value is kept and every other key contributes an
# empty string, joined with newlines; the paragraphs are then joined with spaces.
# NaN when the study has no body text.
df_studies['full_text'] = df_studies['body_text'].apply(lambda x: ' '.join(['\n'.join([value if key == 'text' else ''
                                                                              for key, value in i.items()]) for i in x])
                                                                              if x != [] else np.nan)


In [None]:
# drop the columns that are no longer needed
irrelevant_columns = ['authors', 'body_text', 'metadata', 'abstract']
df_studies = df_studies.drop(columns=irrelevant_columns)


In [None]:
# Build a two-column lookup (paper_id, journal) from the metadata; the metadata's
# `sha` column is renamed so that it lines up with the studies' `paper_id` key.
df_meta_journal = df_meta[['sha', 'journal']].rename(columns={'sha': 'paper_id'})
# Keep only the studies that have a matching metadata row and attach their journal.
df_data = df_studies.merge(df_meta_journal, on='paper_id', how='inner')

In [None]:
# show the full text of the row labelled 0
df_data.loc[0, 'full_text']

In [None]:
# overview of the new dataframe (studies merged with their journal)
df_data.head()


## Preprocessing
### Missing values

In [None]:
# count the missing values (NaN/None) in every column
df_data.isna().sum()


In [None]:
# count the rows whose title is an empty string (shown as a (rows, columns) shape)
has_empty_title = df_data['title'].eq('')
df_data.loc[has_empty_title].shape


In [None]:
# print every paper_id that is shorter than 5 characters
for paper_id in df_data['paper_id']:
    if len(paper_id) >= 5:
        continue
    print(paper_id)


In [None]:
# see how many full texts are missing
# A study without body text gets NaN (a float) as its full_text when the column
# is built, and len() of a float raises TypeError. Non-string values are
# therefore reported as missing instead of being measured.
for item in df_data['full_text']:
    if not isinstance(item, str) or len(item) < 5:
        print(item)
  

In [None]:
# replace all missing values in abstract_text with empty string
# The result is assigned back to the column: fillna(inplace=True) on a selected
# column is chained assignment, which warns in recent pandas versions and leaves
# df_data unchanged once Copy-on-Write is enabled.
df_data['abstract_text'] = df_data['abstract_text'].fillna('')



The full text is the part that matters most, so although it is good to know which other variables have missing data, there is no need to take any action on them for now.

### Duplicate values

In [None]:
# list the rows whose full_text repeats the full_text of an earlier row
repeated_full_text = df_data['full_text'].duplicated()
df_data[repeated_full_text]


In [None]:
# report the current size of the dataframe
print(df_data.shape)

# report the rows that repeat an earlier paper_id, then those that repeat an earlier full_text
# NOTE(review): no cell between the duplicate check above and this one removes
# duplicates, so the same rows are reported again - confirm whether a
# drop_duplicates step is missing.
for column in ('paper_id', 'full_text'):
    print(df_data[df_data[column].duplicated()])


### Finding similar titles with TensorFlow Universal Sentence Encoder


In [None]:
# collect every title of the merged dataframe in a plain Python list
title_biorxiv = list(df_data['title'])
title_biorxiv

In [None]:
# load the universal sentence encoder
# (the trailing `#@param` annotation lists the two module URLs to choose from)
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
# `model` is called directly on the input texts by embed() below
model = hub.load(module_url)


In [None]:
# get embeddings for a list of texts
def embed(input):
    '''
    Function to run the universal sentence encoder held in `model` on the input
    Args: input = list of words/sentences/paragraphs
    Returns: embeddings = result of model(input); dense matrix with embeddings
    '''
    embeddings = model(input)
    return embeddings

# based on this kernel https://www.kaggle.com/mobassir/mining-covid-19-scientific-papers
def create_similar_titles_dict(title, all_titles, similarity_matrix, topk=5):
    '''
    Function to create a dict that contains the topk most similar titles
    for one title, ordered from most to least similar
    Args: title = str; current title to find similarities to
          all_titles = list; contains all titles
          similarity_matrix = dense matrix; row i holds the similarity scores
                              between all_titles[i] and every title
          topk = int; default value 5; number of similar titles to return
                 (capped at the number of available scores; values <= 0 give an empty list)
    Returns: similar_titles_dict = dict with the keys 'title' and 'similar_titles'
    Raises: ValueError if title is not contained in all_titles
    Note: the title itself is part of the candidates, so it normally comes first
          because it is most similar to itself
    '''
    # row of similarity scores that belongs to the requested title
    index = all_titles.index(title)
    scores = np.asarray(similarity_matrix[index])

    # never ask for more entries than there are scores; argpartition would raise otherwise
    k = min(topk, len(scores))
    if k <= 0:
        ranked_indices = []
    else:
        # argpartition finds the k best scores but leaves them in arbitrary order,
        # so they are sorted explicitly from highest to lowest score
        top_indices = np.argpartition(scores, -k)[-k:]
        descending = np.argsort(-scores[top_indices], kind='stable')
        ranked_indices = top_indices[descending]

    similar_titles_dict = {
        'title': title,
        'similar_titles': [all_titles[x] for x in ranked_indices],
    }

    return similar_titles_dict
    

In [None]:
# embed titles
title_embeddings = embed(title_biorxiv)

# calculate the pairwise cosine similarity between all title embeddings
similarity_matrix = cosine_similarity(title_embeddings)

# Collect one {'title', 'similar_titles'} record per title and build the dataframe
# in a single step. DataFrame.append was removed in pandas 2.0, and appending row
# by row copies the growing dataframe on every iteration.
similar_title_records = [
    create_similar_titles_dict(title, title_biorxiv, similarity_matrix, topk=20)
    for title in title_biorxiv
]
all_similar_titles_df = pd.DataFrame(similar_title_records, columns=['title', 'similar_titles'])

# show the dataframe containing similar titles for all titles
all_similar_titles_df
    

In [None]:
# print the list of similar titles stored for the row labelled 1
print(all_similar_titles_df.loc[1, 'similar_titles'])

### Cleaning and normalizing full text

In [None]:
def clean_text(text):
    '''
    Function to normalize a text and split it into lemmatized tokens
    Args: text = str
    Returns: list of str; the lemmatized tokens of the text
    '''
    # lower-case the text, replace every character that is not a letter or digit
    # with a space, then split on whitespace to obtain the individual tokens
    tokens = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower()).split()
    # reduce every token to its lemma
    lemmatizer = WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, tokens))


### Topic Modelling with LDA

In [None]:
# extend the English stop words with document specific words so that these won't pop up in the topics
doc_spec_words = ['biorxiv', 'medrxiv', 'et', 'al', 'fig', 'figure']
stop_words = set(stopwords.words('english')) | set(doc_spec_words)
lemmatizer = WordNetLemmatizer()
# lemmatize the stop words too, since the tokenizer lemmatizes the text
# as seen on https://stackoverflow.com/questions/50155188/lemmatization-on-countvectorizer-doesnt-remove-stopwords
lemm_stop_words = list(map(lemmatizer.lemmatize, stop_words))
print(lemm_stop_words)


In [None]:
# Create a matrix of token counts from the full text column
# as seen on https://stackabuse.com/python-for-nlp-topic-modeling/
count_vect = CountVectorizer(
    tokenizer=clean_text,
    max_df=0.8,
    min_df=4,
    stop_words=lemm_stop_words,
)
# cast the column to unicode strings before vectorizing (non-string entries are stringified)
full_text_corpus = df_data['full_text'].values.astype('U')
token_matrix = count_vect.fit_transform(full_text_corpus)
token_matrix


In [None]:
# instantiate and fit the LDA model
# n_components=20 requests 20 topics; random_state=42 fixes the seed
LDA = LatentDirichletAllocation(n_components=20, random_state=42)
# fit on the token count matrix built from the full texts
LDA.fit(token_matrix)


In [None]:
# print the topk words for each topic
# as seen on https://stackabuse.com/python-for-nlp-topic-modeling/
topk = 20

# Fetch the vocabulary once instead of once per printed word.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(), so use whichever the installed version provides.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()

for i, topic in enumerate(LDA.components_):
    # the heading follows topk instead of repeating a hard-coded 20
    print(f'Top {topk} words for topic #{i}:')
    # argsort is ascending, so the last topk positions hold the highest weighted words
    print([str(feature_names[word_index]) for word_index in topic.argsort()[-topk:]])
    print('\n')


### Topic Modelling with Non-Negative Matrix Factorization