In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
import numpy as np
import pandas as pd
from glob import glob
import json
import re

import tensorflow as tf
import tensorflow_hub as hub

from wordcloud import WordCloud, STOPWORDS
from nltk.stem import WordNetLemmatizer

import matplotlib.pyplot as plt
import seaborn as sb


In [None]:
# Load the metadata table and preview its first rows.
metadata_path = '/kaggle/input/CORD-19-research-challenge/metadata.csv'
df_meta = pd.read_csv(metadata_path)
df_meta.head()

In [None]:
# Read the JSON schema description, keeping one list entry per line of the file.
schema_path = '/kaggle/input/CORD-19-research-challenge/json_schema.txt'
with open(schema_path) as schema_file:
    json_schema = schema_file.readlines()


In [None]:
# Collect the paths of all study JSON files in the biorxiv_medrxiv folder.
# glob() returns matches in an order that depends on the file system, so the
# paths are sorted to make `studies_biorxiv[0]` (and every dataframe built from
# this list) reproducible between runs.
# `recursive=True` was dropped: it only affects patterns that contain `**`.
studies_biorxiv = sorted(glob('/kaggle/input/CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv/*.json'))
len(studies_biorxiv)


In [None]:
# Read the first study and use it to seed the dataframe of studies.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (JSON interchange files are UTF-8).
with open(studies_biorxiv[0], encoding='utf-8') as study_file:
    first_study = json.load(study_file)

# orient='index' puts the JSON's top-level keys on the index; transposing
# turns them into columns, giving a single row for this study.
df_studies = pd.DataFrame.from_dict(first_study, orient='index').T
df_studies


In [None]:
# Load every remaining study (the first one already seeded df_studies) into
# a list of single-row dataframes.
studies_list = [
    pd.read_json(study_path, orient='index').T
    for study_path in studies_biorxiv[1:]
]

# Concatenate once instead of inside the loop: repeated pd.concat re-copies the
# growing dataframe on every iteration, which is quadratic in the number of studies.
df_studies = pd.concat([df_studies, *studies_list], ignore_index=True, sort=False)
    

In [None]:
# Discard the columns that are not used in the rest of the analysis.
unwanted_columns = ['back_matter', 'bib_entries', 'ref_entries']
df_studies = df_studies.drop(columns=unwanted_columns)


In [None]:
# Preview the first five studies.
df_studies.head(n=5)

In [None]:
# create new columns
#
# Each helper below turns one raw JSON field into a flat value. Anything that
# is missing, empty, or not of the expected container type becomes NaN instead
# of raising, so a single malformed study cannot abort the whole cell.

def _join_abstract(paragraphs):
    """Join the 'text' of every abstract paragraph with commas; NaN when there is none."""
    if not isinstance(paragraphs, list) or not paragraphs:
        return np.nan
    return ','.join(paragraph['text'] for paragraph in paragraphs)


def _metadata_field(metadata, field):
    """Return metadata[field]; NaN when metadata is not a dict or lacks the field.

    Empty-string values are returned unchanged: a later cell counts missing
    titles with `== ''`, so they must not be converted to NaN here.
    """
    if not isinstance(metadata, dict):
        return np.nan
    return metadata.get(field, np.nan)


def _format_author(author):
    """Build one display string from a single author record (a dict).

    String values are used as-is, non-empty lists contribute their first
    element, and every other value (empty list, dict, None, ...) contributes
    an empty string. The pieces are joined with single spaces in the record's
    key order and the result is stripped.
    """
    parts = []
    for value in author.values():
        if isinstance(value, str):
            parts.append(value)
        elif isinstance(value, list) and value:
            parts.append(value[0])
        else:
            parts.append('')
    return ' '.join(parts).strip()


def _format_authors(authors):
    """Format every author record of a study; NaN when there are no authors."""
    if not isinstance(authors, list) or not authors:
        return np.nan
    return [_format_author(author) for author in authors]


def _join_body(paragraphs):
    """Concatenate the body paragraphs of a study into one string; NaN when empty.

    For each paragraph dict, the 'text' value is kept and every other key
    contributes an empty string; these pieces are joined with newlines, so the
    text is followed by one newline per key that comes after 'text'. Paragraphs
    are then joined with a single space. This layout is kept on purpose: the
    later regex clean-up uses '.', which does not cross newlines.
    """
    if not isinstance(paragraphs, list) or not paragraphs:
        return np.nan
    return ' '.join(
        '\n'.join(value if key == 'text' else '' for key, value in paragraph.items())
        for paragraph in paragraphs
    )


df_studies['abstract_text'] = df_studies['abstract'].apply(_join_abstract)
df_studies['title'] = df_studies['metadata'].apply(_metadata_field, field='title')
df_studies['authors'] = df_studies['metadata'].apply(_metadata_field, field='authors')
df_studies['authors_list'] = df_studies['authors'].apply(_format_authors)
df_studies['full_text'] = df_studies['body_text'].apply(_join_body)


In [None]:
# The raw nested fields have been flattened into new columns, so drop them.
raw_columns = ['authors', 'body_text', 'metadata', 'abstract']
df_studies = df_studies.drop(columns=raw_columns)


In [None]:
# Take the sha and journal columns from the metadata; `sha` is renamed to
# `paper_id` so it can serve as the join key against the studies.
df_meta_journal = df_meta[['sha', 'journal']].rename(columns={'sha': 'paper_id'})
# Inner join: only studies whose paper_id appears in the metadata are kept.
# NOTE(review): assumes each `sha` cell holds exactly one id — TODO confirm.
df_data = df_studies.merge(df_meta_journal, how='inner', on='paper_id')

In [None]:
# Show the full text of the row labelled 0.
df_data.loc[0, 'full_text']

In [None]:
# Preview the first five rows of the merged dataframe.
df_data.head(n=5)


## Preprocessing
### Missing values

In [None]:
# Count the NaN entries in each column.
df_data.isna().sum()


In [None]:
# Shape of the subset whose title is an empty string (rows = missing titles).
empty_title_mask = df_data['title'].eq('')
df_data.loc[empty_title_mask].shape


In [None]:
# Print every paper_id that looks missing, i.e. is shorter than 5 characters.
suspicious_ids = (paper_id for paper_id in df_data['paper_id'] if len(paper_id) < 5)
for paper_id in suspicious_ids:
    print(paper_id)


In [None]:
# Print every full_text that looks missing: either not a string at all
# (e.g. NaN, which the full_text extraction produces for an empty body) or a
# string shorter than 5 characters. The type check comes first because
# len() raises TypeError on NaN.
for item in df_data['full_text']:
    if not isinstance(item, str) or len(item) < 5:
        print(item)
  

In [None]:
# Replace all missing values in abstract_text with an empty string.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `df_data['abstract_text']`: that in-place call acts
# on an intermediate Series, which is deprecated in pandas 2.x and leaves
# df_data unchanged when Copy-on-Write is enabled.
df_data['abstract_text'] = df_data['abstract_text'].fillna('')



The full text is the important part, so it is good to know which other variables have missing data, but there is no need to take any action for now.

### Duplicate values

In [None]:
# Show the rows whose full_text repeats an earlier row's full_text.
duplicate_text_mask = df_data['full_text'].duplicated()
df_data[duplicate_text_mask]


In [None]:
# Report the current number of rows and columns.
print(df_data.shape)

# Print the rows that duplicate an earlier paper_id, then those that
# duplicate an earlier full_text.
for column in ('paper_id', 'full_text'):
    print(df_data[df_data[column].duplicated()])


### Cleaning and normalizing full text

In [None]:
# Remove phrases that are not related to the actual content of the paper.
#
# Differences from plain string patterns, all deliberate:
# * Literal dots are escaped so '.' in a fixed phrase only matches a dot.
# * The wildcards are non-greedy ('.+?'): a greedy '.+' runs to the LAST
#   closing phrase on the line and would delete any real text between two
#   occurrences of the boilerplate.
# '.' does not match newlines, so every match stays within one line of the text.
_BOILERPLATE_PATTERNS = [
    re.compile(r'The copyright.+?preprint'),
    re.compile(r'All rights reserved\. No reuse allowed without permission\.'),
    re.compile(r'The copyright.+? author/funder'),
    re.compile(r'This article is a US Government work.+?a CC0 license'),
]


def _strip_boilerplate(text):
    """Return `text` with every boilerplate pattern removed, applied in list order."""
    for pattern in _BOILERPLATE_PATTERNS:
        text = pattern.sub('', text)
    return text


df_data['cleaned_text'] = df_data['full_text'].apply(_strip_boilerplate)


In [None]:
# Shared lemmatizer instance used for every token.
lemmatizer = WordNetLemmatizer()


def _normalize(text):
    """Turn one cleaned text into a list of lemmatized lower-case tokens.

    Steps, in order: lower-case the text, replace every character that is not
    a letter or digit with a space, split on whitespace (which also discards
    excess spaces and tabs), and lemmatize each resulting word.
    """
    lowered = text.lower()
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9]', ' ', lowered)
    return [lemmatizer.lemmatize(token) for token in alphanumeric_only.split()]


df_data['cleaned_text'] = df_data['cleaned_text'].apply(_normalize)


In [None]:
# Show the cleaned text of the row labelled 0.
df_data.loc[0, 'cleaned_text']

### Doing the whole thing with TensorFlow


In [None]:
# Convert the titles into a plain list of strings.
# Missing titles (NaN) are replaced with '' and every value is cast to str so
# the list holds only strings.
# NOTE(review): assumes the encoder called on this list needs string inputs — TODO confirm.
title_biorxiv = df_data['title'].fillna('').astype(str).tolist()

# Preview only the first few titles instead of dumping the whole list as output.
title_biorxiv[:5]

In [None]:
# load the universal sentence encoder
# The trailing `#@param [...]` comment lists two alternative module URLs; it
# looks like a Colab form annotation, so the line is kept in exactly this form.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
# Load the module at module_url; `model` is later called directly on a list of texts.
model = hub.load(module_url)


In [None]:
# get embeddings for a list of input texts
def embed(input):
    """Return whatever the loaded `model` produces for `input`.

    `input` is handed to `model` unchanged; this is a thin wrapper that gives
    the embedding call a descriptive name.
    """
    embeddings = model(input)
    return embeddings

# Embed the titles collected in title_biorxiv and display the result.
title_embeddings = embed(title_biorxiv)
title_embeddings
