# **USEFUL LIBRARIES**

In [35]:
import pandas as pd
import numpy as np
import  urllib.request
import urllib.parse
from bs4 import BeautifulSoup


import tinydb
from langdetect import detect
from pymed import PubMed

import requests
import re
import random
import glob
import os
import time

np.random.seed(42)

# **UTILITY METHODS**

* get_publication_ID
* create_list_of_keywords
* text2BoW

In [24]:
def extract_publication_info(publication):
    info = publication.toDict() #each element of the papers list is a dictionary class
    ID = info["pubmed_id"]
    if len(ID)>10:
        ID = ID.split('\n')
        ID = ID[0]
    title = info['title']
    if 'authors' in info:
      authors = info['authors']
    else:
      authors = []
    if 'keywords' in info:
      keywords = info['keywords']
    else:
      keywords = []
    if 'journal' in info:
      journal = info['journal']
    else:
      journal = []
    if 'doi' in info:
      doi = info['doi']
    else:
      doi = []

    return ID, title, authors, keywords, journal, doi


def create_list_of_keywords(filename, keyword_or_evidence = 'e'):
  fp = open(filename, 'r')
  lines = fp.readlines()
  kw_list = []

  for line in lines:
    if keyword_or_evidence == 'e':
      x = line.split('|')
      x = x[1]
      x = x.split('\n')
    else:
      x = line.split('\n')
    kw_list.append(x[0])
  
  fp.close()
  return kw_list

def text2BoW(processed_docs):

  # Converting text to Bag of Words (BoW)
  dictionary = gensim.corpora.Dictionary(processed_docs)
  # Remove very rare and very common words
  #dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n = 100000)
  # Create the Bag-of-words for each abstract reporting how many words 
  # and how many times those words appear
  bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

  return dictionary, bow_corpus

def study_type_keywords(file_path):

  studytype_dict = {}

  # Go to all the folders that have names from 1 to 7
  for file_name in range(1,8):
    file_path_2 = os.path.join(file_path, str(file_name))
    all_txt_files = os.listdir(file_path_2)

    # Go through all txt files in the folder
    for txt_file in all_txt_files:

      # Take the name without .txt extension
      txt_file_name = txt_file[:-4]
      # If this key doesn't exist add it and assign it with an empty list
      if txt_file_name not in studytype_dict:
        studytype_dict[txt_file_name] = []

      txt_file_path = os.path.join(file_path_2, txt_file)

      # Read the lines from one .txt file
      with open(txt_file_path) as f:
        lines = f.readlines()
      
      # Take one line and append it to the list
      for line in lines:
        if line[:2] == '--':           # Some .txt files start with --SMTH--
          continue
        # Remove '\n' from each line
        line = line[:-2]
        # If the keyword doesn't exist in the list add it
        if line not in studytype_dict[txt_file_name]:
          studytype_dict[txt_file_name].append(line.lower())

  return studytype_dict
  


# **PUBLICATION CLASS**

This class containes following methods:
* display_abstract()
* count_overall_occuracies()
* KeyWords_found_in_abstract()
* number_of_different_keywords
* study_quality

The publication is defined by its **url** address. We are using only the publications' **abstracts** since they are always available in **PubMed** database. We have two textual files that contain **keywords** which we used to score the publication according to its connection with our interest domain. We would like to give higher scores to the articles that are about serious applications for kids.   

In [25]:
class Publication:
  
  def __init__(self, in_url):

    """ Takes URL of the publication, takes its abstract in the original 
        format, extracts only the useful text from the abstract.  """ 
    
    self.url = in_url

    # Given the URL we extract the abstract in the original format
    try:
      page = urllib.request.urlopen(self.url)
      time.sleep(6)
    except requests.exceptions.ConnectionError:
      r.status_code = "Connection refused"
    
    soup = BeautifulSoup(page, 'lxml')
    article_abs = soup.find(id = "abstract")

    # Sometimes we can't find the publication -> print '-'
    try:
      self.abstract = article_abs.get_text()
    except AttributeError:
      self.abstract = ''
      print('-')

    # Take only the useful text from the abstract -> remove its subparts, '\n' signs... 
    self.extract_abstract_text()

    ####################### Check if this is useful ############################
    self.EvidenceLevel = create_list_of_keywords('keywords_studydesign.txt')
    self.KeyWords = create_list_of_keywords('keywords_context.txt', 'k')
    ############################################################################

  def display_abstract(self): 
    print(self.abstract)

 
  def count_overall_occurancies(self):

    """ Takes our manually selected keywords and counts their occurances 
        in the abstract. Returns the sum of all occurances.  """
    
    abs_txt = self.abstract
    # Convert all big letters into low letters
    abs_txt = abs_txt.lower()  
    sum_txt = 0
    for word in self.KeyWords:
        count = abs_txt.count(word)
        sum_txt += count
    return(sum_txt)
  
  
  def KeyWords_found_in_abstract(self):
    
    """ Takes our manually selected keywords and checks their existence in 
        the abstract. Returns the list of founded keywords.  """
  
    abs_txt = self.abstract
    kw = []
    for word in self.KeyWords:
        if word in abs_txt.lower():
            kw.append(word)
    return(kw)

  
  def number_of_different_keywords(self):
    return(len(self.KeyWords_found_in_abstract()))

  
  def study_quality(self):
    abs_txt = self.abstract
    abs_txt = abs_txt.lower()
    sd = []
    for word in self.EvidenceLevel:
      word_low = word.lower()
      if word in abs_txt or word_low in abs_txt:
        sd.append(word)
    return(sd)

  
  def study_score(self):
    total_num = self.number_of_different_keywords()
    sd = self.study_quality()
    score = round(len(sd)/(total_num + 1),5)
    return score

  
  def manual_score(self):
    total_num = self.number_of_different_keywords()
    kw = self.KeyWords_found_in_abstract()
    score = round(len(kw)/(total_num + 1),5)
    return score

  
  def extract_abstract_text(self):

    #print('Abstract before: \n')
    #print(self.abstract)
    #print('\n')

    # Remove all \n signs
    self.abstract = self.abstract.replace('\n', '') 
    # Remove the Abstract word in the beginning of each article
    self.abstract = self.abstract.replace('Abstract', '')
    # Remove multiple spaces if they exist
    self.abstract = " ".join(self.abstract.split())

    # Remove the Keywords section
    self.abstract = re.sub(r"(?i)(?:Keywords:).*?[.?!]", '', self.abstract)

    # Find subparts such are Conclusion, Methods... and remove them
    subparts = self.find_article_subparts()
    for s in subparts:
      self.abstract = self.abstract.replace(s, '')

    # Remove all types of url addresses
    self.abstract = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', self.abstract)

    # Remove the space before the first string
    self.abstract.lstrip()
    
    #print('Abstract after: \n')
    #print(self.abstract)

  def get_keywords(self):
    self.keywords = re.findall(r"(?i)(?:Keywords:).*?[.?!]", self.abstract)
    if len(self.keywords)==0:
      self.keywords = []
    else:
      self.keywords = self.keywords[0]
      self.keywords = self.keywords.split(';')
      self.keywords[0] = self.keywords[0].replace('Keywords:', '')
    return self.keywords

  def find_article_subparts(self):
    subparts = re.findall(r'(\w+:)', self.abstract)
    return subparts

 


# **PUBLICATIONS CLASS**

In [26]:
class PubMedPublications:

   def __init__(self, number_of_articles = 10,             
                tool = 'ehealth_group14', 
                pubmedurlbase = "https://pubmed.ncbi.nlm.nih.gov/",
                email = "iva97.ja@gmail.com"):
    
    self.tool = tool
    self.email = email
    self.num_articles =  number_of_articles
    self.pubmedurlbase = pubmedurlbase

    self.pubmed = PubMed(tool = self.tool, email = self.email)
    self.abstracts = []
    self.article_dict = {}
    self.abs_IDs = set()

   def search(self, query_words):

     # Initialize empty lists for all info
     IDs = []
     titles = []
     kws = []
     journals = []
     dois = []
     authorss = []
     
     for query_word in query_words:

       # Get best N articles for given query
       try:
         n_article_results = self.pubmed.query(query_word, max_results = self.num_articles)  
       except TypeError:
         pass
       time.sleep(10)
      
       for one_article_result in n_article_results:
 
         # Get article INFO and abstract text
         article_ID, title, authors, keywords, journal, doi = extract_publication_info(one_article_result)

         # Do this only if this ID is new
         if article_ID not in self.abs_IDs:
           self.abs_IDs.add(article_ID)
           IDs.append(article_ID)
           titles.append(title)
           kws.append(keywords)
           journals.append(journal)
           dois.append(doi)
           authorss.append(authors)

           article_abstract = Publication(self.pubmedurlbase + article_ID + "/")
           

           ########################## Check this part ############################

           # Occurance of all keywords in the article 
           all_kw_occurance = article_abstract.count_overall_occurancies()
           # Keywords that appear in the article
           kw = article_abstract.KeyWords_found_in_abstract()
           # Number of keywords that don't appear in the article
           nb_diff = article_abstract.number_of_different_keywords()
           # Article score
           sd = article_abstract.manual_score()
 
           #######################################################################

           # Dictionary that conatines abstract ID and its score
           if article_ID not in self.article_dict:
             self.article_dict[article_ID] = sd

             # Add that abstract text to the global list of abstracts
             self.abstracts.append(article_abstract.abstract)
     
     return self.abstracts, IDs, titles, kws, journals, dois, authorss




# **MAIN**



In [None]:
# Make a dictionary of articles IDs and thier scores for the given query, as well
# as the list of partialy-processed abstract texts
all_publications = PubMedPublications(number_of_articles = 20)
queries = ['apps kids', 'serious games', 'brain kids apps', 'school games',
           'teaching', 'game mechanisms', 'learning', 'study skills' ]
all_publications.search(query_words = queries)

In [None]:
# Shuffle the abstracts for better generalization
random.shuffle(all_publications.abstracts)

# Split the abstracts into training and test sets
dataset_size = len(all_publications.abstracts)
train_size = int(dataset_size*0.7)
test_size = dataset_size - train_size

train_abstracts = all_publications.abstracts[:train_size]
test_abstracts =  all_publications.abstracts[train_size:]


In [None]:
# Example of one partialy-processed abstract text
all_publications.abstracts[0]

# *Text Information Extraction & Topic Modelling*

Useful links:
* https://blog.aureusanalytics.com/blog/5-natural-language-processing-techniques-for-extracting-information
* https://towardsdatascience.com/something-from-nothing-use-nlp-and-ml-to-extract-and-structure-web-data-3f49b2f72b13

# **LDA** - Latent Dirichlet Allocation

Useful links:
* https://towardsdatascience.com/the-complete-guide-for-topics-extraction-in-python-a6aaa6cedbbc
* https://www.youtube.com/watch?v=HnnKHP6s-n0
* https://towardsdatascience.com/clustering-documents-with-python-97314ad6a78d
* https://towardsdatascience.com/nlp-extracting-the-main-topics-from-your-dataset-using-lda-in-minutes-21486f5aa925
* https://www.freecodecamp.org/news/how-we-changed-unsupervised-lda-to-semi-supervised-guidedlda-e36a95f3a164/
* https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/

In [14]:


import nltk
nltk.download('wordnet')
from nltk import tokenize
from nltk import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

from tqdm.notebook import tqdm
tqdm.pandas()

import gensim
from gensim import corpora, models
from gensim.utils import tokenize


[nltk_data] Downloading package wordnet to C:\Users\Phillip
[nltk_data]     Maya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [27]:
class TextPreprocessing:

  def __init__(self, abs_text):
    self.abs_text = abs_text

  def preprocess(self):
    self.split_abstract_into_words()    
    self.remove_short_words()   
    self.remove_stop_words()   
    self.lemmatize_stemming()

    #print(self.tokenized_words)
    #print(self.words)
    #print(self.words_without_stop)
    #print(self.words_final_form)

    return self.words_final_form

  def split_abstract_into_words(self):
    self.tokenized_words = list(gensim.utils.tokenize(self.abs_text))

  def remove_short_words(self, thresh = 3):
    self.words = []
    for word in self.tokenized_words:
      if len(word)>=thresh:
        self.words.append(word)


  def remove_stop_words(self):
    self.words_without_stop = []
    for w in self.words:
      if w not in gensim.parsing.preprocessing.STOPWORDS:
        self.words_without_stop.append(w)


  def lemmatize_stemming(self):
    stemmer = PorterStemmer()
    self.words_final_form = []
    for w in self.words_without_stop:
      new_w = stemmer.stem(WordNetLemmatizer().lemmatize(w, pos='v'))
      self.words_final_form.append(new_w)





In [38]:
# Go through all of the abstracts and preprocess them
processed_docs = []
for abstract in train_abstracts:
  processing_abs = TextPreprocessing(abstract)
  result = processing_abs.preprocess()
  processed_docs.append(result)

#Create a dictionary from 'processed_docs' containing the number of times a 
# word appears in all of the abstracts (training dataset)
dictionary, bow_corpus = text2BoW(processed_docs)



# Define the LDA model - suppose the number of topics (here it is 8)
LDA_model =  gensim.models.LdaMulticore(bow_corpus, 
                                        num_topics = 5, 
                                        id2word = dictionary,                                    
                                        passes = 10,
                                        workers = 2)

# For each topic, we will explore the words occuring in that topic 
# and its relative weight
for idx, topic in LDA_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

NameError: ignored

# **Abstract Classifocation Topics**


1.   Class 1
2.   Class 2
3.   Class 3
4.   Class 4
5.   Class 5
6.   Class 6
7.   Class 7
8.   Class 8




# **Classification on New Articles**

The highest score refers to which class the article belongs. Here we can use new articles to evaluate the model trained on the training set previously defined. Since the problem is not supervised we had to analyse these 8 classes and give them proper names. 

In [None]:
processing_abs = TextPreprocessing(test_abstracts[0])
result = processing_abs.preprocess()

# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(result)


for index, score in sorted(LDA_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, LDA_model.print_topic(index, 5)))

# **VISUALISATION**

In [None]:
%matplotlib inline
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis


In [None]:
vis = gensimvis.prepare(topic_model=LDA_model, corpus=bow_corpus, dictionary=dictionary)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

# **STUDY TYPE CLASSIFICATION**

Given the "*Study Type Dictionary*" folder which containes keywords for each of the following 8 study types: 
* *MetaAnalysis.txt*
* *ObservationalStudy.txt*
* *RCT.txt*
* *SystematicReview.txt*
* *CaseControl.txt*
* *CaseSeries.txt*
* *CohortStudy.txt*
* *Other.txt*


There are *7* folders where each contains aforementioned textual files and many of keywords are repeat when comparing e.g. the *MetaAnalysis.txt* from the folders *1* and *2*. The idea is to go through all of this folders and files and gather unique keywords that will represnt each one of the study types.Looking at the provided keys, they are already lemmatized. Therefore, we don't need to preprocess them too.


In [28]:
#!unzip "Study type dictionaries"

path = "StudyType_Dictionaries"
study_type_dictionary = study_type_keywords(path)

# The keys in the dictonary
print('Dictionary keys:')
print(study_type_dictionary.keys(), '\n')

# List of keywords for one key e.g. MetaAnalysis
print('MetaAnalysis keywords:')
print(study_type_dictionary['MetaAnalysis'],'\n')

# Number of classes
print('Number of study types: {}'.format(len(study_type_dictionary.keys())))

Dictionary keys:
dict_keys(['MetaAnalysis', 'ObservationalStudy', 'RCT', 'SystematicReview', 'Other']) 

MetaAnalysis keywords:
['engagement inde', 'cross-classification analysi', 'pooled sm', '4-level kirkpatrick mode', 'cross-sectional surve', 'metaprop random effects analysi', 'metaprop fix effect analysi', 'meta-analytic revie', 'review of published studie', 'meta-ethnograph', 'meta-analysi', 'meta synthese', 'meta descriptio', 'meta', 'meta-analyse', 'meta-analyti', 'meta analyse', 'meta analyti', 'meta-descriptio', 'meta-synthese', 'meta-evaluatio', 'meta evaluatio', 'meta analysi', 'multivariate analysi', 'hypothesis test ', 'statistical tes', 'evaluation stud', 'statistical method', 'synthesize dat', 'evaluate dat', 'regression analys', 'cross-classification analysi', 'cross-sectiona', 'meta-analytic revie', 'review of published studie', 'meta-ethnograph', 'meta-analysi', 'meta-analysi', 'meta synthese', 'meta descriptio', 'meta-analyse', 'meta-analyse', 'meta-analyti', 'meta a

## *Publications for DataBase Apps*

Now, we want to go through all of the games we have in our database. Taking the games' names we will query the PubMed database and extract couple of publications for each one of them. When the publications are collected, they will be preprocessed as we did before in the code. Finally, in each of the abstract we will count the occurances of keywords for each study plan independently. The result will be the perentage of abstract belonging to one of the classes that represent particular study type. The abstract will be assigned with a class for which it has the highest probability of belonging to.

In [29]:
from tinydb import TinyDB
db = TinyDB('Ourdatabase.json')

In [30]:
def getFieldData(fieldName, db):
  result = [r[fieldName] for r in db]
  return result

game_names = getFieldData('title', db)
print('Game names: \n', game_names)

Game names: 
 ['Udemy - Online Courses', 'Learn 33 Languages Free - Mondly', 'Yandex.Translate', 'Star Chart', 'Simpler — выучить английский язык проще простого', 'Khan Academy', 'ELSA - Learn English Speaking', 'Aprender Inglés Gratis!', 'Sololearn: Learn to Code (Python, Javascript, etc)', 'PlantNet Plant Identification', 'Toca Kitchen', 'Quizizz: Play to learn', 'U-Dictionary: Translate Now', 'Current Affairs 2020 General Knowledge Quiz', 'How to Draw - Easy Lessons', 'Peak – Brain Games & Training', 'NCERT Books', 'ClasseViva Famiglia', 'ANTON: Kindergarten - Grade 5', 'Edmodo', 'Solar System Scope', 'Language Learning - Spanish, Korean, French & More', 'myCBSEguide - CBSE Papers & NCERT Solutions', 'Meritnation: CBSE, ICSE & more (Free Live Classes)', "BYJU'S – The Learning App", 'شعلة - درّب عقلك يومياً', 'Hello English: Learn English', 'Remind: School Communication', 'Simply Piano by JoyTunes', 'Learn English Words Free', 'EWA: Learn English & Spanish Language', 'Vedantu: LIVE L

Since the game names are not very suitable for querying we need to preprocess them. Firstly, they usually contain more words and we can notice two main parts of the name (one before signs **' - '** or **' : '**, another after them). Some of them are writen in the language different from English. These are some of the examples:

* *Learn 33 Languages Free - Mondly*
* *Sololearn: Learn to Code (Python, Javascript, etc)*
* *U-Dictionary*
* *ANTON: Kindergarten - Grade 5*
* *Language Learning - Spanish, Korean, French & More*
* *Meritnation: CBSE, ICSE & more (Free Live Classes)*
* *تعليم اللغة الانجليزية من الصفر بالصوت والصورة*
* *wifistudy - #1 Exam Preparation, Free Mock Tests*
* *Exam Preparation App: Live Class | Mock Test | PYP*

\\

Since there are names that are not in English, first, we will check the language. If the language is English in that case we can split the name into shorter textual parts. Otherwise, we won't modify the name. Now, we need to decide how to split the names as those parts as queries too.

In [31]:
queries_dict = {}

for name in game_names:

  if name not in queries_dict:
    queries_dict[name] = []

  # First add the whole name
  queries_dict[name].append(name)
  if name.isnumeric()==True:
    continue
  if detect(name) != 'en':
    queries_dict[name].append(name)
  
  else:
    # Split the name when ' - ' is present
    sub_names = name.split(' - ') 
    queries_dict[name].extend(sub_names)
    continue

    # Split the name when ':' is present
    sub_names = name.split(':') 
    queries_dict[name].extend(sub_names)
    continue

    # Split the name when '|' is present
    sub_names = name.split('|') 
    queries_dict[name].extend(sub_names)
    continue


# Unique list elements
for key in queries_dict:
  queries_dict[key] = list(set(queries_dict[key]))


In [32]:
def count_occurances_study_type_keywords(abstract, keywords_dict):
  
  """ Given one abstract in semi-processed format and the Study type 
      keywords dictiory previously obtained """
  
  result_dict = {}
  counts_total = 0
  for study_type in keywords_dict:
    counts = 0
    for word in keywords_dict[study_type]:
      counts += abstract.count(word)
    counts_total += counts
    result_dict[study_type] = counts

  result_dict = {k: round(v/(counts_total+0.01), 4) for k, v in result_dict.items()}

  return result_dict

In [37]:
from urllib.error import HTTPError

dict_game_score = {}
num_pubs_per_game = {}
dict_game_papers = {}

# Go through all the games 
for name in game_names:
  all_publications = PubMedPublications(number_of_articles = 10)
  # Take queries for one particular game
  queries = queries_dict[name]
  # Gather the abstracts
  abstracts, IDs, titles, kws, journals, dois, authorss  = all_publications.search(query_words = queries)
  # Extract information (ID, Name, Authors, Year, Study type, DOI, keywords)
  data = {}
  data['ID'] = IDs
  data['Title'] = titles
  data['Keywords'] = kws
  data['Journal'] = journals
  data['DOI'] = dois
  data['Authors'] = authorss

  df = pd.DataFrame.from_dict(data)
  dict_game_papers[name] = df

  #print(name, len(abstracts))
  num_pubs_per_game[name] = len(abstracts)
  results_abs = []
  for abstract in abstracts:
    result = count_occurances_study_type_keywords(abstract.lower(), study_type_dictionary)
    results_abs.append(result)
  
  dict_game_score[name] = results_abs


FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?

In [None]:
dict_game_score['100 Doors 2013']

[{'MetaAnalysis': 0.0714,
  'ObservationalStudy': 0.1071,
  'Other': 0.0,
  'RCT': 0.8211,
  'SystematicReview': 0.0},
 {'MetaAnalysis': 0.0,
  'ObservationalStudy': 0.3633,
  'Other': 0.0,
  'RCT': 0.6358,
  'SystematicReview': 0.0}]

In [None]:
import pickle
with open('dict_game_scores.pickle', 'wb') as handle:
    pickle.dump(dict_game_score, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('number_pubs_per_game.pickle', 'wb') as handle:
    pickle.dump(num_pubs_per_game, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('pubs_per_game.pickle', 'wb') as handle:
    pickle.dump(dict_game_papers, handle, protocol=pickle.HIGHEST_PROTOCOL)

# **INFORMATION EXTRACTION**