# Initialisation de la base de données

Ce notebook a pour objectif de charger la base de données et ses métadonnées, puis de les traiter afin de noter les articles. Le but est de sauvegarder deux fichiers CSV : l'un pour les articles disposant du texte complet, l'autre pour ceux qui n'en disposent pas. Ils contiennent une note pour la date ainsi que les titres et abstracts transformés. Celui des articles avec texte complet contient aussi le nombre de fois où chaque article est cité.

Importation des dépendances

In [38]:
import glob
import json
import os
import re
import string
from datetime import datetime

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [13]:
# Run only if needed: downloads the NLTK corpora used below
# ('wordnet' for the lemmatizer, 'stopwords' for the English stop-word list).
nltk.download('wordnet') 
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/juliettemontanteme/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juliettemontanteme/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
# Root folder of the data set. Set the CORD19_ROOT environment variable to point
# somewhere else without editing the notebook; the default is the original location.
# Tip: to find the path easily, open the corresponding folder and,
# while pressing "alt", go to Edit and copy the path.
root_path = os.environ.get('CORD19_ROOT', '/Volumes/Samsung_T5/JULIETTE/data-CORD')
metadata_path = f'{root_path}/metadata.csv'
meta_df = pd.read_csv(metadata_path)

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
def open_article(path):
    """Load one article JSON file and return its parsed content.

    Parameters
    ----------
    path : str
        Path of the JSON file, relative to the module-level ``root_path``.

    Returns
    -------
    The object produced by ``json.load`` for that file.

    Raises
    ------
    FileNotFoundError
        If the file does not exist (callers rely on this to fall back to
        another file).
    """
    # The context manager closes the file even when json.load raises.
    with open(root_path + "/" + path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [8]:
def make_text(article):
    """Concatenate the body paragraphs of an article into a single string.

    Every entry of ``article['body_text']`` contributes its ``'text'`` value
    followed by a newline, in order.
    """
    return ''.join(paragraph['text'] + "\n" for paragraph in article['body_text'])

## Date

In [49]:
def date(x):
    """Encode a ``YYYY-MM-DD`` date string as the integer ``YYYYMMDD``.

    Parameters
    ----------
    x : str or any
        Date string. Partial dates are accepted: a missing month or day
        (e.g. ``"2020"`` or ``"2020-03"``) is replaced by ``01`` so that the
        result stays comparable with complete dates.

    Returns
    -------
    int or float
        The ``YYYYMMDD`` integer, or ``np.nan`` when ``x`` is not a string
        (e.g. a missing value) or cannot be parsed.
    """
    try:
        year, month, day = x[:4], x[5:7], x[8:10]
    except TypeError:
        # Non-subscriptable input, typically NaN for a missing value.
        return np.nan
    if len(year) != 4:
        return np.nan
    try:
        return int(year + (month or "01").zfill(2) + (day or "01").zfill(2))
    except (TypeError, ValueError):
        # Non-numeric or otherwise malformed date.
        return np.nan

def critère_date(date):
    """Score a publication date: the more recent, the closer to 1.

    Parameters
    ----------
    date : int or float
        Date encoded as a ``YYYYMMDD`` number (NaN allowed).

    Returns
    -------
    float or int
        ``1 / (today - date + 1)`` where ``today`` is the current date in the
        same ``YYYYMMDD`` encoding; ``0`` for any date later than today; NaN
        when ``date`` is NaN.

    Notes
    -----
    The difference is taken between ``YYYYMMDD`` numbers, so it is not a
    number of days; it only provides an ordering by recency.
    """
    cur_date = int(datetime.now().strftime("%Y%m%d"))
    gap = cur_date - date + 1
    # Dates in the future used to give a negative score (or a division by
    # zero for "tomorrow"); they all get the neutral score 0 now.
    if gap <= 0:
        return 0
    return 1 / gap

In [50]:
# Encode each publication time as a number, then derive the date score from it.
meta_df["date"] = meta_df["publish_time"].apply(date)
meta_df["note_date"] = meta_df["date"].apply(critère_date)

In [51]:
# Quick look at the first five date scores.
meta_df["note_date"].head(n=5)


0    0.000005
1    0.000005
2    0.000005
3    0.000005
4    0.000005
Name: note_date, dtype: float64

## Transformation du titre et de l'abstract

In [14]:
# Single lemmatizer instance shared by the preprocessing functions.
lemmatizer = WordNetLemmatizer()
# Kept as a set: the stop-word test runs once per token, and membership
# in a set is constant-time whereas a list is scanned linearly.
stop = set(stopwords.words('english'))

def preprocessing(x):
    """Clean a text: lower-case, strip punctuation, drop stop words, lemmatize.

    Punctuation characters (ASCII punctuation plus ’, — and ”) are deleted,
    remaining non-word runs become a single space, tokens found in the
    module-level ``stop`` collection are dropped and the others are passed
    through the module-level ``lemmatizer``.

    Returns the lemmas as one string, each lemma followed by a space
    (so the result is empty when no token is kept).
    """
    punctuation_class = r'[' + string.punctuation + '’—”' + ']'
    without_punctuation = re.sub(punctuation_class, "", x.lower())
    normalised = re.sub(r'\W+', ' ', without_punctuation)
    kept_tokens = (token for token in normalised.split() if token not in stop)
    return ''.join(str(lemmatizer.lemmatize(token)) + ' ' for token in kept_tokens)

def preprocessing_title(x):
    """Clean a title: lower-case, strip punctuation, lemmatize (stop words kept).

    Punctuation characters (ASCII punctuation plus ’, — and ”) are deleted,
    remaining non-word runs become a single space and every token is passed
    through the module-level ``lemmatizer``.

    Returns the lemmas as one string, each lemma followed by a space
    (so the result is empty when there is no token).
    """
    punctuation_class = r'[' + string.punctuation + '’—”' + ']'
    without_punctuation = re.sub(punctuation_class, "", x.lower())
    normalised = re.sub(r'\W+', ' ', without_punctuation)
    return ''.join(str(lemmatizer.lemmatize(token)) + ' ' for token in normalised.split())

In [15]:
# Values are converted with str() first, so missing entries are processed as text too.
meta_df["title_process"] = meta_df["title"].map(lambda title: preprocessing_title(str(title)))
meta_df["abstract_process"] = meta_df["abstract"].map(lambda abstract: preprocessing(str(abstract)))

## Nombre de citations

In [16]:
def has_full_text(x):
    """Tell whether a row has at least one full-text file.

    ``x`` holds, for the ``pdf_json_files`` and ``pmc_json_files`` fields,
    a flag that is True when the field is missing. The row has full text
    unless both flags are True.
    """
    both_missing = x["pdf_json_files"] == True and x["pmc_json_files"] == True
    return not both_missing

In [17]:
# True where the corresponding JSON path is missing from the metadata.
meta_bool = meta_df[["pdf_json_files", "pmc_json_files"]].isnull()
meta_bool["has_full"] = meta_bool.apply(has_full_text, axis=1)
# Explicit copy: a column is added to this frame later on, and assigning to a
# filtered selection of meta_df would raise a SettingWithCopyWarning.
meta_df_full_text = meta_df.loc[meta_bool["has_full"]].copy()

In [18]:
# Rows for which neither full-text file is listed.
meta_df_without_text = meta_df.loc[~meta_bool["has_full"]]

In [19]:
def get_path(path):
    """Return the part of ``path`` that precedes the first ``;``.

    Parameters
    ----------
    path : str
        One path, or several paths separated by ``;``.

    Returns
    -------
    str
        Everything before the first ``;`` (the whole string when there is
        none, an empty string when ``path`` is empty or starts with ``;``).
    """
    # str.partition replaces the character-by-character copy.
    return path.partition(";")[0]

def add_ref(article):
    """Count the references of one article in the module-level tallies.

    Each entry of ``article["bib_entries"]`` has its ``"title"`` normalised
    with ``preprocessing_title``; the matching counter in ``dico_ref`` is
    incremented, or created at 1 (and the title recorded in ``set_ref``)
    the first time the title is seen.
    """
    for entry in article["bib_entries"].values():
        cited_title = preprocessing_title(entry["title"])
        if cited_title not in set_ref:
            dico_ref[cited_title] = 1
            set_ref.add(cited_title)
            continue
        dico_ref[cited_title] += 1
            
def count_ref(x):
    """Open the full-text file of a row and count its references.

    The ``pdf_json_files`` file is tried first, then ``pmc_json_files``.
    The first one that can be opened is passed to ``add_ref``.

    Parameters
    ----------
    x : mapping
        Row providing the ``pdf_json_files`` and ``pmc_json_files`` fields.

    Returns
    -------
    str
        ``"done"`` when a file was processed, ``"missing"`` when neither file
        exists (previously this case raised FileNotFoundError and aborted the
        whole ``apply``).
    """
    for column in ("pdf_json_files", "pmc_json_files"):
        try:
            # str() turns a missing value into a name that does not exist on
            # disk, which lands in the except branch below.
            article = open_article(get_path(str(x[column])))
        except FileNotFoundError:
            continue
        add_ref(article)
        return "done"
    return "missing"

def link_ref(x):
    """Return how many times the title ``x`` was counted as a reference.

    Looks ``x`` up in the module-level ``set_ref`` / ``dico_ref`` tallies and
    returns 0 when the title was never recorded.
    """
    return dico_ref[x] if x in set_ref else 0

def path(x):
    """Return the relative path of the first full-text file that exists.

    The ``pdf_json_files`` field is checked first, then ``pmc_json_files``;
    when a field lists several paths only the first one is considered.

    Parameters
    ----------
    x : mapping
        Row providing the ``pdf_json_files`` and ``pmc_json_files`` fields.

    Returns
    -------
    str
        The path relative to the module-level ``root_path``, or ``""`` when
        neither file exists.
    """
    for column in ("pdf_json_files", "pmc_json_files"):
        candidate = get_path(str(x[column]))
        # Only existence matters here, so the file is no longer opened and
        # parsed as JSON just to find out whether it is there.
        if os.path.isfile(root_path + "/" + candidate):
            return candidate
    return ""

In [20]:
# Tallies filled by add_ref: normalised reference title -> number of citations,
# and the set of titles already seen.
dico_ref = {}
set_ref = set()
# Long-running step: every full-text article is opened and its references counted.
# assign() builds a new frame instead of writing a column into a filtered
# selection of meta_df, which would raise a SettingWithCopyWarning.
meta_df_full_text = meta_df_full_text.assign(
    ref=meta_df_full_text[["pdf_json_files", "pmc_json_files"]].apply(count_ref, axis=1)
)

KeyboardInterrupt: 

In [None]:
# Number of times each article's processed title was found among the counted references.
meta_df["nb_ref_linked"] = meta_df["title_process"].apply(link_ref)

In [None]:
# Citation score: citation count scaled by the highest count in the data set.
max_ref = meta_df["nb_ref_linked"].max()
if max_ref:
    meta_df["note_ref"] = meta_df["nb_ref_linked"] / max_ref
else:
    # No reference was linked at all: avoid dividing by zero, every score is 0.
    meta_df["note_ref"] = 0.0
# Relative path of the first full-text file found on disk ("" when there is none).
meta_df["path"] = meta_df[["pdf_json_files", "pmc_json_files"]].apply(path, axis=1)

In [None]:
# Keep only the processed text, the date, the file path and the scores.
meta_df_save = meta_df[[
    "title_process",
    "abstract_process",
    "date",
    "path",
    "note_date",
    "nb_ref_linked",
    "note_ref",
]]


In [None]:
# Write the processed metadata to a CSV file in the working directory.
meta_df_save.to_csv(path_or_buf="metadata_processed.csv")
