# Import Part

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy as sp
nlp = sp.load('en_core_web_sm')

# Function Part

In [None]:
def clean_date_science(df: pd.DataFrame, date_column: str) -> list:
    """
    Reduce every raw date to its trailing "<month or period> <year>" pair.

    Only the text after the last en dash ("–") is considered, so a range such
    as "November–December 2019" becomes "December 2019". Of that text, only
    the last two whitespace-separated tokens are kept.

    An entry is replaced by an empty string when:
        - it is not a string (e.g. a missing value read from the CSV),
        - it does not yield exactly two tokens,
        - its last token cannot be parsed as an integer (the year).

    In :
        df: the dataframe holding the articles
        date_column: the name of the column containing the dates

    Out :
        list of cleaned date strings, one per row of df, in row order

    """
    cleaned_dates = []

    for raw_date in df[date_column]:
        # Missing values are not str objects and have no .split(); blank them
        # instead of letting the whole cleaning step fail.
        if not isinstance(raw_date, str):
            cleaned_dates.append("")
            continue

        # Keep only the end of a date range (e.g. Nov–Dec), then only the
        # month (or period) and the year.
        tokens = raw_date.split("–")[-1].split()[-2:]

        if len(tokens) != 2:
            cleaned_dates.append("")
            continue

        # The last token must be a year; anything that is not an integer
        # means the entry is not a date.
        try:
            int(tokens[-1])
        except ValueError:
            cleaned_dates.append("")
        else:
            cleaned_dates.append(" ".join(tokens))

    return cleaned_dates

In [None]:
def clean_text(col_text: pd.DataFrame) -> list:
    """
    Lemmatise every text of a column and drop the uninformative tokens.

    Each text is processed with the module-level spaCy pipeline ``nlp``.
    A token is kept only if it is not a stop word, its part of speech is
    neither PUNCT nor SPACE, and its ``is_alpha`` flag is set. Kept tokens
    are replaced by their lemma and joined with single spaces.

    In :
        col_text: the texts to clean (in this file, a column of the dataframe)

    Out :
        list with one cleaned string per input text, in input order

    """
    ignored_pos = ['PUNCT', 'SPACE']
    cleaned_texts = []

    # tqdm only wraps the iteration to display a progress bar.
    for raw_text in tqdm(col_text):
        lemmas = [
            tok.lemma_
            for tok in nlp(raw_text)
            if not tok.is_stop and tok.pos_ not in ignored_pos and tok.is_alpha
        ]
        cleaned_texts.append(" ".join(lemmas))

    return cleaned_texts

In [None]:
def get_most_freq_words(tfidf_vect, mat_tfidf, n_terms: int) -> list:
    """
    Find, for each article, the n_terms words with the highest TF-IDF weight.

    In :
        tfidf_vect: the fitted vectorizer that produced mat_tfidf; only its
                    feature names are read
        mat_tfidf:  sparse tf_idf matrix of the articles, one row per article
        n_terms: the wanted number of terms per article; a value of 0 or
                 less gives an empty keyword string

    Out :
        list with one entry per row of mat_tfidf. Each entry is a
        one-element list holding the selected words joined by ",", ordered
        from the lowest to the highest weight among the selected words.

    """
    # get_feature_names() was deprecated and then removed from scikit-learn;
    # prefer its replacement and fall back only for older releases.
    if hasattr(tfidf_vect, "get_feature_names_out"):
        labels = tfidf_vect.get_feature_names_out()
    else:
        labels = tfidf_vect.get_feature_names()

    freq_word = []

    for row_idx in tqdm(range(mat_tfidf.shape[0])):
        # Densify one row at a time so the full matrix is never expanded.
        weights = np.asarray(mat_tfidf[row_idx].todense()).ravel()
        ascending = np.argsort(weights)

        # A plain [-n_terms:] slice would return the whole vocabulary when
        # n_terms is 0, hence the explicit guard.
        top = ascending[-n_terms:] if n_terms > 0 else ascending[:0]

        freq_word.append([','.join(labels[t] for t in top)])

    return freq_word

# Process Part

In [None]:
# Loading CSV
df_all = pd.read_csv("../../../Data/Science Direct/Not_Clean/articles_sciencedirect_V0.csv", sep=";")
# Drop the first column of the file (presumably an index column written by an
# earlier export; confirm against the raw CSV).
df_all = df_all.drop(df_all.columns[[0]], axis='columns')
print("Chargement OK \n")

# lower on text column
df_all['text'] = df_all['text'].str.lower()
print("lower texts OK \n")

# Preprocess of dates
df_all['date'] = clean_date_science(df_all, 'date')
print("Dates OK \n")

# Separation of the text column into its 3 components: highlights, abstract, keywords
# + We remove the abstract etc. at the beginning.
# The column is iterated directly: Series.iteritems() was removed in pandas 2.0.
text_parts = pd.DataFrame([text.split('%%$%%') for text in df_all['text']],
                          columns=['highlight', 'abstract', 'keyword'],
                          index=df_all.index)
df_all = pd.concat([df_all, text_parts], sort=False, axis=1)

# Drop the leading characters of each part (11 for highlights, 8 for abstracts);
# presumably the section label at the start of the text, verify on the raw data.
df_all['highlight'] = [i[11:] for i in df_all['highlight']]
df_all['abstract'] = [i[8:] for i in df_all['abstract']]
print("Separation text OK \n")


# Cleaning of highlights (10 min)
df_all['highlight'] = clean_text(df_all['highlight'])
print("Cleaning highlight OK \n")

# Cleaning of abstracts (75 min)
df_all['abstract'] = clean_text(df_all['abstract'])
print("Cleaning abstract OK \n")

# TF-IDF for articles without keyword sections (on highlight + abstract grouped together)
no_keyword = df_all['keyword'] == ""
tfidf_vect = TfidfVectorizer(lowercase=False, stop_words=None)
# Join with a space: without it the last word of the highlight and the first
# word of the abstract would be fused into a single bogus token.
corpus = df_all.loc[no_keyword, 'highlight'] + " " + df_all.loc[no_keyword, 'abstract']
X = tfidf_vect.fit_transform(corpus)
print("TF-IDF OK \n")

# Keyword per article (1 min)
# .loc writes into df_all itself; chained indexing (df_all[mask]['keyword'] = ...)
# assigns to a temporary copy and leaves df_all unchanged.
# get_most_freq_words returns one-element lists, so the string is unwrapped.
df_all.loc[no_keyword, 'keyword'] = [words[0] for words in get_most_freq_words(tfidf_vect, X, 5)]
print("Keywords OK \n")

# Export CSV
df_all.to_csv('../../../Data/Science Direct/Clean/articles_sciencedirect_clean_V0.csv', sep=";", index=True)
print("Export OK \n")


In [None]:
# Preview the first five rows of the cleaned dataframe
df_all.head(5)