# Import Part

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy as sp
nlp = sp.load('en_core_web_sm')

# Function Part

In [None]:
def _strip_leading_day(raw_date: str) -> str:
    """Drop a leading one- or two-digit day and the separator character after it."""
    if raw_date[:2].isdecimal():
        # "01 January 2020" -> "January 2020"
        return raw_date[3:]
    if raw_date[:1].isdecimal():
        # "5 May 2021" -> "May 2021"
        return raw_date[2:]
    return raw_date


def cleanDate(df: pd.DataFrame, date_column: str):
    """
    Formats dates in MM YYYY format

    A leading day number (one or two digits plus its separator) is removed and
    the English month name is replaced by its two-digit number.
    Values that are not strings, or in which no month name is found, are kept
    unchanged so that the column always keeps one value per row.

    In :
        df : dataFrame from Nature CSV articles
        date_column : name of the column containing the dates

    Out :
        N.A. (modification of the dataframe provided as input)

    """

    # Month name -> two-digit month number
    month_numbers = (('January', '01'),
                     ('February', '02'),
                     ('March', '03'),
                     ('April', '04'),
                     ('May', '05'),
                     ('June', '06'),
                     ('July', '07'),
                     ('August', '08'),
                     ('September', '09'),
                     ('October', '10'),
                     ('November', '11'),
                     ('December', '12'))

    cleaned_dates = []

    for raw_date in df[date_column]:
        # Missing values (NaN/None) or already-parsed objects cannot be rewritten
        if not isinstance(raw_date, str):
            cleaned_dates.append(raw_date)
            continue

        without_day = _strip_leading_day(raw_date)

        for month_name, month_number in month_numbers:
            if month_name in without_day:
                cleaned_dates.append(without_day.replace(month_name, month_number))
                break
        else:
            # No month recognised: keep the original value to stay aligned with the rows
            cleaned_dates.append(raw_date)

    # Exactly one entry per row was collected, so the assignment is positional
    if cleaned_dates:
        df[date_column] = cleaned_dates

In [None]:
def clean_text(col_text: pd.DataFrame) -> list:
    """
    Lemmatise every text of a column and drop the uninformative tokens

    Each text is parsed with the spaCy pipeline `nlp`. A token is kept only if
    it is not a stop word, is not tagged as punctuation or whitespace, and is
    purely alphabetic. The lemmas of the kept tokens are joined with spaces.

    In :
        col_text: the column containing the text

    Out :
        A list with one cleaned text per input text, in the same order

    """

    excluded_pos = ['PUNCT', 'SPACE']

    def _is_informative(tok) -> bool:
        # Same three checks, in the same order, as a single boolean expression
        return (not tok.is_stop) and (tok.pos_ not in excluded_pos) and tok.is_alpha

    return [
        " ".join(tok.lemma_ for tok in nlp(raw_text) if _is_informative(tok))
        for raw_text in tqdm(col_text)
    ]

In [None]:
def get_most_freq_words(tfidf_vect, mat_tfidf, n_terms: int) -> list:
    """
    This function finds the n_terms most important words (highest TF-IDF weight) of each article.

    Terms with a zero weight in an article are never reported for it, so an
    article with fewer than n_terms distinct terms yields fewer words.

    In :
        tfidf_vect: the fitted vectorizer that produced mat_tfidf
        mat_tfidf: tf_idf of articles (one row per article)
        n_terms: the number of desired terms (no term is returned if <= 0)

    Out :
        list with one single-element list per article; the element is the
        comma-separated terms, ordered by increasing weight

    """

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # for old versions that do not provide get_feature_names_out() yet
    if hasattr(tfidf_vect, 'get_feature_names_out'):
        labels = tfidf_vect.get_feature_names_out()
    else:
        labels = tfidf_vect.get_feature_names()

    freq_word = []

    for row_idx in tqdm(range(mat_tfidf.shape[0])):
        # Densify one row at a time to keep memory usage bounded
        weights = np.asarray(mat_tfidf[row_idx].todense()).ravel()

        # A slice [-0:] would select every term, so guard non-positive n_terms
        top_positions = np.argsort(weights)[-n_terms:] if n_terms > 0 else []

        freq_word.append([','.join(labels[pos] for pos in top_positions if weights[pos] > 0)])

    return freq_word

# Process Part

In [None]:
# Load the raw CSV and discard its first column
df_all = pd.read_csv("../../../Data/Nature/Not_Clean/articles_nature_V1.csv", sep=";")
df_all = df_all.drop(columns=df_all.columns[[0]])
print("Load OK \n")

# Remove the first 8 characters of every article text
# (presumably the leading "Abstract" heading -- confirm against the data)
df_all['text'] = [article[8:] for article in df_all['text']]
print("Delete Abstract OK \n")

# Lower-case the text column
df_all['text'] = df_all['text'].str.lower()
print("lower texts OK \n")

# Normalise the dates (in place)
cleanDate(df_all, 'date')
print("Dates OK \n")

# Text cleaning with spaCy (about 30 min)
df_all['text'] = clean_text(df_all['text'])
print("Cleaning texts OK \n")

# TF-IDF on the cleaned texts; lower-casing and stop words were handled above
tfidf_vect = TfidfVectorizer(lowercase=False, stop_words=None)
X = tfidf_vect.fit_transform(df_all['text'])
print("TF-IDF OK \n")

# Keywords of each article (about 2 min)
df_all['key_words'] = get_most_freq_words(tfidf_vect, X, 5)
print("Keywords OK \n")

# Export the cleaned CSV for the database import
df_all.to_csv('../../../Data/Nature/Clean/articles_nature_clean_V1.csv', sep=";", index=True)
print("Export OK \n")


In [None]:
# Preview the first five rows of the processed dataframe
df_all[0:5]