Created on Wednesday 06 January 2021

Group 5 - Classification

@author : Abdou SOW

# Create link between drive and notebook

In [None]:
# Mount Google Drive at /content/drive; later cells change into a folder below this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# Download the French spaCy model

In [None]:
# Download the spaCy model package fr_core_news_md, which is imported and loaded further down.
!python3 -m spacy download fr_core_news_md

# Imports

In [None]:
import re
import unicodedata

import fr_core_news_md
import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
import warnings
warnings.filterwarnings('ignore')

In [None]:
cd /content/drive/My\ Drive/Colab\ Notebooks/G5\ Inter-Promo\ 2021

/content/drive/.shortcut-targets-by-id/1ykBO6hTtGVNV3ua83L9Q2wSAOlRCk_7k/G5 Inter-Promo 2021


In [None]:
# Load the fr_core_news_md spaCy pipeline once; function_lemmatizer uses it to lemmatize text.
nlp = fr_core_news_md.load()

# Load data

In [None]:
# Load the articles; the 'art_id' and 'art_content' columns are read in the Execution section.
data = pd.read_json("Données/Input/df_final_clean.json")

In [None]:
# Read the innovation lexicon into a single 'key_words' column.
# The two-space separator is longer than one character, so pandas interprets it
# as a regular expression, which only the Python parsing engine supports.
# Requesting that engine explicitly avoids the silent fallback and its ParserWarning.
# NOTE(review): presumably the file holds one key word or expression per line and
# the separator is chosen so that single spaces inside an expression do not
# split it - confirm against the file.
df_lexique = pd.read_csv(
    "Ressources/Copie_de_Lexique_Innovation.txt",
    sep="  ",
    header=None,
    engine="python",
)
df_lexique.columns = ['key_words']

In [None]:
# Read the second lexicon ("Gammes Gestion") into a single 'key_words' column.
# The two-space separator is longer than one character, so pandas interprets it
# as a regular expression, which only the Python parsing engine supports.
# Requesting that engine explicitly avoids the silent fallback and its ParserWarning.
# NOTE(review): presumably the file holds one key word or expression per line and
# the separator is chosen so that single spaces inside an expression do not
# split it - confirm against the file.
df_lexique_gam = pd.read_csv(
    "Ressources/Copie de Lexique_Gammes_Gestion.txt",
    sep="  ",
    header=None,
    engine="python",
)
df_lexique_gam.columns = ['key_words']

# Functions

In [None]:
stop_words = set(stopwords.words('french'))
def preprocessing(text: str) -> list :
  """Tokenize a text and drop the tokens that carry no meaning.

    Parameter:
        text: text of an article
    Out:
        list of lower-cased tokens that are neither French stop words
        nor one of the excluded punctuation marks / filler words
  """
  # Tokens discarded in addition to the NLTK French stop words.
  excluded = ['(', ')', ':', ';', ',', '&', '/', '"', "’",'\n','©','n°','-','.','»','«','cette','cet']
  # NOTE(review): word_tokenize is called with its default language here;
  # language='french' may suit these texts better - confirm before changing.
  tokens = [tok.lower() for tok in word_tokenize(text)]
  return [tok for tok in tokens if not (tok in stop_words or tok in excluded)]

In [None]:
def sentence(text: str) -> str :
  """Rebuild a cleaned sentence from a raw text.

    Parameter:
        text: text of an article
    Out:
        the tokens kept by preprocessing(), joined with single spaces
  """
  return ' '.join(preprocessing(text))

In [None]:
def supprime_accent(ligne: str) -> str:
  """Strip the accents (diacritics) from a text.

    Parameter:
        ligne: clean text of an article
    Out:
        ligne: the same text where every accented letter is replaced by its
               base letter (e.g. 'é' -> 'e', 'ç' -> 'c', 'É' -> 'E')
  """
  # NFD splits each accented letter into its base letter followed by combining
  # marks (Unicode category 'Mn'). Dropping those marks removes the accent for
  # any letter and any case, including text that was already decomposed,
  # instead of relying on a hand-written list of lower-case characters.
  decomposed = unicodedata.normalize('NFD', ligne)
  sans_accent = ''.join(car for car in decomposed if unicodedata.category(car) != 'Mn')
  # Recompose whatever NFD split apart that was not an accent.
  return unicodedata.normalize('NFC', sans_accent)

In [None]:
# Runs of non-word characters. Raw string: '\W' written in a regular string
# literal is an invalid escape sequence.
Caractere=re.compile(r'\W+')
def function_lemmatizer(text: str) -> str :
  """Clean a text and replace every word with its lemma.

    Parameter:
        text: raw text (an article or a lexicon entry)
    Out:
        text: lemmas of the cleaned text, joined with single spaces
  """
  # Cleaning order: stop-word / punctuation filtering (sentence), accent
  # removal, then every remaining run of non-word characters becomes one space.
  txt = Caractere.sub(' ',supprime_accent(sentence(text)))
  return ' '.join(token.lemma_ for token in nlp(txt))

In [None]:
# Counts the number of words
def nb_word(text: list) -> list:
    """Count the words of each text.

      Parameters:
        text: iterable of texts (one string per article)

      Out:
        nb_words: list with the number of words of each text, in input order
    """
    # str methods return a new string, so the substitution result must be used:
    # each listed punctuation mark becomes a space and therefore separates
    # words ("a,b" counts as two words).
    separators = str.maketrans({car: ' ' for car in ',.!?/'})
    return [len(document.translate(separators).split()) for document in text]

In [None]:
def tri_list(liste : list) -> list :
  """Separate single-word key words from multi-word ones.

    Parameter:
        liste: list of key words
    Outs:
        tuple of two lists, both in input order:
        the key words without any space, then the key words with at least one space
  """
  singles, compounds = [], []
  for mot in liste :
    # A key word is "composed" as soon as it contains one space.
    bucket = compounds if " " in mot else singles
    bucket.append(mot)
  return (singles, compounds)

# Execution

In [None]:
# Add the lemmatized form of every key word to both lexicons.
for lexicon in (df_lexique, df_lexique_gam):
  lexicon['key_words_lemma'] = lexicon['key_words'].apply(function_lemmatizer)


In [None]:
# Number of non-null lemmatized key words in the innovation lexicon.
# The column is named 'key_words_lemma'; 'key_word_lemma' does not exist and raises a KeyError.
nb_word_lexique = df_lexique['key_words_lemma'].count()
nb_word_lexique

In [None]:
#Création dataframe des articles
df_articles_lemma = pd.DataFrame(columns = ['art_lemma_id'])
df_articles_lemma['art_lemma_id'] = data['art_id']
df_articles_lemma['art_lemma'] = data['art_content'].apply(function_lemmatizer)
df_articles_lemma['nb_art_lemma'] = nb_word(df_articles_lemma['art_lemma'])

In [None]:
# Split the lemmatized innovation key words into single-word and multi-word lists.
# The column is named 'key_words_lemma'; 'key_word_lemma' does not exist and raises a KeyError.
liste_lexique = df_lexique['key_words_lemma'].tolist()
list_mot_unique,list_mot_compose = tri_list(liste_lexique)

In [None]:
# Put the single-word and multi-word key words side by side.
# Each column is built from its own Series: the two lists generally have
# different lengths, and assigning a plain list whose length differs from the
# existing index raises a ValueError. Series are aligned on their index, so the
# shorter column is padded with NaN.
df_word_unique_compose = pd.DataFrame({
    'word_unique': pd.Series(list_mot_unique, dtype=object),
    'word_compose': pd.Series(list_mot_compose, dtype=object),
})

# Save

In [None]:
# Save the innovation lexicon (df_lexique) as JSON.
df_lexique.to_json("Données/Output/Innovation/df_lexique_lemma.json")

In [None]:
# Save the second lexicon (df_lexique_gam) as JSON.
df_lexique_gam.to_json("Données/Output/Innovation/df_lexique_gammes_gestion.json")

In [None]:
# Save the articles dataframe (df_articles_lemma) as JSON.
df_articles_lemma.to_json("Données/Output/Innovation/df_articles_lemma.json")

In [None]:
# Save the single-word / multi-word key word table as JSON.
df_word_unique_compose.to_json("Données/Output/Innovation/df_word_unique_compose.json")

# TEST

In [None]:
# Display the single-word / multi-word key word table.
df_word_unique_compose

In [None]:
# Display the 'art_lemma' value stored under index 27.
df_articles_lemma['art_lemma'][27]

In [None]:
# Display the first 20 rows of the innovation lexicon.
df_lexique.head(20)

In [None]:
# Display the first 20 rows of the second lexicon (df_lexique_gam).
df_lexique_gam.head(20)

In [None]:
# Quick check of accent removal on a single word.
supprime_accent('interopérabilité')

In [None]:
# Pick the 'art_content' value stored under index 2 as a sample article.
article = data['art_content'][2]

In [None]:
# Show the sample article at each stage: raw text, cleaned sentence, lemmatized text.
print(article)
print(sentence(article))
print(function_lemmatizer(article))

In [None]:
# Print the multi-word key words, then the single-word ones, each followed by its count.
print(list_mot_compose,len(list_mot_compose))
print(list_mot_unique,len(list_mot_unique))

In [None]:
# Print the set of French stop words used by preprocessing().
print(stop_words)