Créé le 17 mars 2021

**Projet Tableau de Bord** 

**Groupe n°3 - Arnaques en ligne**

**Création des json pour l'insertion**
@authors:
- KIRED Nour Elhouda



#### Import libraries

In [None]:
########## Module import ##########
# Google Colab
%load_ext google.colab.data_table
from google.colab import files

# Fichiers
import json

# Scraping
from requests import get
from bs4 import BeautifulSoup

# Format
import time
import json

# traitement texte /Nettoyage des données
import operator
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import collections
from tqdm import tqdm
from gensim.utils import simple_preprocess
import spacy
import gensim
import gensim.corpora as corpora


# Fetch the NLTK stop-word corpus and keep the English list for filtering.
nltk.download('stopwords')
stopWords=stopwords.words('english')
# spaCy English pipeline used for lemmatisation; the parser and NER
# components are disabled at load time.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


#### Import des données 

In [None]:
# Open the Colab upload prompt, then parse the uploaded JSON file.
# NOTE(review): the file name is hard-coded, so the uploaded file must be
# named exactly 'clean_articles_5629.json'.
# The cells below iterate data_dict.items() and read row['title'] from each value.
uploaded = files.upload()
data_dict = json.loads(uploaded['clean_articles_5629.json'])

Saving clean_articles_5629.json to clean_articles_5629.json


## Traitement du texte

#### Partie fonctions

In [None]:

def cleanedWords(sentence: str) -> str:
    """
        Lowercase a text and blank out its special characters.

        In :
            sentence : the raw text to normalise
        Out :
            the lowercased text in which every character other than an
            ASCII letter or a digit has been replaced by a space

    """
    lowered = sentence.lower()
    return re.sub("[^a-zA-Z0-9]", " ", lowered)


def remove_stopwords(texts: str) -> list:
    """
        Tokenise a text and drop its stop words.

        In :
            texts : the text to process
        Out :
            the tokens produced by gensim's simple_preprocess, in order,
            minus those found in the module-level stopWords list

    """
    kept = []
    for word in simple_preprocess(texts):
        if word in stopWords:
            continue
        kept.append(word)
    return kept
        
def lemmatization(texts: list) -> list:
    """
        Lemmatise a list of words with the module-level spaCy pipeline.

        In :
            texts : list of words
        Out :
            list holding the lemma of every token spaCy finds in the
            space-joined words
    """
    # spaCy works on a string, so the words are joined back together first.
    joined = " ".join(texts)
    lemmas = []
    for token in nlp(joined):
        lemmas.append(token.lemma_)
    return lemmas


#### Nettoyage du texte (titres des articles)

- clusterisation par titre 

In [None]:
# Build one list of cleaned, lemmatised tokens per article title.
clean_text = []
for i, row in tqdm(data_dict.items()):
    # Lowercase the title, blank out special characters, split on whitespace.
    title_tokens = cleanedWords(row['title']).split()
    # Drop the English stop words.
    kept_tokens = [token for token in title_tokens if token not in stopWords]
    # Lemmatise what is left and store it at the article's position.
    clean_text.append(lemmatization(kept_tokens))


100%|██████████| 5629/5629 [00:18<00:00, 304.62it/s]


In [None]:
# Dictionary of every word found in the cleaned titles.
id2word = corpora.Dictionary(clean_text)
print("dict ok")

# The token lists in clean_text are used directly as the corpus texts.
print("text ok")

# Bag-of-words vectorisation of each title.
corpus = list(map(id2word.doc2bow, clean_text))
print("corpus ok")

dict ok
text ok
corpus ok


## Création des thèmes

#### Entraînement d'un modèle 

On a décidé de garder deux thèmes :
- description des arnaques
- détection des fraudes

In [None]:
## Train the LDA model that will be used to cluster the articles by theme.
## num_topics=2 fixes the number of themes and random_state=100 seeds the
## training; alpha='auto' lets gensim learn the document-topic prior.
lda_model = gensim.models.LdaModel(corpus,
                                   num_topics=2,
                                   id2word=id2word,
                                   random_state=100,
                                   alpha='auto')

# The standalone name is dropped once the dictionary has been handed to the model.
del id2word

In [None]:
def format_topics_sentences(ldamodel, corpus):
    """Assign every document of a corpus to its best-scoring topic.

      Parameters:
            ldamodel : trained topic model; indexing it with one bag-of-words
                       document must yield (topic_id, score) pairs
            corpus : iterable of bag-of-words documents, one per article
      Out :
            a DataFrame with a single 'Dominant_Topic' column holding, in
            corpus order, the id of the highest-scoring topic of each document
      Raises :
            ValueError if the model returns no topic at all for a document

    """
    # Score each document with the model passed as argument (not a
    # module-level global) and keep the topic with the highest score.
    # max() returns the first maximal pair, which is the same pair a stable
    # descending sort would put first.
    list_topic = [
        int(max(ldamodel[document], key=lambda pair: pair[1])[0])
        for document in tqdm(corpus)
    ]

    return pd.DataFrame({'Dominant_Topic': list_topic})

In [None]:
# Dominant topic of every article, with the positional index exposed as a column.
df_dominant_topic = format_topics_sentences(
    ldamodel=lda_model, corpus=corpus).reset_index()
df_dominant_topic.columns = ['id', 'Dominant_Topic']

# The model is no longer needed once every article has been assigned a topic.
del lda_model


100%|██████████| 5629/5629 [00:01<00:00, 3111.80it/s]


#### Mots les plus fréquents par (thème/article)

**- Par article**

In [None]:
# For each title, keep up to three entries of the vocabulary learnt by a
# TF-IDF vectoriser fitted on that title's tokens.
# NOTE(review): entries are ranked by the value stored in vocabulary_ (the
# feature index), not by frequency or TF-IDF weight - confirm this is the
# intended notion of "most common".
common_words_article = []
# A single vectoriser is reused: every fit() call learns a fresh vocabulary.
vectorizer: TfidfVectorizer = TfidfVectorizer(stop_words=stopWords)
for data in clean_text:
    try:
        # Each token is treated as its own document; only the learnt
        # vocabulary is used afterwards.
        vectorizer.fit(data)
    except ValueError:
        # scikit-learn raises ValueError when no usable term is left
        # (empty title, or only stop words); record an empty word list.
        common_words_article.append([])
        continue
    vocabulary: dict = vectorizer.vocabulary_
    common_words_article.append(
        sorted(vocabulary.items(), key=operator.itemgetter(1), reverse=True)[:3])

In [None]:
# Render each article's word list as text and store it in a one-column DataFrame.
common_words_ = list(map(str, common_words_article))
common_words_article = pd.DataFrame(
    data=common_words_,
    columns=['commons words/article_iter'],
)

In [None]:
# Put each article's word list next to its dominant topic, side by side (axis=1).
df=pd.concat([common_words_article,df_dominant_topic['Dominant_Topic']],axis=1)

**- Par theme**

In [None]:
## Gather, for each theme, the cleaned title tokens of all its articles.
new_data = {}
for index, row in df.iterrows():
    y = row['Dominant_Topic']
    # Extend a list owned by new_data. Storing clean_text[index] itself and
    # then using += would grow that article's own token list in place.
    new_data.setdefault(y, []).extend(clean_text[index])

In [None]:
# Six most frequent words of each theme, as (word, count) pairs.
most_common_words_themes = {}
for index, abstract in new_data.items():
    # Skip stop words, then lowercase each word and strip anything that is
    # not an ASCII letter or a digit.
    normalised = (
        re.sub("[^a-zA-Z0-9]", "", w.lower())
        for w in abstract
        if w not in stopWords
    )
    # Counter tallies the words; most_common(6) keeps the six highest counts.
    most_common_words_themes[index] = collections.Counter(normalised).most_common(6)

In [None]:
# Human-readable label for each topic id.
# NOTE(review): which id maps to which theme is written by hand here -
# presumably from inspecting the trained model; confirm it still holds if
# the model is retrained.
types_dict={0:'detection des fraudes',
            1:'description des arnaques',
}

In [None]:
# Label of the dominant theme for each row of df, keyed by the row's index.
themes = {i: types_dict[row['Dominant_Topic']] for i, row in df.iterrows()}

#### Sauvegarde des données

In [None]:
# Serialise the themes mapping to themes.json in the current working directory.
with open('themes.json', 'w') as f:
    json.dump(themes, f)