Created on Monday 05 January 2021

**Group 5 - Classification**  
**g5_function_visual_extraction**

@authors : J.J.

# Import 

In [133]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import re
import plotly.express as px
nltk.download('stopwords')
nltk.download('punkt')
from wordcloud import WordCloud, STOPWORDS
from sklearn.neighbors import LocalOutlierFactor
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Import of Data

In [7]:
# Mount Google Drive in the Colab runtime; the input files read by the
# following cells live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [96]:
# Folder on the mounted Google Drive that contains every input file.
INPUT_DIR = "/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Input"

# Columns removed from the feature table before it is used
# (presumably identifiers, raw text and source metadata - confirm with the data owners).
COLUMNS_TO_DROP = [
    "art_id", "art_content", "art_content_html", "art_extract_datetime", "art_lang", "art_title",
    "art_url", "src_name", "src_type", "src_url", "src_img", "art_auth", "art_tag",
    "title_postive_score", "title_negative_score", "title_polarity_score",
    "title_subjectivity_score", "netloc.com", "ratio_word_title_on_word",
]

# Feature table restricted to French articles. The chain builds a new frame
# at every step, so nothing is mutated in place on a filtered slice.
data: pd.DataFrame = (
    pd.read_csv(f"{INPUT_DIR}/Data_With_Features_Syntax.csv")
    .query("art_lang == 'fr'")
    .drop(columns=COLUMNS_TO_DROP)
    .assign(average_word_sentence=lambda frame: frame["average_word_sentence"].fillna(0))
)

# The raw articles are only used here to know which rows are French.
data_normal = pd.read_json(f"{INPUT_DIR}/Data.json")
is_french = data_normal["art_lang"] == "fr"

# NOTE(review): the mask is aligned on the index, so TF-IDF.csv and BOW.csv
# are assumed to have one row per article of Data.json, in the same order.
TF_IDF: pd.DataFrame = pd.read_csv(f"{INPUT_DIR}/TF-IDF.csv").loc[is_french].copy()

bow: pd.DataFrame = pd.read_csv(f"{INPUT_DIR}/BOW.csv").loc[is_french].copy()

# Function

In [101]:
def preprocessing(sentence : str) -> str:
  """Clean one text before it is turned into a word cloud.

    Digits are replaced by spaces, the text is lower-cased, a fixed list of
    punctuation marks and symbols is replaced by spaces, then the text is
    tokenized with NLTK and the French stop words are removed.

    Parameters In:
        sentence : the text to clean. A value that is not a string is
            converted with str(); a missing value (None or a float NaN) is
            treated as an empty text instead of producing the word
            "none" / "nan".

    Parameters Out :
        final_list : the kept tokens, each one followed by a single space
            ("" when no token is kept)
  """
  # Missing values are handled first: str(None) / str(nan) would otherwise
  # end up as a real word in the result. "x != x" is only true for NaN.
  if sentence is None or (isinstance(sentence, float) and sentence != sentence):
    return ""
  if not isinstance(sentence, str):
    sentence = str(sentence)

  stop_words = set(stopwords.words('french'))
  pattern = ['(', ')', ':', ';', ',', '&', '/', '"', "'",'\n','©','n°','-']

  sentence = re.sub(r'\d',' ',sentence).lower()
  for symbol in pattern:
    sentence = sentence.replace(symbol," ")

  kept_words = [w for w in word_tokenize(sentence) if w not in stop_words]

  # One pass instead of repeated string concatenation; the trailing space
  # after every token (including the last one) is kept on purpose so the
  # output is unchanged for existing callers.
  return "".join(word + " " for word in kept_words)

In [102]:
def generate_wordcloud_article(text):
  """Display the word cloud of one article.

    The article is cleaned with preprocessing() first; when nothing is left
    after cleaning, a message is printed and no figure is drawn.

    Parameters In:
        text : the article that will be shown on the word cloud
  """
  cleaned = preprocessing(text)

  # Nothing left to draw: report it and stop here.
  if len(cleaned) <= 0:
    print("The article have a size of 0.")
    return

  cloud = WordCloud(relative_scaling=1.0).generate(str(cleaned))
  plt.imshow(cloud)
  plt.axis("off")
  plt.show()
  plt.close()

In [103]:
def generate_wordcloud_list_article(text):
  """Display one word cloud built from several articles.

    All the articles are joined (separated by a space, so that the last word
    of an article is not glued to the first word of the next one) and the
    resulting text is handed to generate_wordcloud_article().

    Parameters In:
        text : the articles that will be shown on the word cloud. Any
            iterable of texts is accepted (list, pandas Series, ...); a
            single string is treated as one article. Missing entries
            (None or a float NaN) are skipped.
  """
  # A bare string would otherwise be iterated character by character.
  if isinstance(text, str):
    text = [text]

  # Iterating on the values (instead of indexing with text[i]) also works
  # for a pandas Series whose index does not start at 0.
  articles = [
      str(article)
      for article in text
      if not (article is None or (isinstance(article, float) and article != article))
  ]
  generate_wordcloud_article(" ".join(articles))

In [104]:
def graph_2d(data,prediction,dic_values,transparency = 0.15,title = 'Titre'):
    """Scatter plot of the data on its first two principal components.

    The data is standardized, projected with a PCA, and one group of points
    is drawn per entry of dic_values.

    Parameters In:
        data : the data that will be shown on the plot (numeric, one row per point)
        prediction : one value per row of data, used to split the points
            into classes on the plot (list, array or pandas Series)
        dic_values : the meaning of every value of the prediction, as
            {legend label: prediction value}
        transparency : the transparency (alpha) of the points on the graph
        title : the title of the figure

    Raises:
        ValueError : when data has fewer than 2 rows or 2 columns, because
            two principal components cannot be computed
    """
    X = StandardScaler().fit_transform(data)

    # PCA cannot extract more components than min(n_samples, n_features):
    # keep the historical 10 components when possible, fewer otherwise.
    n_components = min(10, *X.shape)
    if n_components < 2:
        raise ValueError("graph_2d needs at least 2 rows and 2 columns in data")
    X = PCA(n_components=n_components).fit_transform(X)

    # Work on a plain array: comparing a list to a scalar gives a single
    # False instead of a mask, and positions (not index labels) must be used.
    labels = pd.Series(prediction).to_numpy()

    plt.title(title)
    for name_modal, value in dic_values.items():
        mask = labels == value
        plt.scatter(X[mask, 0], X[mask, 1], alpha = transparency, label = name_modal)
    plt.legend()

In [125]:
def graph_3d(data,prediction, axes=(0,1,2), opacity=1):
    """Interactive 3D scatter plot of the data on three principal components.

    The data is standardized, projected with a PCA, and the points are
    coloured according to prediction.

    Parameters In:
        data : the data that will be shown on the plot (numeric, one row per point)
        prediction : one value per row of data, used to colour the points
            (list, array or pandas Series; matched to the rows by position)
        axes : the three principal components (0-based) used as x, y and z
        opacity : the opacity of the points, between 0 and 1. This value is
            now really applied (it used to be ignored in favour of 0.5).

    Raises:
        ValueError : when one of the requested axes is not available
            because data has too few rows or columns
    """
    X = StandardScaler().fit_transform(data)

    # PCA cannot extract more components than min(n_samples, n_features):
    # keep the historical 10 components when possible, fewer otherwise.
    n_components = min(10, *X.shape)
    if max(axes) >= n_components:
        raise ValueError(
            f"axes={tuple(axes)} needs {max(axes) + 1} principal components, "
            f"only {n_components} can be computed from data"
        )
    X = PCA(n_components=n_components).fit_transform(X)

    data_plot = pd.DataFrame(X)
    # Assign by position: a Series would be aligned on its index, which
    # produces NaN when that index is not 0..n-1.
    data_plot["prediction"] = pd.Series(prediction).to_numpy()
    fig = px.scatter_3d(data_plot, x=axes[0], y=axes[1], z=axes[2], color='prediction', opacity=opacity)
    fig.show()

# Example usage of the functions

In [154]:
# Legend label -> prediction value, in the form expected by the dic_values
# argument of graph_2d.
dic_values = { "normal" : 1 , "attack" : -1}

**LOF**

In [155]:
# Local Outlier Factor on the bag-of-words matrix, with a small
# neighbourhood of 3 points.
clf = LocalOutlierFactor(contamination=0.1, n_neighbors=3)
y = clf.fit_predict(bow)
# fit_predict returns -1 or 1; they are shown as "New" and "Old" on the plot.
pred = pd.Series(y).map({-1: "New", 1: "Old"})
graph_3d(bow, pred)

In [156]:
# Same Local Outlier Factor run, this time with a neighbourhood of 20 points.
clf = LocalOutlierFactor(contamination=0.1, n_neighbors=20)
y = clf.fit_predict(bow)
# fit_predict returns -1 or 1; they are shown as "New" and "Old" on the plot.
pred = pd.Series(y).map({-1: "New", 1: "Old"})
graph_3d(bow, pred)

**Isolation Forest**

In [157]:
# Isolation Forest on the bag-of-words matrix. The forest is built from
# random splits, so the seed is fixed to get the same flagged points (and
# the same figure) on every run.
clf = IsolationForest(contamination = 0.005, random_state = 42)
y = clf.fit_predict(bow)
# fit_predict returns -1 or 1; they are shown as "New" and "Old" on the plot.
pred = pd.Series(y).replace([-1,1],["New","Old"])
graph_3d(bow,pred)