Created: Mon Jan  4 12:31:07 2021

**Group 5 - Classification**  
**Example of a notebook according to the quality charter** (Title)

@authors : T.V

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

# Create link between drive and notebook

In [None]:
# Colab-only step: mount Google Drive so the files under /content/drive can be read by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# import

Note: Import only the modules used in the notebook.

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM

In [None]:
def graph_3d(data: pd.DataFrame, prediction: pd.Series, name: tuple=("New", "Old"), axes: tuple=(0,1,2), opacity: tuple=(1, 0.4)):
    """Show a 3D PCA scatter plot of ``data`` with points styled by predicted class.

    The features are standardised, projected with PCA, and the three requested
    components are drawn as two traces: rows labelled ``1`` (``name[1]``) first,
    then rows labelled ``-1`` (``name[0]``). Rows carrying any other label are
    not drawn. The figure is displayed with ``fig.show()``; nothing is returned.

    Parameters:
        data : feature table to project, one row per point
        prediction : one label per row of ``data`` (-1 or 1), matched by position
        name : legend names, as (name for label -1, name for label 1)
        axes : indices of the three PCA components used as x, y and z
        opacity : trace opacities, as (opacity for label -1, opacity for label 1)

    Raises:
        ValueError : if ``axes`` does not hold three usable component indices,
            or if ``prediction`` and ``data`` have a different number of rows
    """
    if len(axes) != 3:
        raise ValueError("axes must contain exactly three PCA component indices")

    # Labels are matched to rows by position. Assigning a Series to a DataFrame
    # column aligns on the index instead, which silently yields NaN labels when
    # the Series does not carry a default 0..n-1 index.
    labels: np.ndarray = np.asarray(prediction)
    if len(labels) != len(data):
        raise ValueError(
            "prediction has " + str(len(labels)) + " labels but data has " + str(len(data)) + " rows"
        )

    scaled: np.ndarray = StandardScaler().fit_transform(data)

    # 10 components as before, more when `axes` asks for a higher index, but
    # never more than PCA can extract from the data (min of rows and columns).
    n_components: int = min(max(10, max(axes) + 1), *scaled.shape)
    if min(axes) < 0 or max(axes) >= n_components:
        raise ValueError(
            "axes " + str(tuple(axes)) + " out of range: only " + str(n_components) + " PCA components available"
        )
    projected: np.ndarray = PCA(n_components=n_components).fit_transform(scaled)

    traces: list = []
    # The label-1 trace is added first so that trace/legend order stays the same as before.
    for label_value, legend, alpha in ((1, name[1], opacity[1]), (-1, name[0], opacity[0])):
        points: np.ndarray = projected[labels == label_value]
        traces.append(
            go.Scatter3d(
                x=points[:, axes[0]],
                y=points[:, axes[1]],
                z=points[:, axes[2]],
                name=legend,
                mode='markers',
                marker=dict(size=6),
                opacity=alpha,
            )
        )

    fig: go.Figure = go.Figure(traces)
    # Name the axes so the figure can be read on its own.
    fig.update_layout(
        scene=dict(
            xaxis_title="PCA component " + str(axes[0]),
            yaxis_title="PCA component " + str(axes[1]),
            zaxis_title="PCA component " + str(axes[2]),
        )
    )
    fig.show()




In [None]:
def duplicate_prediction(data_path: str) -> np.ndarray:
    """Load a feature table and label each row with an Isolation Forest.

    Parameters:
        data_path : path to a ``.csv`` or ``.json`` file (extension is
            matched case-insensitively)

    Out:
        prediction : array with one label per row (-1 : News, 1: Common)

    Raises:
        ValueError : if the file extension is neither csv nor json
    """
    # One reader per supported extension; add an entry here to support a new format.
    readers: dict = {"csv": pd.read_csv, "json": pd.read_json}

    # Path.suffix only looks at the file name, so a dot in a directory name
    # (e.g. "my.folder/file") is not mistaken for an extension.
    extension: str = Path(data_path).suffix.lstrip(".").lower()
    reader = readers.get(extension)
    if reader is None:
        raise ValueError("Data format error, " + extension + " not supported (only csv or json)")

    # Load data
    data: pd.DataFrame = reader(data_path)

    # Prediction (fixed seed so repeated calls on the same file give the same labels)
    clf: IsolationForest = IsolationForest(random_state=0, contamination=0.015)
    prediction: np.ndarray = clf.fit_predict(data)

    return prediction

In [None]:
# Try the helper on the bag-of-words file; the cell displays the returned label array.
duplicate_prediction(
    data_path="/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Input/BOW.csv"
)

array([1, 1, 1, ..., 1, 1, 1])

# Load Data

In [None]:
# Folder on the mounted Drive that holds every input file read by this cell.
INPUT_DIR: str = "/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Input/"

# Columns removed from the syntactic-features table before it is used for outlier detection.
FEATURE_COLUMNS_TO_DROP: list = [
    "art_id", "art_content", "art_content_html", "art_extract_datetime", "art_lang", "art_title",
    "art_url", "src_name", "src_type", "src_url", "src_img", "art_auth", "art_tag",
    "title_postive_score", "title_negative_score", "title_polarity_score",
    "title_subjectivity_score", "netloc.com", "ratio_word_title_on_word",
]

# Syntactic features: keep French articles only, drop the unused columns and
# fill missing sentence-length averages with 0. Built as one chain (no
# inplace=True) so the filtered frame is never mutated, which avoids pandas'
# SettingWithCopyWarning.
data: pd.DataFrame = (
    pd.read_csv(INPUT_DIR + "Data_With_Features_Syntax.csv")
    .query("art_lang == 'fr'")
    .drop(columns=FEATURE_COLUMNS_TO_DROP)
    .assign(average_word_sentence=lambda df: df["average_word_sentence"].fillna(0))
)

# The raw article table is only used here to know which rows are French.
data_normal = pd.read_json(INPUT_DIR + "Data.json")
is_french: pd.Series = data_normal["art_lang"] == "fr"

# TF-IDF and BOW are filtered with the same mask, aligned on the row index.
# NOTE(review): this assumes both files share their row index with Data.json - confirm.
tf_idf: pd.DataFrame = pd.read_csv(INPUT_DIR + "TF-IDF.csv").loc[is_french]

bow: pd.DataFrame = pd.read_csv(INPUT_DIR + "BOW.csv").loc[is_french]

bow_g3: pd.DataFrame = pd.read_json(INPUT_DIR + "g3_BOW_v1.json").drop(
    columns=["art_id", "art_content_clean_without_lem"]
)

glove_v1: pd.DataFrame = pd.read_json(INPUT_DIR + "GloVe_v1.json").drop(
    columns=["art_id", "art_content_clean_without_lem", "spacy_vector"]
)

# Data (Features)

## LOF

Utilisation de LocalOutlierFactor sur les features syntaxiques faites par notre groupe

In [None]:
# Label every row of the syntactic features with LOF (scikit-learn returns -1 for outliers, 1 for inliers).
clf: LocalOutlierFactor = LocalOutlierFactor(
    n_neighbors=2,
    contamination=0.01,
)
pred_features_lof: np.ndarray = clf.fit_predict(data)

# Display the number of rows per label.
pd.Series(pred_features_lof).value_counts()

 1    12024
-1      122
dtype: int64

In [None]:
# NOTE(review): the labels were fitted on `data` but the points plotted come from `bow` - confirm this is intended.
graph_3d(
    data=bow,
    prediction=pd.Series(pred_features_lof),
    axes=(0, 1, 2),
)

## Isolation Forest

Utilisation de IsolationForest sur les features syntaxiques faites par notre groupe  
Distance / Nouveau / doublons / atypique / anomalie

In [None]:
# Isolation Forest on the syntactic features (`data`), with a fixed seed.
clf: IsolationForest = IsolationForest(
    n_estimators=100,
    max_samples=1000,
    max_features=0.9,
    contamination=0.005,
    random_state=0,
)
pred_features_forest: np.ndarray = clf.fit_predict(data)

# Display the number of rows per label.
pd.Series(pred_features_forest).value_counts()

 1    12085
-1       61
dtype: int64

In [None]:
# NOTE(review): the labels were fitted on `data` but the points plotted come from `bow` - confirm this is intended.
graph_3d(data=bow, prediction=pd.Series(pred_features_forest))

# tf_idf

## LOF

Utilisation de LocalOutlierFactor sur les TF-IDF

In [None]:
# LOF on the TF-IDF matrix only (the syntactic features are not part of this fit).
clf: LocalOutlierFactor = LocalOutlierFactor(
    n_neighbors=3,
    contamination=0.1,
)
pred_tf_lof: np.ndarray = clf.fit_predict(tf_idf)

# Display the number of rows per label.
pd.Series(pred_tf_lof).value_counts()

 1    10931
-1     1215
dtype: int64

In [None]:
# NOTE(review): the labels were fitted on `tf_idf` but the points plotted come from `bow` - confirm this is intended.
graph_3d(data=bow, prediction=pd.Series(pred_tf_lof))

## Isolation Forest

Utilisation de IsolationForest sur les TF-IDF

In [None]:
# Isolation Forest on the TF-IDF matrix, with a fixed seed.
clf: IsolationForest = IsolationForest(
    contamination=0.005,
    random_state=0,
)
pred_tf_forest: np.ndarray = clf.fit_predict(tf_idf)

# Display the number of rows per label.
pd.Series(pred_tf_forest).value_counts()

 1    12102
-1       44
dtype: int64

In [None]:
# NOTE(review): the labels were fitted on `tf_idf` but the points plotted come from `bow` - confirm this is intended.
graph_3d(data=bow, prediction=pd.Series(pred_tf_forest))

# bow

## LOF

Utilisation de LocalOutlierFactor sur le BOW

In [None]:
# LOF on the bag-of-words matrix.
clf: LocalOutlierFactor = LocalOutlierFactor(
    n_neighbors=3,
    contamination=0.1,
)
pred_bow_lof: np.ndarray = clf.fit_predict(bow)

# Display the number of rows per label.
pd.Series(pred_bow_lof).value_counts()

 1    10931
-1     1215
dtype: int64

In [None]:
# Plot the bag-of-words rows, styled by their LOF label.
graph_3d(data=bow, prediction=pd.Series(pred_bow_lof))

## Isolation Forest

Utilisation de IsolationForest sur le BOW

In [None]:
# Isolation Forest on the bag-of-words matrix, with a fixed seed.
clf: IsolationForest = IsolationForest(
    contamination=0.005,
    random_state=0,
)
pred_bow_forest: np.ndarray = clf.fit_predict(bow)

# Display the number of rows per label.
pd.Series(pred_bow_forest).value_counts()

 1    12085
-1       61
dtype: int64

In [None]:
# Plot the bag-of-words rows, styled by their Isolation Forest label.
graph_3d(data=bow, prediction=pd.Series(pred_bow_forest))

# BOW - G3

## SVM

In [None]:
# One-class SVM on the G3 bag-of-words: fit first, then label the same rows.
one_svm: OneClassSVM = OneClassSVM(gamma='auto', nu=0.03)
one_svm.fit(bow_g3)
pred_svm_g3: np.ndarray = one_svm.predict(bow_g3)

# Display the number of rows per label.
pd.Series(pred_svm_g3).value_counts()

 1    7305
-1     228
dtype: int64

In [None]:
# Plot the G3 bag-of-words rows, styled by their one-class SVM label.
graph_3d(
    data=bow_g3,
    prediction=pd.Series(pred_svm_g3),
    axes=(0, 1, 2),
)

## Isolation Forest

In [None]:
# Isolation Forest on the G3 bag-of-words, with a fixed seed.
clf: IsolationForest = IsolationForest(
    contamination=0.015,
    random_state=0,
)
pred_forest_g3: np.ndarray = clf.fit_predict(bow_g3)

# Display the number of rows per label.
pd.Series(pred_forest_g3).value_counts()

 1    7420
-1     113
dtype: int64

In [None]:
# Plot the G3 bag-of-words rows, styled by their Isolation Forest label.
graph_3d(
    data=bow_g3,
    prediction=pd.Series(pred_forest_g3),
    axes=(0, 1, 2),
)

## LOF

In [None]:
# LOF on the G3 bag-of-words.
clf: LocalOutlierFactor = LocalOutlierFactor(
    n_neighbors=2,
    contamination=0.1,
)
pred_lof_g3: np.ndarray = clf.fit_predict(bow_g3)

# Display the number of rows per label.
pd.Series(pred_lof_g3).value_counts()

 1    6780
-1     753
dtype: int64

In [None]:
# Plot the G3 bag-of-words rows, styled by their LOF label.
graph_3d(
    data=bow_g3,
    prediction=pd.Series(pred_lof_g3),
    axes=(0, 1, 2),
)

# Glove

## LOF

In [None]:
# LOF on the GloVe table; n_neighbors is left at the library default.
clf: LocalOutlierFactor = LocalOutlierFactor(
    contamination=0.03,
)
pred_glove_lof: np.ndarray = clf.fit_predict(glove_v1)

# Display the number of rows per label.
pd.Series(pred_glove_lof).value_counts()

 1    7307
-1     226
dtype: int64

In [None]:
# Plot the GloVe rows, styled by their LOF label.
graph_3d(data=glove_v1, prediction=pd.Series(pred_glove_lof))

## Isolation Forest

In [None]:
# Isolation Forest on the GloVe table.
# random_state is pinned, as in the other IsolationForest cells of this
# notebook, so that re-running the cell flags the same rows every time.
clf: IsolationForest = IsolationForest(random_state=0, contamination=0.015)
pred_glove_forest: np.ndarray = clf.fit_predict(glove_v1)

# Display the number of rows per label.
pd.Series(pred_glove_forest).value_counts()

 1    7420
-1     113
dtype: int64

In [None]:
# Plot the GloVe rows, styled by their Isolation Forest label.
graph_3d(data=glove_v1, prediction=pd.Series(pred_glove_forest))

## SVM

In [None]:
# One-class SVM on the GloVe table: fit first, then label the same rows.
one_svm: OneClassSVM = OneClassSVM(gamma='auto', nu=0.03)
one_svm.fit(glove_v1)
pred_svm_glove: np.ndarray = one_svm.predict(glove_v1)

# Display the number of rows per label.
pd.Series(pred_svm_glove).value_counts()

 1    7306
-1     227
dtype: int64

In [None]:
# Plot the GloVe rows, styled by their one-class SVM label.
graph_3d(
    data=glove_v1,
    prediction=pd.Series(pred_svm_glove),
    axes=(0, 1, 2),
)