Created on Thursday 7 January 2021

**Group 5 - Classification**  
**Prediction unsupervised v0.1**

@authors : Theo Vedis, Valentine Rossi, Maylin Vuillaume

Here, we perform unsupervised outlier detection on a representation produced by Group 3 (Bag-of-Words (BoW)) using Local Outlier Factor (LOF), Isolation Forest and One-Class Support Vector Machine (One-Class SVM). We visualize the results with a PCA projection.


# Import

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Create link between drive and notebook

In [None]:
# Mount Google Drive at /content/drive so the input files can be read by path
# (this cell only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Functions

In [None]:
# 3D scatter plot of a PCA projection, coloured by predicted class
def graph_3d(data: pd.DataFrame, prediction: pd.Series, name=("New", "Old"), axes=(0,1,2), opacity=(1, 0.4)):
    """Plot a 3D PCA projection of ``data`` with one trace per predicted class.

    The features are standardised, projected with a PCA, and drawn as two
    Plotly ``Scatter3d`` traces that are shown immediately.

    Parameters:
        data: feature table to project, one row per sample
        prediction: one label per row of ``data`` (-1 or 1), matched to the
            rows by position
        name: display labels, ``name[0]`` for -1 and ``name[1]`` for 1
        axes: indices of the three principal components used as x, y and z
        opacity: marker opacity, ``opacity[0]`` for the ``name[0]`` trace and
            ``opacity[1]`` for the ``name[1]`` trace

    Raises:
        ValueError: if one of ``axes`` is not an available principal component
    """
    scaled: np.ndarray = StandardScaler().fit_transform(data)

    # Keep the historical 10 components when the data allows it, but never ask
    # for more components than there are rows or columns (PCA would raise).
    n_components: int = min(max(10, max(axes) + 1), *scaled.shape)
    if max(axes) >= n_components:
        raise ValueError(
            f"axes={tuple(axes)} needs {max(axes) + 1} principal components, "
            f"but only {n_components} can be computed from data of shape {scaled.shape}"
        )

    components: np.ndarray = PCA(n_components=n_components).fit_transform(scaled)
    data_plot: pd.DataFrame = pd.DataFrame(components)

    # Assign labels by position: aligning on the index of `prediction` would
    # silently produce NaN labels if it does not share data_plot's RangeIndex.
    labels: pd.Series = pd.Series(np.asarray(prediction)).replace([-1, 1], list(name))
    data_plot["prediction"] = labels.to_numpy()

    # The name[1] trace is drawn first, then name[0], as in the legend order.
    traces: list = []
    for label, alpha in ((name[1], opacity[1]), (name[0], opacity[0])):
        subset: pd.DataFrame = data_plot[data_plot["prediction"] == label]
        traces.append(
            go.Scatter3d(
                x=subset[axes[0]],
                y=subset[axes[1]],
                z=subset[axes[2]],
                name=label,
                mode='markers',
                marker=dict(size=6),
                opacity=alpha,
            )
        )

    fig: go.Figure = go.Figure(traces)
    fig.show()

In [None]:
# Outlier prediction with an Isolation Forest model
def duplicate_prediction(data_path: str, contamination: float = 0.005) -> np.ndarray:
    """Load a dataset and label each row with an Isolation Forest.

    Parameters:
        data_path: path to the data file; the extension (case-insensitive)
            selects the loader: ``csv`` or ``json``
        contamination: expected proportion of outliers passed to the
            Isolation Forest

    Out:
        prediction: array with one label per row (-1 : News, 1: Common)

    Raises:
        ValueError: if the file extension is neither csv nor json
    """
    # Pick the pandas reader from the file extension
    readers: dict = {"csv": pd.read_csv, "json": pd.read_json}
    extension: str = data_path.rsplit(".", 1)[-1].lower()
    reader = readers.get(extension)
    if reader is None:
        raise ValueError("Data format error, " + extension + " not supported (only csv or json)")

    # Load data
    data: pd.DataFrame = reader(data_path)

    # Prediction (fixed random_state keeps the labels reproducible)
    clf: IsolationForest = IsolationForest(random_state=0, contamination=contamination)
    prediction: np.ndarray = clf.fit_predict(data)

    return prediction

In [None]:
# Run the Isolation Forest prediction on the BOW.csv input; the cell displays the returned label array.
duplicate_prediction("/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Input/BOW.csv")

array([1, 1, 1, ..., 1, 1, 1])

# Load Data

In [None]:
# Bag-of-Words (BoW)
# Read the Group 3 export and discard the `art_id` and
# `art_content_clean_without_lem` columns so they are not used as model features.
bow_g3: pd.DataFrame = pd.read_json(
    "/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Input/g3_BOW_v1.json"
).drop(columns=["art_id", "art_content_clean_without_lem"])

# Representation Bag-of-Words (BoW) - Group 3

## Local Outlier Factor (LOF)

In [None]:
# Prediction BoW-G3 with LOF
# Fit a Local Outlier Factor model (2 neighbours, contamination 0.1) on the BoW table.
# fit_predict returns one label per row: -1 for rows flagged as outliers, 1 otherwise
# (scikit-learn convention).
clf: LocalOutlierFactor = LocalOutlierFactor(n_neighbors=2, contamination=0.1)
pred_lof_g3: np.ndarray = clf.fit_predict(bow_g3)
# Count how many rows received each label
pd.Series(pred_lof_g3).value_counts()

 1    6780
-1     753
dtype: int64

In [None]:
# Show the LOF labels on the 3D PCA projection (principal components 0, 1 and 2)
graph_3d(bow_g3, pd.Series(pred_lof_g3), axes=(0,1,2))

## Isolation Forest

In [None]:
# Prediction BoW-G3 with Isolation Forest
# Fit an Isolation Forest (contamination 0.015, fixed random_state for reproducibility)
# on the BoW table. fit_predict returns one label per row: -1 for rows flagged as
# outliers, 1 otherwise (scikit-learn convention).
clf: IsolationForest = IsolationForest(random_state=0, contamination=0.015)
pred_forest_g3: np.ndarray = clf.fit_predict(bow_g3)
# Count how many rows received each label
pd.Series(pred_forest_g3).value_counts()

 1    7420
-1     113
dtype: int64

In [None]:
# Show the Isolation Forest labels on the 3D PCA projection (principal components 0, 1 and 2)
graph_3d(bow_g3, pd.Series(pred_forest_g3), axes=(0,1,2))

## One-Class Support Vector Machine (One-Class SVM)

In [None]:
# Prediction BoW-G3 with One SVM
# Fit a One-Class SVM (gamma='auto', nu=0.03) on the BoW table, then label the same
# rows it was fitted on: -1 for rows flagged as outliers, 1 otherwise
# (scikit-learn convention).
one_svm: OneClassSVM = OneClassSVM(gamma='auto', nu = 0.03).fit(bow_g3)
pred_svm_g3: np.ndarray = one_svm.predict(bow_g3)
# Count how many rows received each label
pd.Series(pred_svm_g3).value_counts()

 1    7305
-1     228
dtype: int64

In [None]:
# Show the One-Class SVM labels on the 3D PCA projection (principal components 0, 1 and 2)
graph_3d(bow_g3, pd.Series(pred_svm_g3), axes=(0,1,2))