Created on Friday 8 January 2021

**Group 5 - Classification**  
**Prediction unsupervised v1**

@authors : Theo Vedis, Valentine Rossi, Maylin Vuillaume

We try to detect articles that bring new information. Here we use a representation provided by group 3, GloVe. We use three unsupervised models: Local Outlier Factor (LOF), Isolation Forest and One-Class Support Vector Machine (One-Class SVM).


# Import

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Create link between drive and notebook

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; every data path used below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


# Functions

In [None]:
# Representation of a PCA in 3D
def graph_3d(data: pd.DataFrame, prediction: pd.Series, name=("New", "Old"), axes=(0,1,2), opacity=(1, 0.4)) -> None:
    """Show the observations in 3D on PCA axes, one trace per predicted class.

    The data is standardised, projected with a PCA, and displayed as an
    interactive Plotly figure made of two ``Scatter3d`` traces.

    Parameters:
        data: the data that will be shown on the plot (one row per observation)
        prediction: the prediction used to distinguish the classes on the plot;
            -1 is labelled ``name[0]`` and 1 is labelled ``name[1]``. It is matched
            to the rows of ``data`` by position, not by index label.
        name: labels of the two classes, in the order (label for -1, label for 1)
        axes: indices of the three principal components used as x, y and z
        opacity: opacity of the points of each class, in the same order as ``name``

    Raises:
        ValueError: if ``axes`` asks for a principal component that cannot be
            computed from ``data``, or if ``prediction`` and ``data`` do not
            have the same length.
    """
    scaled: np.ndarray = StandardScaler().fit_transform(data)

    # PCA cannot extract more components than min(n_samples, n_features), so the
    # usual 10 components (or more when `axes` needs them) are capped accordingly.
    highest_axis: int = max(axes)
    n_components: int = min(max(10, highest_axis + 1), *scaled.shape)
    if highest_axis >= n_components:
        raise ValueError(
            f"axes={axes} needs {highest_axis + 1} principal components "
            f"but only {n_components} can be computed from the data"
        )
    components: np.ndarray = PCA(n_components=n_components).fit_transform(scaled)

    data_plot: pd.DataFrame = pd.DataFrame(components)
    # Attach the labels by position: `data_plot` has a fresh 0..n-1 index, so
    # assigning a Series carrying another index would silently produce NaN.
    labels: pd.Series = pd.Series(np.asarray(prediction)).replace([-1, 1], list(name))
    data_plot["prediction"] = labels.to_numpy()

    # The name[1] class ("Old" by default) is added first, then name[0] ("New").
    traces: list = []
    for label, alpha in ((name[1], opacity[1]), (name[0], opacity[0])):
        subset: pd.DataFrame = data_plot[data_plot["prediction"] == label]
        traces.append(
            go.Scatter3d(
                x=subset[axes[0]],
                y=subset[axes[1]],
                z=subset[axes[2]],
                name=label,
                mode='markers',
                marker=dict(size=6),
                opacity=alpha,
            )
        )

    fig: go.Figure = go.Figure(traces)
    fig.show()

In [None]:
# Outlier prediction with an Isolation Forest model
def duplicate_prediction(data_path: str) -> np.ndarray:
    """Load a data file and flag its outlier rows with an Isolation Forest.

    Parameters:
        data_path: path to data; the file extension (case-insensitive) must be
            csv or json

    Out:
        prediction: array of predictions, one per row (-1 : News, 1: Common)

    Raises:
        ValueError: if the file extension is neither csv nor json
    """
    # Local import so the function does not depend on the notebook's import cell.
    from pathlib import Path

    # Load data: pick the pandas reader from the file extension.
    readers: dict = {"csv": pd.read_csv, "json": pd.read_json}
    # Path.suffix only looks at the file name, so a dot in a directory name
    # is not mistaken for an extension.
    extension: str = Path(data_path).suffix.lstrip(".").lower()
    if extension not in readers:
        raise ValueError("Data format error, " + extension + " not supported (only csv or json)")
    data: pd.DataFrame = readers[extension](data_path)

    # Prediction (fixed seed so the result is reproducible)
    clf: IsolationForest = IsolationForest(random_state=0, contamination=0.005)
    prediction: np.ndarray = clf.fit_predict(data)

    return prediction

In [None]:
# Run the Isolation Forest prediction on the BOW.csv input file; the returned array is the cell output.
duplicate_prediction("/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Input/BOW.csv")

array([1, 1, 1, ..., 1, 1, 1])

# Load Data

In [None]:
# GloVe
# Read the GloVe representation and drop the three columns that are not
# passed to the models below.
glove_v1: pd.DataFrame = pd.read_json(
    "/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Input/GloVe_v1.json"
).drop(columns=["art_id", "art_content_clean_without_lem", "spacy_vector"])

# Representation Glove

## Local Outlier Factor (LOF)

In [None]:
# Prediction Glove with LOF
# contamination sets the expected share of outliers (3 % here).
clf: LocalOutlierFactor = LocalOutlierFactor(contamination=0.03)
# fit_predict fits on glove_v1 and labels the same rows: -1 for outliers, 1 for inliers.
pred_glove_lof: np.ndarray = clf.fit_predict(glove_v1)
# Number of rows per label.
pd.Series(pred_glove_lof).value_counts()

 1    7307
-1     226
dtype: int64

In [None]:
# 3D PCA view of the GloVe data split by the LOF labels (-1 shown as "New", 1 as "Old" by default).
graph_3d(glove_v1, pd.Series(pred_glove_lof))

## Isolation Forest

In [None]:
# Prediction Glove with Isolation Forest
# contamination sets the expected share of outliers (1.5 % here).
# The seed is fixed because an Isolation Forest is built from random splits:
# without it, the articles flagged as outliers change from one run to the next.
clf: IsolationForest = IsolationForest(contamination=0.015, random_state=0)
# fit_predict fits on glove_v1 and labels the same rows: -1 for outliers, 1 for inliers.
pred_glove_forest: np.ndarray = clf.fit_predict(glove_v1)
# Number of rows per label.
pd.Series(pred_glove_forest).value_counts()

 1    7420
-1     113
dtype: int64

In [None]:
# 3D PCA view of the GloVe data split by the Isolation Forest labels (-1 shown as "New", 1 as "Old" by default).
graph_3d(glove_v1, pd.Series(pred_glove_forest))

## One-Class Support Vector Machine (One-Class SVM)

In [None]:
# Prediction Glove with One-Class SVM
# Build the estimator first, then fit it and label the same rows in a single
# fit_predict call; one_svm is left fitted for later use.
one_svm: OneClassSVM = OneClassSVM(gamma='auto', nu=0.03)
pred_svm_glove: np.ndarray = one_svm.fit_predict(glove_v1)
# Number of rows per label.
pd.Series(pred_svm_glove).value_counts()

 1    7306
-1     227
dtype: int64

In [None]:
# 3D PCA view of the GloVe data split by the One-Class SVM labels; axes=(0,1,2) is also graph_3d's default.
graph_3d(glove_v1, pd.Series(pred_svm_glove), axes=(0,1,2))