Created on Friday 8 January 2021

**Group 5 - Classification**  
**Prediction unsupervised v2**

@authors : Alice Airault

We try to detect articles that convey new information. Here we use the word2vec representation provided by group 3. We use three unsupervised models: Local Outlier Factor (LOF), Isolation Forest and One-Class Support Vector Machine (One-Class SVM).


# Import

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Create link between drive and notebook

In [2]:
from google.colab import drive
# Mount Google Drive under /content/drive; the data loaded further down is read from that path.
drive.mount('/content/drive')

Mounted at /content/drive


# Functions

In [3]:
# Representation of a PCA in 3D
def graph_3d(data: pd.DataFrame, prediction: pd.Series, name=("New", "Old"), axes=(0,1,2), opacity=(1, 0.4)):
    """Show a 3D scatter plot of the PCA projection of `data`, split by prediction.

    The data is standardised, projected with a PCA, and three of the resulting
    components are plotted. Points are drawn as two traces, one per class.

    Parameters:
        data: the data that will be shown on the plot (one row per point)
        prediction: one label per row of `data`, in the same order; -1 is
            displayed as name[0] and 1 as name[1]
        name: legend labels for the classes (-1, 1)
        axes: indices of the three PCA components used as x, y and z
        opacity: transparency of the points, as (name[0] trace, name[1] trace)

    Raises:
        ValueError: if `prediction` and `data` have different lengths, or if
            `axes` refers to a component that is not available.
    """
    # Labels are matched to rows by position: the projected frame below gets a
    # fresh 0..n-1 index, so label-based alignment with the caller's index
    # could silently misplace or drop labels.
    labels: np.ndarray = np.asarray(prediction)
    if len(labels) != len(data):
        raise ValueError(
            "prediction has " + str(len(labels)) + " values but data has "
            + str(len(data)) + " rows"
        )

    sc: StandardScaler = StandardScaler()
    X: np.ndarray = sc.fit_transform(data)

    # PCA cannot return more components than min(n_samples, n_features);
    # keep the usual 10 components whenever the data allows it.
    n_components: int = min(10, *X.shape)
    if not all(0 <= axis < n_components for axis in axes):
        raise ValueError(
            "axes must be component indices between 0 and " + str(n_components - 1)
        )

    pca: PCA = PCA(n_components=n_components)
    X = pca.fit_transform(X)

    data_plot: pd.DataFrame = pd.DataFrame(X)
    data_plot["prediction"] = pd.Series(labels, index=data_plot.index).replace([-1, 1], list(name))

    data_plot_new: pd.DataFrame = data_plot[data_plot["prediction"] == name[0]]
    data_plot_old: pd.DataFrame = data_plot[data_plot["prediction"] == name[1]]

    # The name[1] trace is added first and the name[0] trace second.
    traces: list = [
        go.Scatter3d(x=data_plot_old[axes[0]], y=data_plot_old[axes[1]], z=data_plot_old[axes[2]],
                     name=name[1], mode='markers', marker=dict(size=6), opacity=opacity[1]),
        go.Scatter3d(x=data_plot_new[axes[0]], y=data_plot_new[axes[1]], z=data_plot_new[axes[2]],
                     name=name[0], mode='markers', marker=dict(size=6), opacity=opacity[0]),
    ]
    fig: go.Figure = go.Figure(traces)
    fig.show()

In [4]:
# Prediction with an Isolation Forest model on a data file
def duplicate_prediction(data_path: str, contamination: float = 0.005, random_state: int = 0) -> np.ndarray:
    """Load a csv or json file and label its rows with an Isolation Forest.

    Parameters:
        data_path: path to data; the text after the last "." selects the
            reader (csv or json, case-insensitive)
        contamination: value passed to IsolationForest(contamination=...)
        random_state: value passed to IsolationForest(random_state=...)

    Out:
        prediction: list of prediction (-1 : News, 1: Common)

    Raises:
        ValueError: if the file extension is neither csv nor json
    """
    # Load data
    extension: str = data_path.split(".")[-1]
    readers: dict = {"csv": pd.read_csv, "json": pd.read_json}
    reader = readers.get(extension.lower())
    if reader is None:
        # ValueError is a subclass of Exception, so callers catching Exception still work.
        raise ValueError("Data format error, " + extension + " not supported (only csv or json)")
    data: pd.DataFrame = reader(data_path)

    # Prediction
    clf: IsolationForest = IsolationForest(random_state=random_state, contamination=contamination)
    prediction: np.ndarray = clf.fit_predict(data)

    return prediction

# Load Data

In [5]:
# Read the word2vec representation (JSON file on the mounted Drive) and preview it
word2vec_path: str = "/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Input/word2vec_v0.json"
word2vec_v0: pd.DataFrame = pd.read_json(word2vec_path)
word2vec_v0.head()

Unnamed: 0,art_id,vect_art
0,1,"[-0.07390894740000001, 0.07067905370000001, 0...."
1,2,"[-0.0736145377, 0.0636288002, 0.0627773479, -0..."
2,25,"[-0.0734900758, 0.0640805811, 0.0574389212, -0..."
3,27,"[-0.050523147000000004, 0.0794918537, 0.050795..."
4,28,"[-0.0598673262, 0.073482275, 0.044069729700000..."


In [6]:
# Size of the list to know the number of columns in our word2vec.
# Positional access (.iloc) is used so this does not depend on the index
# containing the label 0.
size_list: int = len(word2vec_v0['vect_art'].iloc[0])

# Create a column for each item in the list, in one vectorised step rather than
# one .apply() and one column insertion per dimension.
vect_columns: pd.DataFrame = pd.DataFrame(
    word2vec_v0['vect_art'].tolist(), index=word2vec_v0.index
).iloc[:, :size_list]
# Column names are the dimension numbers as strings ("0", "1", ...)
vect_columns.columns = [str(i) for i in range(size_list)]

# Delete unnecessary columns for representation and append the vector columns
df_word2vec: pd.DataFrame = pd.concat(
    [word2vec_v0.drop(['art_id', 'vect_art'], axis=1), vect_columns], axis=1
)

In [7]:
# Preview the expanded frame: one column per word2vec dimension
df_word2vec.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499
0,-0.073909,0.070679,0.073165,-0.069208,0.026616,-0.02259,-0.071989,-0.076028,-0.033224,-0.004597,0.083297,-0.041929,0.043709,0.075784,0.018115,-0.00303,0.080063,0.034533,-0.014085,0.022441,0.018121,-0.108884,-0.026075,-0.008617,0.008888,0.040698,0.010991,-0.078238,0.138736,0.054017,0.013931,-0.03611,-0.06111,0.035954,0.020606,-0.072223,0.00726,0.049466,-0.032941,0.008207,...,0.05516,0.051303,-0.002474,-0.022025,0.00964,0.014007,0.009131,0.001318,0.083084,-0.049312,-0.020146,-0.054688,0.000782,0.019085,0.073297,-0.041075,-6.2e-05,0.083189,0.018205,0.005167,-0.012034,-0.088214,-0.050991,0.008128,0.035381,0.057362,-0.040564,-0.130879,-0.073201,0.023292,0.004137,0.015983,0.020872,-0.064975,-0.002167,0.055217,-0.072263,0.063938,0.003982,-0.059017
1,-0.073615,0.063629,0.062777,-0.073292,0.041755,0.012058,-0.088038,-0.067056,-0.025116,-0.000606,0.083126,-0.018113,0.057072,0.086671,0.014305,0.007355,0.083403,0.016191,0.001392,0.045194,0.0097,-0.096466,-0.020055,0.006165,-0.001592,0.064377,0.013699,-0.088419,0.126666,0.051394,0.002694,-0.019258,-0.049676,0.019437,0.020196,-0.082045,0.024584,0.060316,-0.049904,0.012561,...,0.031102,0.022057,0.005029,-0.006211,0.020586,0.005167,0.028764,0.01208,0.070662,-0.057491,-0.007095,-0.04713,-0.024867,0.001744,0.047591,-0.060108,-0.022331,0.09167,0.008713,0.02658,-0.024654,-0.086882,-0.051859,-0.00159,0.019366,0.035451,-0.036466,-0.115484,-0.064642,0.029927,0.022152,0.04072,0.02925,-0.036246,0.037524,0.049054,-0.064114,0.071442,0.029316,-0.047835
2,-0.07349,0.064081,0.057439,-0.075423,0.037651,-0.002264,-0.092307,-0.055835,-0.032521,0.01329,0.100901,-0.02212,0.047435,0.083168,0.01301,0.012844,0.089226,0.017863,-0.002933,0.049102,0.010592,-0.088073,-0.026377,0.006396,-0.005655,0.048325,0.015265,-0.089817,0.144428,0.054504,0.004552,-0.016781,-0.05602,0.03048,0.022718,-0.073067,0.009554,0.063822,-0.046102,0.012794,...,0.052375,0.035898,0.006144,-0.028142,0.010933,0.008656,0.031529,0.028811,0.078161,-0.058175,-0.009356,-0.03905,-0.028328,0.015019,0.054606,-0.032876,-0.014097,0.090402,0.026548,-0.000425,-0.041503,-0.090043,-0.061674,0.018953,0.016659,0.047559,-0.041774,-0.114472,-0.080249,0.029114,0.025406,0.02901,0.012918,-0.0294,0.031996,0.056691,-0.060661,0.057724,0.025466,-0.048824
3,-0.050523,0.079492,0.050795,-0.043132,0.037514,0.001665,-0.094767,-0.060245,-0.023566,0.005896,0.09299,-0.018225,0.045116,0.078232,0.035867,0.016397,0.099965,0.014674,-0.017502,0.030584,0.015009,-0.093868,-0.024154,-0.007653,-0.001712,0.067662,0.010908,-0.089364,0.146061,0.05272,0.019182,-0.027211,-0.054004,0.017968,0.02458,-0.082264,-0.006893,0.052423,-0.034207,0.027483,...,0.058583,0.05767,-0.014287,-0.025077,0.002593,0.001167,0.03417,0.025148,0.079926,-0.058198,0.003893,-0.045183,-0.034971,0.02154,0.06648,-0.028166,-0.016965,0.101413,0.003425,0.026112,-0.03546,-0.087747,-0.048536,0.009261,0.006127,0.041544,-0.037567,-0.120804,-0.09274,0.01359,0.014994,0.02039,0.010504,-0.051327,0.044157,0.062641,-0.047277,0.063333,0.029111,-0.047167
4,-0.059867,0.073482,0.04407,-0.056021,0.045032,0.003077,-0.100331,-0.057307,-0.017694,0.000493,0.094169,-0.020945,0.038037,0.075791,0.03387,0.011234,0.097877,0.011186,-0.014327,0.029125,0.01215,-0.098601,-0.017556,-0.006917,-0.006086,0.059455,0.012876,-0.091737,0.136972,0.066208,0.00307,-0.021218,-0.06306,0.030972,0.022012,-0.07693,0.005978,0.051101,-0.03068,0.029854,...,0.065276,0.053648,-0.013915,-0.030385,0.01143,0.006886,0.029016,0.023113,0.080206,-0.061116,0.00356,-0.052805,-0.019055,0.024102,0.079504,-0.024815,-0.013328,0.092995,-0.001851,0.033211,-0.030188,-0.07067,-0.05447,0.004074,0.017648,0.041844,-0.045897,-0.117089,-0.084564,0.019079,0.02323,0.02708,0.012516,-0.034951,0.039169,0.070937,-0.042894,0.06251,0.039318,-0.057177


# Representation Word2vec

In [8]:
# Work on a copy so the models below do not touch df_word2vec itself
df: pd.DataFrame = df_word2vec.copy()

## Local Outlier Factor (LOF)

In [9]:
# Prediction Word2vec with LOF
# contamination=0.03: about 3% of the articles end up labelled -1 (see the counts below)
clf: LocalOutlierFactor = LocalOutlierFactor(contamination=0.03)
# Fit on df and label the same rows; graph_3d displays -1 as "New" and 1 as "Old"
pred_lof: np.ndarray = clf.fit_predict(df)
# Number of articles per label
pd.Series(pred_lof).value_counts()

 1    7306
-1     226
dtype: int64

In [10]:
# 3D PCA view of the articles, split by the LOF labels
graph_3d(df, pd.Series(pred_lof))

## Isolation Forest

In [11]:
# Prediction Word2vec with Isolation Forest
# random_state is fixed so the flagged articles are the same on every run,
# consistent with duplicate_prediction, which also uses random_state=0.
clf: IsolationForest = IsolationForest(contamination=0.015, random_state=0)
# Fit on df and label the same rows; graph_3d displays -1 as "New" and 1 as "Old"
pred_forest: np.ndarray = clf.fit_predict(df)
# Number of articles per label
pd.Series(pred_forest).value_counts()

 1    7419
-1     113
dtype: int64

In [12]:
# 3D PCA view of the articles, split by the Isolation Forest labels
graph_3d(df, pd.Series(pred_forest))

## One-Class Support Vector Machine (One-Class SVM)

In [13]:
# One-Class SVM on the word2vec features: fit on df and label the same rows in one call
one_svm: OneClassSVM = OneClassSVM(nu=0.03, gamma='auto')
pred_svm: np.ndarray = one_svm.fit_predict(df)
# Number of articles per label
pd.Series(pred_svm).value_counts()

 1    7306
-1     226
dtype: int64

In [14]:
# 3D PCA view of the articles, split by the One-Class SVM labels
# (axes=(0,1,2) is graph_3d's default, passed explicitly here)
graph_3d(df, pd.Series(pred_svm), axes=(0,1,2))