Created on Wednesday 15 January 2021

**Group 5 - Classification**  
**Extraction features syntaxe**

@authors : Jeremy Johann

This notebook contains all the code. You just need to pass to the function the art_id, the lemmatized content, the title of the article and the Bag of Words (BOW). The code loads all the models and returns the predictions.

# Pip install

In [1]:
pip install scikit-learn-extra

Collecting scikit-learn-extra
[?25l  Downloading https://files.pythonhosted.org/packages/d1/dd/891d2ee7bd18af8f2e1df5d63a52ee96edd0eacc21bb9627072b1c5f6a6c/scikit-learn-extra-0.1.0b2.tar.gz (615kB)
[K     |▌                               | 10kB 15.3MB/s eta 0:00:01[K     |█                               | 20kB 11.3MB/s eta 0:00:01[K     |█▋                              | 30kB 7.9MB/s eta 0:00:01[K     |██▏                             | 40kB 7.4MB/s eta 0:00:01[K     |██▋                             | 51kB 4.4MB/s eta 0:00:01[K     |███▏                            | 61kB 5.0MB/s eta 0:00:01[K     |███▊                            | 71kB 5.1MB/s eta 0:00:01[K     |████▎                           | 81kB 5.7MB/s eta 0:00:01[K     |████▉                           | 92kB 5.7MB/s eta 0:00:01[K     |█████▎                          | 102kB 5.7MB/s eta 0:00:01[K     |█████▉                          | 112kB 5.7MB/s eta 0:00:01[K     |██████▍                         | 122kB

# Import libraries

In [2]:
import re
import os
import nltk
import math
import string
import pickle
import warnings
import numpy as np
import pandas as pd

from tqdm import tqdm_notebook
from tqdm import tqdm_notebook as tqdm

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

from scipy.spatial import distance

from xgboost import XGBClassifier

from sklearn import svm
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn_extra.cluster import KMedoids
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import CategoricalNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import calinski_harabasz_score, silhouette_score, f1_score
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, IsolationForest

# Silence every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Fetch the NLTK "punkt" tokenizer data.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Mount Google Drive in the Colab runtime so that the /content/drive/... paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Functions

In [None]:
# Function that takes the data as input and outputs the novelty score and whether each document is new or not
def prediction_nouveau(data: pd.DataFrame) -> pd.DataFrame:
    """Flag the novel documents of a batch with an Isolation Forest.

    The features (every column except ``art_id``) are standardized, embedded
    in 3 dimensions with t-SNE and then scored by an IsolationForest. All
    three steps are fitted on the batch itself, so the result depends on
    which documents are passed together.

    Parameters:
        data: Dataframe with an ``art_id`` column plus the feature columns
            to predict on.
    Out:
        dataout: Dataframe with one row per input row and the columns
            ``art_id``, ``prediction_nouveau`` (-1: new, 1: common) and
            ``score_nouveau`` (value of the IsolationForest decision
            function).
    """
    # Output frame keyed by article id; the features are everything else.
    dataout = pd.DataFrame(data["art_id"].values, columns=["art_id"])
    features = data.drop(["art_id"], axis=1)

    # Nothing can be fitted on an empty batch: return an empty result with
    # the expected columns instead of letting the scaler raise.
    if len(features) == 0:
        dataout["prediction_nouveau"] = np.array([], dtype=int)
        dataout["score_nouveau"] = np.array([], dtype=float)
        return dataout

    # Normalize the features.
    scaler = StandardScaler()
    scaled = scaler.fit_transform(features)

    # Reduce to 3 dimensions. The seed makes the embedding (and therefore
    # the predictions) reproducible from one run to the next, in line with
    # the seeded IsolationForest below.
    tsne = TSNE(n_components=3, random_state=0)
    embedded = tsne.fit_transform(scaled)

    # Fit the outlier detector on the embedding and score the same points.
    clf = IsolationForest(random_state=0, contamination=0.01)
    dataout["prediction_nouveau"] = clf.fit_predict(embedded)
    dataout["score_nouveau"] = clf.decision_function(embedded)

    return dataout

In [None]:
# Function that takes the data as input and outputs, for each document, the innovation score, the gestion score,
# whether the document is related to innovation and whether it is related to gestion
def prediction_innovant_gestion(data: pd.DataFrame, path_load: str) -> pd.DataFrame:
  """Predict the "innovant" and "gamme gestion" labels with two pickled models.

    Parameters:
        data: Dataframe with an ``art_id`` column plus the feature columns
            expected by the models.
        path_load: prefix of the model files; the file names are appended to
            it as-is, so a directory must end with a path separator.

    Out:
        dataout: Dataframe with the columns ``art_id``,
            ``prediction_innovant``, ``score_innovant``,
            ``prediction_gamme_gestion`` and ``score_gamme_gestion``. Each
            score is the second column of the model's ``predict_proba``
            (presumably the probability of the positive class - confirm
            against the trained models).
  """
  # Output frame keyed by article id
  dataout = pd.DataFrame(data["art_id"].values, columns=["art_id"])

  # The models only see the features, never the id; build that view once
  features = data.drop(["art_id"], axis=1)

  # Load the models, closing each file as soon as it has been read.
  # NOTE: unpickling executes arbitrary code - only load trusted files.
  with open(path_load + "model_innovant.pkl", 'rb') as model_file:
    model_innovant = pickle.load(model_file)
  with open(path_load + "model_gamme_gestion.pkl", 'rb') as model_file:
    model_gestion = pickle.load(model_file)

  # Hard label and score for each of the two models
  dataout["prediction_innovant"] = model_innovant.predict(features)
  dataout["score_innovant"] = model_innovant.predict_proba(features)[:, 1]
  dataout["prediction_gamme_gestion"] = model_gestion.predict(features)
  dataout["score_gamme_gestion"] = model_gestion.predict_proba(features)[:, 1]

  return dataout

In [None]:
# Function that takes the data as input and outputs the cluster of each document
def prediction_clustering(data: pd.DataFrame, path_load: str) -> pd.DataFrame:
  """Assign a cluster index to each document with a pickled scaler, PCA and model.

    Parameters:
        data: Dataframe with an ``art_id`` column plus the feature columns
            expected by the scaler.
        path_load: prefix of the pickle files; the file names are appended to
            it as-is, so a directory must end with a path separator.

    Out:
        dataout: Dataframe with the columns ``art_id`` and
            ``prediction_theme`` (index of the largest value in each row of
            ``model.transform``).
  """
  # Output frame keyed by article id
  dataout = pd.DataFrame(data["art_id"].values, columns=["art_id"])

  # Load the scaler, the PCA and the clustering model, closing each file
  # as soon as it has been read.
  # NOTE: unpickling executes arbitrary code - only load trusted files.
  with open(path_load + "scaler_cluster.pkl", 'rb') as pickle_file:
    sc = pickle.load(pickle_file)
  with open(path_load + "pca_cluster.pkl", 'rb') as pickle_file:
    pca = pickle.load(pickle_file)
  with open(path_load + "model_cluster.pkl", 'rb') as pickle_file:
    model = pickle.load(pickle_file)

  # Normalize then project the features (everything except the id)
  scaled = sc.transform(data.drop(["art_id"], axis=1))
  projected = pca.transform(scaled)

  # NOTE(review): if the model is distance-based (e.g. KMeans / KMedoids,
  # both imported by this notebook), transform() returns distances to the
  # centers and argmax selects the FARTHEST one - confirm that this, rather
  # than argmin / model.predict, is what dict_label was built against.
  dataout["prediction_theme"] = np.argmax(model.transform(projected), axis=1)

  return dataout

In [None]:
# Function that takes the data as input and outputs all the predictions
def all_prediction(data: pd.DataFrame,path_load: str) -> pd.DataFrame:
  '''Run the whole prediction pipeline on a batch of documents.

  The pipeline first classifies every document (innovation / gamme gestion),
  then runs the novelty detection and the clustering only on the documents
  predicted positive for both, and finally gathers everything in one frame.

  Parameters:
      data: A dataframe that must contain the columns ``art_id`` and
          ``art_content_clean_without_lem``; every other column is passed to
          the models as a feature (the bag of words).
      path_load: prefix of the pickle files to load; the file names are
          appended to it as-is, so a directory must end with a path separator.

  Out:
      dataout: A dataframe with the columns ``art_id``,
          ``prediction_innovant``, ``prediction_gamme_gestion``,
          ``score_innovant``, ``score_gamme_gestion``, ``nouveau``,
          ``score_nouveau`` and ``theme``. The last three are NaN for the
          documents that were not selected for novelty detection / clustering.
  '''
  text_column = "art_content_clean_without_lem"
  classifier_columns = ["prediction_innovant", "prediction_gamme_gestion", "score_innovant", "score_gamme_gestion"]

  print("Phase 1")
  # Predict which documents are related to innovation and to gestion
  classifier_output = prediction_innovant_gestion(data.drop([text_column], axis = 1), path_load)
  data = data.merge(classifier_output, on = "art_id")

  # Only the documents predicted positive for both labels go through the
  # next two phases; keep just art_id + features for them (built once and
  # shared by both phases, neither of which modifies its input).
  candidates = (
      data.query("prediction_innovant == 1")
          .query("prediction_gamme_gestion == 1")
          .drop([text_column] + classifier_columns, axis = 1)
  )
  has_candidates = not candidates.empty

  print("Phase 2")
  # Predict the new documents. With no candidate at all the downstream
  # estimators cannot run, so fall back to an empty result: the left merge
  # below then leaves NaN for every document.
  if has_candidates:
    novelty = prediction_nouveau(candidates)
  else:
    novelty = candidates[["art_id"]].assign(prediction_nouveau = np.nan, score_nouveau = np.nan)

  print("Phase 3")
  # Predict the clusters
  if has_candidates:
    themes = prediction_clustering(candidates, path_load)
  else:
    themes = candidates[["art_id"]].assign(prediction_theme = np.nan)

  print("Contatenation")
  # Left merges keep every document, including those that skipped phases 2 and 3
  data = data.merge(novelty, how = "left", on = "art_id")
  data = data.merge(themes, how = "left", on = "art_id")

  # Keep only the output columns. rename() returns a new frame, so the
  # column assignment below does not write into a slice of `data`.
  dataout = data[["art_id"] + classifier_columns[:2] + classifier_columns[2:] + ["prediction_nouveau", "score_nouveau", "prediction_theme"]].rename(
      columns = {"prediction_nouveau": "nouveau", "prediction_theme": "theme"})

  # Replace the cluster indices by their labels.
  # NOTE: unpickling executes arbitrary code - only load trusted files.
  with open(path_load + "dict_label.pkl", 'rb') as label_file:
    dict_label = pickle.load(label_file)
  dataout["theme"] = dataout["theme"].map(dict_label)

  return dataout

# We run the function

In [None]:
# Load the input data from Google Drive (judging by the file name, a bag-of-words export - TODO confirm the schema).
data = pd.read_json(
    "/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Input/g3_BOW_v1.json")

In [None]:
# Run the whole prediction pipeline on the loaded data; the pickled models are read from the given folder
# (the trailing "/" matters because the file names are appended to this string as-is).
output = all_prediction(data,"/content/drive/MyDrive/G5 Inter-Promo 2021/Ressources/test/")

Phase 1
Phase 2
Phase 3
Contatenation


In [4]:
# Load the global output file from Google Drive.
# NOTE(review): the original comment said "We save the data", but this call READS a JSON file and rebinds
# `output`, discarding the predictions computed by all_prediction above - confirm whether a save
# (e.g. `output.to_json(...)`) was intended here.
output = pd.read_json("/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Output/Global/Global_V2.json")