Created on Wednesday 13 January 2021

**Group 5 - Classification   
Clean gestion and innovation detection semi supervised v1**

@authors : Lilian Dulinge

This notebook contains all the functions needed to predict whether a document talks about management range or innovation, without any graph representation. This is the notebook to use if you want results. We can choose the type of features we want to use, the theme of the documents (management range or innovation) and the results we want to save. It works like V1: we first label the documents that are clearly linked, or clearly not linked at all, to the theme, which leaves a large share of documents for which we do not know whether they talk about the theme; we classify those with semi-supervised methods. After that, we use a supervised method trained on the labeled data to predict the final result.

# Import libraries

In [None]:
import re
import nltk
import pickle
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

from xgboost import XGBClassifier

from sklearn import svm
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Create link between drive and notebook

In [None]:
# Mount Google Drive at /content/drive; every data and model path used in
# this notebook is located under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import data

In [None]:
# Read g3_BOW_v1.json; its columns from the third one onwards are later used
# as the model features.
bow: pd.DataFrame = pd.read_json(
    '/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Input/g3_BOW_v1.json')
# Read Data_With_Features_Syntax.csv (one row per article, keyed by art_id).
data: pd.DataFrame = pd.read_csv(
    '/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Input/Data_With_Features_Syntax.csv')
# pd.merge defaults to an inner join: only articles present in both tables
# are kept.
df: pd.DataFrame = pd.merge(left=bow, right=data, on='art_id')
# Read df_articles_lemma.json, keyed by art_lemma_id.
art_lemma: pd.DataFrame = pd.read_json(
    '/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Output/Innovation/df_articles_lemma.json')
# Attach the lemma table to the merged frame (inner join on
# art_id == art_lemma_id); this rebinds df.
df: pd.DataFrame = pd.merge(
    left=df, right=art_lemma, left_on='art_id', right_on='art_lemma_id')

In [None]:
# Keyword lexicon used when the theme is 'innovation'; create_features reads
# its "key_words_lemma" column.
df_lexique_innovation: pd.DataFrame = pd.read_json(
    "/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Output/Innovation/df_lexique_lemma.json")

# Keyword lexicon used when the theme is 'gestion' (management range); it is
# read from the same Output/Innovation folder as the innovation lexicon.
df_lexique_gestion: pd.DataFrame = pd.read_json(
    "/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Output/Innovation/df_lexique_gammes_gestion.json")

# Functions

In [None]:
# Counts the number of words
def nb_word(text: list) -> list:
    """Documentation
    Count the words of every text of a collection.

    Parameters:
        text: Iterable of texts (one string per article)

    Out:
        nb_words: List holding, for each text, its number of words. An entry
        that is not a string (None, NaN, ...) counts as 0 words.
    """
    # str.replace returns a new string and never edits in place, so the
    # separators are turned into spaces with a translation table whose result
    # is actually used for the split below.
    separators: dict = str.maketrans({char: ' ' for char in ',.!?/'})

    nb_words: list = []
    # Browse through the different texts
    for i in text:
        if not isinstance(i, str):
            nb_words.append(0)
            continue
        # Split on whitespace once the separators are gone and count the words
        nb_words.append(len(i.translate(separators).split()))

    return nb_words

In [None]:
# Count the number of time where the words in the list appear
def count_key_words(data: pd.DataFrame, l: list) -> list:
    """Documentation
    Count, for every text, how many times the key words appear.

    Parameters:
        data: Texts to scan (one string per article)
        l: Key words to look for; an entry containing a space is treated as
        a compound key word

    Out:
        res: List with exactly one value per text: the number of tokens equal
        to a single key word, plus one for each compound key word found as a
        substring of the text. A text that is not a string (None, NaN) gives 0.
    """
    # Split the lexicon once: single words are matched token by token,
    # compound expressions are matched as substrings.
    single_words: set = set()
    compound_words: list = []
    for elem in l:
        if " " in elem:
            compound_words.append(elem)
        else:
            single_words.add(elem)

    res: list = []
    for sentence in tqdm(data):
        if not isinstance(sentence, str):
            # Exactly one entry per text, so the result stays aligned with the
            # input (a second append used to follow this branch).
            res.append(0)
            continue
        # The same lowered text is used for both kinds of key words, so the
        # matching is case-insensitive on the text side in both cases.
        lowered: str = sentence.lower()
        t: int = sum(1 for word in lowered.split() if word in single_words)
        t += sum(1 for elem in compound_words if elem in lowered)
        res.append(t)

    return res

In [None]:
# Count the number of sentence
def sentences(data: pd.DataFrame, col: str) -> list:
    """Documentation
    Count the number of sentences of every article.

    Parameters:
        data: Dataframe with all the data
        col: Name of the column of the dataframe that holds the text

    Out:
        l: List where each value is the number of sentences of an article
        (0 when the text is missing)
    """
    l: list = []
    for text in data[col]:
        # A missing text must be detected before any str() conversion,
        # otherwise it becomes the string "None"/"nan" and counts as a sentence.
        if text is None or (isinstance(text, float) and text != text):
            l.append(0)
            continue
        if not isinstance(text, str):
            text = str(text)
        # Collapse every run of sentence-ending punctuation ("...", "?!",
        # "!!!", ...) into one full stop before tokenizing. A single regex
        # avoids the ordering problem of chained replace() calls, where the
        # short patterns consume the characters the longer ones look for.
        text = re.sub(r"[.!?]+", ".", text)
        l.append(len(sent_tokenize(text)))

    return l

In [None]:
# Count the number of distinct words in each documents
def count_words_diff(df: pd.DataFrame, list_key: list) -> list:
    """Documentation
    Count the number of distinct terms of every text.

    Parameters:
        df: Column that contains the text of the articles
        list_key: Key words (list or pandas Series); entries may be made of
        one, two or three words

    Out:
        result: For each article, the size of its term dictionary, i.e. the
        number of distinct terms. At each position the term is the three-word
        key expression starting there if it exists, else the two-word one,
        else the single word. Positions following a matched expression are
        still examined on their own.
    """
    # `x in series` tests the index labels of a pandas Series, not its values,
    # so the key words are copied into a set of values first.
    keys: set = set(list_key)

    result: list = []
    for text in tqdm(df):
        liste: list = text.split() if isinstance(text, str) else []
        nb_tokens: int = len(liste)
        dico: dict = {}
        for i in range(nb_tokens):
            # Explicit bounds instead of a bare except: the two-word test is
            # no longer skipped when the three-word window overruns the text.
            if i + 2 < nb_tokens and ' '.join(liste[i:i + 3]) in keys:
                dic(' '.join(liste[i:i + 3]), dico)
            elif i + 1 < nb_tokens and ' '.join(liste[i:i + 2]) in keys:
                dic(' '.join(liste[i:i + 2]), dico)
            else:
                dic(liste[i], dico)
        result.append(len(dico))

    return result

In [None]:
# Return a dico with all the distinct word and their numbers of appearances
def comparaison_words_diff(texte: str, list_key: list) -> dict:
    """Documentation
    Build the dictionary of the distinct terms of one text.

    Parameters:
        texte: Text of an article
        list_key: Key words (list or pandas Series); entries may be made of
        one, two or three words

    Out:
        dico: Dictionary mapping each term found in the text to its number of
        occurrences. At each position the term is the three-word key
        expression starting there if it exists, else the two-word one, else
        the single word. Positions following a matched expression are still
        examined on their own. A text that is not a string gives an empty
        dictionary.
    """
    # `x in series` tests the index labels of a pandas Series, not its values,
    # so the key words are copied into a set of values first.
    keys: set = set(list_key)

    liste: list = texte.split() if isinstance(texte, str) else []
    nb_tokens: int = len(liste)
    dico: dict = {}
    for i in range(nb_tokens):
        # Explicit bounds instead of a bare except: the two-word test is no
        # longer skipped when the three-word window overruns the text.
        if i + 2 < nb_tokens and ' '.join(liste[i:i + 3]) in keys:
            dic(' '.join(liste[i:i + 3]), dico)
        elif i + 1 < nb_tokens and ' '.join(liste[i:i + 2]) in keys:
            dic(' '.join(liste[i:i + 2]), dico)
        else:
            dic(liste[i], dico)

    return dico

In [None]:
# Count the distinct word of a list
def count_key_words_diff(liste: list) -> dict:
    """Documentation
    Tally the entries of a list of key words.

    Parameters:
        liste: List of key words

    Out:
        occurrences: Dictionary mapping every non-empty key word of the list
        to the number of times it appears in it
    """
    occurrences: dict = {}
    # dic() performs the actual insertion / increment for each entry
    for key_word in liste:
        dic(key_word, occurrences)
    return occurrences

In [None]:
# Look up a word in a dictionary
def dic(term: str, dico: dict):
    """Documentation
    Record one occurrence of a term in a counting dictionary (in place).

    Parameters:
        term: One word or a group of words
        dico: Dictionary mapping terms to their number of occurrences; it is
        modified by this function and nothing is returned
    """
    # Term already known: one more occurrence
    if term in dico:
        dico[term] += 1
        return
    # New term: start its counter, except for the empty string which is
    # never inserted
    if term != '':
        dico[term] = 1

In [None]:
# Count the number of distinct key words of a list present in each text
def key_word_in_doc(df: pd.DataFrame, list_key: list) -> list:
    """Documentation
    Count the number of distinct key words present in each text.

    Parameters:
        df: Column of the dataframe that contains all the texts
        list_key: Key words (list or pandas Series)

    Out:
        key_word: For each text, the number of distinct terms returned by
        comparaison_words_diff that belong to the key words (0 when the text
        is not a string)
    """
    # Materialize the key words once instead of on every inner iteration.
    # A plain list of values is handed to comparaison_words_diff so that its
    # membership tests compare against the values, not against Series labels.
    key_list: list = list(list_key)
    key_set: set = set(key_list)

    key_word: list = []
    for text in tqdm(df):
        if not isinstance(text, str):
            key_word.append(0)
            continue
        sortie: dict = comparaison_words_diff(text, key_list)
        key_word.append(sum(1 for cle in sortie if cle in key_set))

    return key_word

In [None]:
# Use to generate weighted randomness to say if a document talk about management range or not
def score_to_threshold(x: int) -> int:
    """Documentation
    Map a score to the probability threshold that a uniform random draw has
    to exceed in gestion() / innovation().

    Parameters:
        x: Score (or distance between the score and the decision threshold)
        associated with an article

    Out:
        Threshold probability: 0.8 for x = 0, tending towards 1.0 as x grows
        and towards 0.6 as x becomes very negative
    """
    # arctan(100 * x) rescaled from (-pi/2, pi/2) to (-1, 1)
    squashed = np.arctan(x * 100) / np.pi * 2
    # ... then to (-0.2, 0.2) and centred on 0.8
    return squashed * 0.2 + 0.8

In [None]:
# Function to tell if a document talk about management range or not
def gestion(data_nb1: pd.DataFrame, data_nb2: pd.DataFrame, data_nb3: pd.DataFrame, data_ratio1: pd.DataFrame, data_ratio2: pd.DataFrame, data_ratio3: pd.DataFrame) -> list:
    """Documentation
    Give every document a first label for the management range theme.

    Parameters:
        data_nb1: Number of key words present in the text
        data_nb2: Number of key words present in the title
        data_nb3: Number of distinct key words present in the text
        data_ratio1: Ratio linked to data_nb1: key words of the text compared
        to the total number of words of the text
        data_ratio2: Ratio linked to data_nb2: key words of the title compared
        to the total number of words of the title
        data_ratio3: Ratio linked to data_nb3: distinct key words of the text
        compared to the total number of distinct words of the text

    Out:
        res: One label per document: 1 = talks about management range,
        0 = does not, '?' = unknown. Part of the labels depends on a random
        draw (np.random.random), so two runs can differ unless the NumPy
        random state is seeded by the caller.
    """
    res: list = []
    seuil: float = 0.70

    # The highest scores point to documents that talk about management range,
    # the lowest ones to documents that do not
    for i in tqdm(range(len(data_nb1))):
        valeur = 0.5*(data_nb1[i]*data_ratio1[i]) + 0.5 * \
            (data_nb2[i]*data_ratio2[i]) + (data_nb3[i]*data_ratio3[i])
        alea: float = np.random.random()
        # A score above the threshold gives a 1 (management range)
        if valeur > seuil:
            res.append(1)
        # A low score without any key word in the text and in the title gives a 0
        elif valeur < seuil and data_nb1[i] == 0 and data_nb2[i] == 0:
            res.append(0)
        # A low score with almost no key word gives a 0 as well
        elif valeur < seuil and data_ratio1[i] < 0.0002 and data_ratio2[i] < 0.0005:
            res.append(0)
        # In between, a label is only given when the random draw exceeds the
        # threshold returned by score_to_threshold
        elif valeur > seuil / 2 and alea > score_to_threshold((seuil - valeur)):
            res.append(1)
        elif valeur < seuil / 2 and alea > score_to_threshold(valeur):
            res.append(0)
        # The others get a '?': we do not know whether they talk about
        # management range; they are labeled later from the labeled documents
        else:
            res.append('?')

    # Returned once every document has been processed (the return used to sit
    # inside the loop and stopped after the first document)
    return res

In [None]:
# Function to tell if a document talk about innovation or not
def innovation(data_nb1: pd.DataFrame, data_nb2: pd.DataFrame, data_nb3: pd.DataFrame, data_ratio1: pd.DataFrame, data_ratio2: pd.DataFrame, data_ratio3: pd.DataFrame) -> list:
    """Documentation
    Give every document a first label for the innovation theme.

    Parameters:
        data_nb1: Number of key words present in the text
        data_nb2: Number of key words present in the title
        data_nb3: Number of distinct key words present in the text
        data_ratio1: Ratio linked to data_nb1: key words of the text compared
        to the total number of words of the text
        data_ratio2: Ratio linked to data_nb2: key words of the title compared
        to the total number of words of the title
        data_ratio3: Ratio linked to data_nb3: distinct key words of the text
        compared to the total number of distinct words of the text

    Out:
        labels: One label per document: 1 = innovative, 0 = not innovative,
        '?' = unknown. One random number is drawn per document with
        np.random.random and used for the intermediate scores.
    """
    threshold: float = 0.15
    labels: list = []

    for idx in tqdm(range(len(data_nb1))):
        nb_text, nb_title = data_nb1[idx], data_nb2[idx]
        ratio_text, ratio_title = data_ratio1[idx], data_ratio2[idx]
        # Weighted innovation score of the document
        score: float = 0.5*(nb_text*ratio_text) + 0.5 * \
            (nb_title*ratio_title) + (data_nb3[idx]*data_ratio3[idx])
        # Drawn for every document, even when the branch taken ignores it
        draw: float = np.random.random()

        if score > threshold:
            # Clearly above the threshold: innovative
            label = 1
        elif score < threshold and nb_text == 0 and nb_title == 0:
            # No key word in the text nor in the title: not innovative
            label = 0
        elif score < threshold and ratio_text < 0.0002 and ratio_title < 0.0005:
            # Almost no key word: not innovative
            label = 0
        elif score > threshold / 2 and draw > score_to_threshold((threshold - score)):
            # Upper half of the grey zone, accepted by the random draw
            label = 1
        elif score < threshold / 2 and draw > score_to_threshold(score):
            # Lower half of the grey zone, accepted by the random draw
            label = 0
        else:
            # Unknown: labeled later from the documents that have a label
            label = '?'
        labels.append(label)

    return labels

In [None]:
# Function use to create all features we need
def create_features(df: pd.DataFrame, texte: str, title: str, inno_ges: str) -> pd.DataFrame:
    """Documentation
    Add to the dataframe all the features used for the first labeling.

    Parameters:
        df: Dataframe on which the features are created. The columns are
        added to this very object (in place), which is also returned.
        texte: Name of the column that contains the text
        title: Name of the column that contains the title of the article
        inno_ges: Theme of the analysis, either "innovation" or "gestion"
        (management range)

    Out:
        df: The input dataframe with the feature columns added

    Raises:
        ValueError: If inno_ges is neither "innovation" nor "gestion"
    """
    if inno_ges == 'innovation':
        df_lexique: pd.DataFrame = df_lexique_innovation
    elif inno_ges == 'gestion':
        df_lexique: pd.DataFrame = df_lexique_gestion
    else:
        # Fail with a clear message instead of an unbound-variable error
        raise ValueError(
            f"inno_ges must be 'innovation' or 'gestion', got {inno_ges!r}")

    key_words = df_lexique["key_words_lemma"]

    # Raw counts
    df["nb_key_words"] = count_key_words(df[texte], key_words)
    df["nb_key_words_title"] = count_key_words(df[title], key_words)
    df["nb_words"] = nb_word(df[texte])
    df["nb_words_title"] = nb_word(df[title])
    df["nb_sentences"] = sentences(df, texte)

    # Ratios; the key-word ratios are NaN for 0/0 and are replaced by 0
    df["average_word_sentence"] = df["nb_words"] / df["nb_sentences"]
    df["ratio_word_title_on_word"] = df["nb_words_title"] / df["nb_words"]
    df['ratio_key_words'] = (df['nb_key_words'] / df['nb_words']).fillna(0)
    df['ratio_key_sentences'] = (
        df['nb_key_words'] / df['nb_sentences']).fillna(0)
    df['ratio_key_word_title'] = (
        df['nb_key_words_title'] / df['nb_words_title']).fillna(0)

    # Distinct-word features.
    # NOTE(review): these two read the 'art_lemma' column directly and ignore
    # the `texte` argument - confirm whether df[texte] was intended.
    df['word_key_diff'] = key_word_in_doc(df['art_lemma'], key_words)
    df['word_diff'] = count_words_diff(df['art_lemma'], key_words)
    # Filled like the other key-word ratios: an article without any word gives
    # 0/0 here, and the NaN would otherwise propagate into the score computed
    # by gestion() / innovation().
    df['ratio_key_word_diff'] = (
        df['word_key_diff'] / df['word_diff']).fillna(0)

    return df

In [None]:
# Helper running one self-training stage, then the function that creates a
# label for each document
def _self_train(model, X_train: pd.DataFrame, y_train: pd.Series, X_unlabeled: pd.DataFrame, threshold: float) -> tuple:
    """Documentation
    Run one self-training stage with a single classifier.

    The model is fitted on the labeled data, then predicts the unlabeled
    data; the predictions whose class probability is above `threshold` are
    moved to the training data with their predicted label. This is repeated
    until an iteration adds no pseudo-label or no unlabeled row is left.

    Parameters:
        model: Classifier providing fit, predict and predict_proba
        X_train: Features of the labeled documents
        y_train: Labels (0 / 1) of the labeled documents
        X_unlabeled: Features of the documents without label
        threshold: A prediction is kept when prob_0 or prob_1 is strictly
        above this value

    Out:
        (X_train, y_train, X_unlabeled) after the stage
    """
    iterations: int = 0

    while True:
        # Fit classifier and predict the training data
        model.fit(X_train, y_train)
        y_hat_train: np.ndarray = model.predict(X_train)

        # Print iteration # and train f1 score
        train_f1: float = f1_score(y_train, y_hat_train)
        print(f"Iteration {iterations}")
        print(f"Train f1: {train_f1}")

        if len(X_unlabeled) == 0:
            print(f'end of process.')
            return X_train, y_train, X_unlabeled

        # Generate predictions and probabilities for unlabeled data
        print(f"Now predicting labels for unlabeled data...")
        pred_probs: np.ndarray = model.predict_proba(X_unlabeled)
        df_pred_prob: pd.DataFrame = pd.DataFrame(
            {'preds': model.predict(X_unlabeled),
             'prob_0': pred_probs[:, 0],
             'prob_1': pred_probs[:, 1]},
            index=X_unlabeled.index)

        # Separate the predictions that are confident enough
        high_prob: pd.DataFrame = pd.concat(
            [df_pred_prob.loc[df_pred_prob['prob_0'] > threshold],
             df_pred_prob.loc[df_pred_prob['prob_1'] > threshold]],
            axis=0)
        print(
            f"{len(high_prob)} high-probability predictions added to training data.")

        # Move the pseudo-labeled rows from the unlabeled to the training data
        X_train = pd.concat(
            [X_train, X_unlabeled.loc[high_prob.index]], axis=0)
        y_train = pd.concat([y_train, high_prob.preds])
        X_unlabeled = X_unlabeled.drop(index=high_prob.index)
        print(f"{len(X_unlabeled)} unlabeled instances remaining.\n")

        iterations += 1

        # Nothing new was learnt: refitting would give the same model
        if len(high_prob) == 0:
            return X_train, y_train, X_unlabeled


def create_label(df: pd.DataFrame, features: list, inno_ges: str, texte: str, title: str) -> pd.DataFrame:
    """Documentation
    Create a label (0 / 1) for the documents with a semi-supervised method.

    Parameters:
        df: Dataframe that contains the articles to classify. The feature
        columns and 'prediction_supervise' are added to this object in place.
        features: Names of the columns used as features to train the models
        inno_ges: Theme of the analysis, either "innovation" or "gestion"
        (management range)
        texte: Name of the column that contains the text
        title: Name of the column that contains the title of the article

    Out:
        df_final: Dataframe with the columns 'art_id' and
        'prediction_supervise', one row per document that ended up labeled

    Raises:
        ValueError: If inno_ges is neither "innovation" nor "gestion"
    """
    if inno_ges not in ('innovation', 'gestion'):
        raise ValueError(
            f"inno_ges must be 'innovation' or 'gestion', got {inno_ges!r}")

    df = create_features(df, texte, title, inno_ges)

    # First labels (1, 0 or '?') from the key-word scores
    first_labeling = innovation if inno_ges == 'innovation' else gestion
    df['prediction_supervise'] = first_labeling(
        df['nb_key_words'], df['nb_key_words_title'], df['word_key_diff'],
        df['ratio_key_words'], df['ratio_key_word_title'], df['ratio_key_word_diff'])

    # Shuffle with a fixed seed and keep the new position as document key
    df = df.sample(frac=1, random_state=15).reset_index(drop=True)
    df['index'] = df.index

    var_useful: pd.DataFrame = pd.concat(
        [df[features], df['prediction_supervise']], axis=1)
    is_unlabeled = var_useful['prediction_supervise'] == '?'
    labeled: pd.DataFrame = var_useful[~is_unlabeled]
    unlabeled: pd.DataFrame = var_useful[is_unlabeled]

    X_train: pd.DataFrame = labeled.drop('prediction_supervise', axis=1)
    y_train: pd.Series = pd.to_numeric(labeled.prediction_supervise)
    X_unlabeled: pd.DataFrame = unlabeled.drop('prediction_supervise', axis=1)

    # Three successive stages (model, probability threshold). The first two
    # only keep predictions above 99%; the last one uses 0.50, so it keeps the
    # most likely class of every remaining document (exact 0.5/0.5 ties are
    # left out).
    if inno_ges == 'innovation':
        stages: list = [
            (svm.SVC(C=3, kernel='linear', probability=True), 0.99),
            (LogisticRegression(penalty='l1', solver='liblinear'), 0.99),
            (XGBClassifier(), 0.50),
        ]
    else:
        stages: list = [
            (XGBClassifier(), 0.99),
            (LogisticRegression(penalty='l1', solver='liblinear'), 0.99),
            (svm.SVC(C=4, kernel='linear', probability=True), 0.50),
        ]

    for model, threshold in stages:
        X_train, y_train, X_unlabeled = _self_train(
            model, X_train, y_train, X_unlabeled, threshold)

    # Attach the labels to the article ids through the shuffled position
    labels: pd.DataFrame = pd.DataFrame(
        {'index': y_train.index, 'prediction_supervise': y_train.to_numpy()})
    df_final: pd.DataFrame = pd.merge(
        left=df[['index', 'art_id']], right=labels, on='index')

    return df_final[['art_id', 'prediction_supervise']]

In [None]:
# Feature names: every column label of `bow` except the first two
column_name: list = list(bow.columns[2:])

In [None]:
# Label the documents for the 'gestion' (management range) theme, using the
# columns listed in column_name as features, 'art_lemma' as the text column
# and 'art_title' as the title column.
df_label_gestion: pd.DataFrame = create_label(
    df, column_name, 'gestion', 'art_lemma', 'art_title')


This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



HBox(children=(FloatProgress(value=0.0, max=7533.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7533.0), HTML(value='')))





This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



HBox(children=(FloatProgress(value=0.0, max=7533.0), HTML(value='')))





This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



HBox(children=(FloatProgress(value=0.0, max=7533.0), HTML(value='')))





This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



HBox(children=(FloatProgress(value=0.0, max=7533.0), HTML(value='')))


Iteration 0
Train f1: 0.9944055944055944
Now predicting labels for unlabeled data...
287 high-probability predictions added to training data.
5543 unlabeled instances remaining.

Iteration 1
Train f1: 0.9960079840319362
Now predicting labels for unlabeled data...
84 high-probability predictions added to training data.
5459 unlabeled instances remaining.

Iteration 2
Train f1: 0.9953959484346224
Now predicting labels for unlabeled data...
40 high-probability predictions added to training data.
5419 unlabeled instances remaining.

Iteration 3
Train f1: 0.9968930315135375
Now predicting labels for unlabeled data...
30 high-probability predictions added to training data.
5389 unlabeled instances remaining.

Iteration 4
Train f1: 0.9965397923875432
Now predicting labels for unlabeled data...
16 high-probability predictions added to training data.
5373 unlabeled instances remaining.

Iteration 5
Train f1: 0.9965870307167235
Now predicting labels for unlabeled data...
10 high-probability pre

In [None]:
# Label the documents for the 'innovation' theme, using the columns listed in
# column_name as features, 'art_lemma' as the text column and 'art_title' as
# the title column.
df_label_innovation: pd.DataFrame = create_label(
    df, column_name, 'innovation', 'art_lemma', 'art_title')


This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



HBox(children=(FloatProgress(value=0.0, max=7533.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7533.0), HTML(value='')))





This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



HBox(children=(FloatProgress(value=0.0, max=7533.0), HTML(value='')))





This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



HBox(children=(FloatProgress(value=0.0, max=7533.0), HTML(value='')))





This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



HBox(children=(FloatProgress(value=0.0, max=7533.0), HTML(value='')))


Iteration 0
Train f1: 0.9995778809624314
Now predicting labels for unlabeled data...
1115 high-probability predictions added to training data.
3124 unlabeled instances remaining.

Iteration 1
Train f1: 0.9996908809891809
Now predicting labels for unlabeled data...
120 high-probability predictions added to training data.
3004 unlabeled instances remaining.

Iteration 2
Train f1: 0.9997008674842955
Now predicting labels for unlabeled data...
0 high-probability predictions added to training data.
3004 unlabeled instances remaining.

Iteration 0
Train f1: 0.9973013493253373
Now predicting labels for unlabeled data...
1005 high-probability predictions added to training data.
1999 unlabeled instances remaining.

Iteration 1
Train f1: 0.9980769230769231
Now predicting labels for unlabeled data...
97 high-probability predictions added to training data.
1902 unlabeled instances remaining.

Iteration 2
Train f1: 0.9981078524124881
Now predicting labels for unlabeled data...
37 high-probability 

In [None]:
# Give each label column a theme-specific name so that both can be stored in
# the same dataframe
df_label_innovation: pd.DataFrame = df_label_innovation.rename(
    {'prediction_supervise': 'innovant_prediction'}, axis='columns')
df_label_gestion: pd.DataFrame = df_label_gestion.rename(
    {'prediction_supervise': 'gestion_prediction'}, axis='columns')

In [None]:
# Attach both label columns to the bag-of-words table (inner joins on art_id)
df_final: pd.DataFrame = (
    bow
    .merge(df_label_innovation, on='art_id')
    .merge(df_label_gestion, on='art_id')
)

In [None]:
# Remove the identifier and the raw text column: only the features and the
# two label columns are kept
df_final = df_final.drop(
    columns=["art_id", "art_content_clean_without_lem"])

In [None]:
# Function for training models and save the results
def train_model(data, path_to):
    """Documentation
    Train one XGBoost classifier per theme and save both models.

    Parameters:
        data: Dataframe holding the features plus the two label columns
        "innovant_prediction" and "gestion_prediction"
        path_to: Prefix of the output paths (the file names are appended to it
        as is, so a folder must end with a path separator). The models are
        written to path_to + "model_innovant.pkl" and
        path_to + "model_gamme_gestion.pkl".
    """
    # The feature matrix is the same for both models: build it once
    features = data.drop(
        ["innovant_prediction", "gestion_prediction"], axis=1)

    targets: tuple = (
        ("innovant_prediction", "model_innovant.pkl"),
        ("gestion_prediction", "model_gamme_gestion.pkl"),
    )
    for target, file_name in targets:
        # Initialisation and training of the model
        model: XGBClassifier = XGBClassifier()
        model.fit(features, data[target])
        # Save the model; the context manager closes (and flushes) the file
        # even if pickling fails
        with open(path_to + file_name, 'wb') as model_file:
            pickle.dump(model, model_file)

In [None]:
# Save the results
train_model(
    df_final, '/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Output/Innovation/')