Created on Friday 8 January 2021

**Group 5 - Clustering**  
**Clustering v1**

@authors : Theo Vedis, Jérémy Johann, Damien Izard, Nour Elhouda Kired, Paule Cadrelle Massag, Jessicka Mucy-Clavier, Gabriel nathir Kassem Rojas

This code builds the dataframe containing the content and id of every article and groups the articles into an optimal number of clusters.
This number is the value of k that maximises the silhouette score among the candidate values of k.
Then the program trains the model on k clusters, predicts a label for each article and assigns a title to each label.
The title is derived from the articles nearest to each centroid according to a distance metric (cosine distance). We take the TF-IDF or bag-of-words representation of these articles and return the words that are as representative as possible of the cluster.

# Import libraries

In [None]:
%matplotlib inline
import nltk
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook
from google.colab import drive
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from scipy.spatial import distance
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#Execute this block of code only if you wish to avoid warnings in the output
warnings.filterwarnings('ignore')

# Create link between drive and notebook

In [None]:
# Mount Google Drive at /content/drive (Colab-only call from google.colab)
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importation of the data sets

In [None]:
# Load the three JSON inputs from the mounted drive
data_innovation = pd.read_json("/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Output/Innovation/Res_Innovation_semi_supervised_bow.json")
# NOTE(review): this "gestion" file is read from the Output/Innovation folder - confirm the path is intended
data_gestion = pd.read_json("/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Output/Innovation/Res_gestion_semi_supervised_V1.json")
data = pd.read_json("/content/drive/MyDrive/G5 Inter-Promo 2021/Données/Input/g3_BOW_v1.json")
# Inner-join the three frames on the article id
# NOTE(review): data_2 is not referenced again in the visible part of this notebook (predict_cluster is run on `data`)
data_2 = data_innovation.merge(data_gestion, on = "art_id").merge(data, on = "art_id")

# Functions

## First Part:
We find the optimal number of clusters, train the model and predict the labels.

In [None]:
# Find the optimal number of clusters, k
def optimal_cluster(X: np.ndarray, k_min: int = 9, k_max: int = 25)->int:
  """Find the number of clusters that maximises the silhouette score.

  A KMeans model is fitted for every k in [k_min, k_max] and the silhouette
  score of the resulting partition is recorded.

  Parameters:
      X: feature matrix used to fit the candidate models
      k_min: smallest number of clusters to try (default 9)
      k_max: largest number of clusters to try, inclusive (default 25)

  Out:
      the candidate k with the highest silhouette score (the smallest such k
      when several candidates tie)

  Raises:
      ValueError: if k_min is greater than k_max (no candidate to evaluate)
  """
  if k_min > k_max:
    raise ValueError("k_min must be less than or equal to k_max")

  candidates = list(range(k_min, k_max + 1))
  scores = []

  for k in tqdm_notebook(candidates):
    # n_init is pinned to the same value as in training() so that the model
    # selected here and the model trained afterwards are built the same way
    km = KMeans(n_clusters=k, random_state = 42, n_init = 10)
    labels = km.fit_predict(X)
    scores.append(silhouette_score(X, labels))

  # np.argmax returns the first maximum, i.e. the smallest k on ties
  return candidates[int(np.argmax(scores))]

In [None]:
# Train the chosen model on X matrix with the chosen number of clusters (nb_cluster) and return the trained model for later use
def training(X: np.ndarray, nb_cluster: int)->"KMeans":
  """Fit a KMeans model with a given number of clusters.

  Parameters:
      X: feature matrix used to fit the model
      nb_cluster: number of clusters

  Out:
      km: the fitted KMeans model (kept so it can be reused for prediction)
  """
  # Fixed random_state so that repeated runs give the same clustering
  km = KMeans(n_clusters=nb_cluster, random_state = 42, n_init = 10)

  # Fit the kmeans model on the data
  km.fit(X)

  return km

In [None]:
# Predict the cluster label of each row of the data with the trained kmeans model
def predict(X, model)->np.ndarray:
  """Predict the label of every row of X with a fitted model.

  Parameters:
      X: feature matrix to label
      model: fitted unsupervised model exposing a predict(X) method

  Out:
      model.predict(X): the model prediction for X
  """
  return model.predict(X)

## Second Part :
We find the title of all the clusters

In [None]:
# Compute the cosine distance between a vector and the centroid of the cluster with the given label
def compute_distance(v:np.ndarray, km, label: int)->float:
  """Cosine distance between a vector and one cluster centroid.

  Parameters:
      v: 1-D feature vector
      km: fitted clustering model exposing a cluster_centers_ array
      label: index of the cluster whose centroid is used

  Out:
      distance.cosine(v,km.cluster_centers_[label]): the cosine distance
      between v and the centroid of cluster `label`
  """
  centroid = km.cluster_centers_[label]
  return distance.cosine(v, centroid)

In [None]:
def obtain_word(list_content: list, type = "idf"):
  """Build a short title from the highest-scoring words of a set of texts.

  The texts are vectorized (English stop words removed), the score of each
  word is summed over all texts and the best-scoring words are joined.

  Parameters:
      list_content: iterable of texts (one string per article)
      type: "idf" to score words with TfidfVectorizer, "bow" to score them
            with CountVectorizer

  Out:
      the three highest-scoring words joined with "/" (fewer than three when
      the vocabulary holds fewer than three words)

  Raises:
      ValueError: if type is neither "idf" nor "bow"
  """
  # `type` shadows the builtin but is part of the public signature, so it is kept
  if type == "idf":
    vectorizer = TfidfVectorizer(stop_words = "english")
  elif type == "bow":
    vectorizer = CountVectorizer(stop_words = "english")
  else:
    # Fail early: without a vectorizer there is nothing to score
    raise ValueError("type must be 'idf' or 'bow', got " + repr(type))

  X = vectorizer.fit_transform(list_content)

  # get_feature_names() was removed from recent scikit-learn releases;
  # fall back to it only when get_feature_names_out() is not available
  get_names = getattr(vectorizer, "get_feature_names_out", None)
  if get_names is None:
    get_names = vectorizer.get_feature_names

  # One row per vocabulary word with its score summed over all the texts
  temp_data = pd.DataFrame(np.asarray(X.sum(axis = 0)).ravel(), columns = ["score"])
  temp_data["word"] = get_names()
  temp_data = temp_data.sort_values(by = ["score"], ascending = False)

  # head(3) copes with vocabularies of fewer than three words
  return "/".join(temp_data["word"].head(3))

In [None]:
# Get the content of each article without the words present in the whitelist
def whitelist_content(list_content: list, list_whitelist: list)->np.ndarray:
  """Remove the whitelisted words from every text.

  Parameters:
      list_content: iterable of texts; each one is split on whitespace
      list_whitelist: words to remove from the texts

  Out:
      np.array of the texts rebuilt from the remaining words, joined with a
      single space, in the same order as list_content
  """
  def _strip(text):
    # Keep only the tokens that are not in the whitelist
    kept = (token for token in text.split() if token not in list_whitelist)
    return " ".join(kept)

  return np.array([_strip(text) for text in list_content])

In [None]:
# Build a title for each predicted cluster label
def get_label_title(data: pd.DataFrame,whitelist, model = None):
  """Build a title for every label found in data["prediction"].

  For each label, the five articles closest (cosine distance) to the centroid
  of the cluster are selected, the whitelisted words are removed from their
  content and the most frequent remaining words form the title.

  Parameters:
      data: dataframe holding "art_id", "art_content_clean_without_lem",
            "prediction" and the feature columns
      whitelist: words to exclude from the titles
      model: optional fitted model exposing cluster_centers_; when given, the
             values of "prediction" must be indices into cluster_centers_.
             When omitted, the centroid of a cluster is taken as the mean of
             the feature vectors of its articles.

  Out:
      themes: dict mapping each label to its title
  """
  meta_columns = ["art_id","art_content_clean_without_lem","prediction"]
  themes = {}
  # Loop on each label and build the title of the corresponding cluster
  for label in np.unique(data["prediction"]):
    sub_data = data[data["prediction"] == label]
    features = sub_data.drop(meta_columns, axis = 1).to_numpy(dtype = float)

    # Score against the centroid of THIS cluster (not always cluster 0) and do
    # not rely on a global model variable
    if model is not None:
      centroid = np.asarray(model.cluster_centers_[label], dtype = float)
    else:
      centroid = features.mean(axis = 0)

    scores = distance.cdist(features, centroid.reshape(1, -1), metric = "cosine").ravel()

    # Content of the five articles nearest to the centroid
    nearest = np.argsort(scores, kind = "stable")[:5]
    head_content = sub_data["art_content_clean_without_lem"].to_numpy()[nearest]
    head_content = whitelist_content(head_content,whitelist)
    themes[label] = obtain_word(head_content, type = "bow")
  return themes

## Third part :
We combine part one and part two into a single function.

In [None]:
# Cluster the dataset with the optimal number of clusters and return the titled predictions and the trained model
def predict_cluster(data: pd.DataFrame):
  """Cluster the articles and replace each cluster label by a title.

  Note that `data` is modified in place: a "prediction" column is added and
  ends up holding the title of the cluster of each article.

  Parameters:
      data: dataframe holding "art_id", "art_content_clean_without_lem" and
            the feature columns
  Output:
      data[["art_id","prediction"]]: the id of each article with the title of
      its cluster
      model: the model trained with the optimal number of clusters
  """
  # Feature matrix: every column except the identifier and the raw content
  features = data.drop(["art_id","art_content_clean_without_lem"], axis = 1)

  best_k = optimal_cluster(features)
  print("We take " + str(best_k) + " clusters.")

  fitted_model = training(features, best_k)
  data["prediction"] = predict(features, fitted_model)

  # Words that must not appear in the cluster titles
  whitelist = ["plus","tout","cette","ca","etre","dire","faut","fait","faire","donc"]
  titles = get_label_title(data, whitelist)
  data["prediction"] = data["prediction"].map(titles)

  return data[["art_id","prediction"]], fitted_model

## Fourth part :
Execution of all the functions

In [None]:
# We take out the model because we need to save it to predict later
# Keep the trained model alongside the predictions so it can be saved and reused later
output, model = predict_cluster(data)
output.head(5)

# Extended list of terms to exclude from the cluster titles
whitelist = [
  "plus","tout","cette","ca","etre","dire","faut","fait","faire","donc",
  "art_id","entreprise","gestion","entreprises","aussi","meme","bien","ete",
  "comme","etat","ministre","tres","encore","peut","dont","egalement",
  "notamment","ainsi","leurs","entre",
]

# Recompute the titles with the extended whitelist
get_label_title(data, whitelist)

HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))


We take 9 clusters.


NameError: ignored