In [1]:
"""
Created on January 17
Group 4
Sub Group : Facet
@author : C.G  
"""

'\nCreated on January 17\nGroup 4\nSub Group : Facet\n@author : C.G  \n'

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

from scipy.spatial import distance

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [3]:
def words_facet(file, facet):

    """

    Train a random forest on the TF-IDF matrix to predict one facet and plot the
    words that matter most for that facet.

    Input
        file (String) -> name inserted in the paths '../data/g4_tfidf_<file>.csv'
                         and '../data/<file>_new_facets.csv'
        facet (String) -> Name of the facet column used as the target

    Output:
        Nothing is returned.
        The mean 5-fold cross-validation score is printed.
        A horizontal bar chart of the 20 largest feature importances is shown.

    """

    tf_idf = pd.read_csv('../data/g4_tfidf_'+file+'.csv',
                         sep=',', index_col=0)
    all_data_annotate_facets_group = pd.read_csv("../data/"+file+"_new_facets.csv", sep=',')
    # The first column left after index_col=0 is dropped before training.
    tf_idf = tf_idf.iloc[:, 1:]
    # NOTE(review): the TF-IDF index labels are used here as row *positions* of the
    # annotation file (iloc) - confirm that both files share the same row order.
    all_data_annotate = all_data_annotate_facets_group.iloc[tf_idf.index]
    # Missing annotations are treated as "facet absent".
    y = list(all_data_annotate[facet].fillna(0))

    # Seeded like every other estimator in this notebook so that the printed score
    # and the plotted importances are reproducible from one run to the next.
    clf = RandomForestClassifier(n_estimators=50, max_depth=50, random_state=42)
    scores = cross_val_score(clf, tf_idf, y, cv=5)
    print('Scores for facet ' + facet + ' : ', np.mean(scores))

    # Refit on all rows: cross_val_score works on clones and leaves clf unfitted.
    clf.fit(tf_idf, y)
    feat_importances = pd.Series(clf.feature_importances_, index=tf_idf.columns)
    feat_importances.nlargest(20).plot(kind='barh')
    plt.show()

In [4]:
def facet_for_sentence(words_predict, index, dict_facets, topic_predict, method_name, file):

    """

    This function 'automatically' assigns one or more facets to the sentences.

    A cluster receives a facet when at least one of its words appears in the
    word list of that facet; a cluster matching no facet receives 'empty'.
    Every sentence then inherits the facets of the cluster it was assigned to.

    Input
        words_predict (list) -> list of words for each cluster
        index (list-like) -> row labels of the sentences, same length as topic_predict
        dict_facets (dico) -> list of words for each facet, stored under the keys
                              '<facet>_facet' (e.g. 'food_facet')
        topic_predict (list) -> cluster number of each sentence
        method_name (String) -> Name of method use to predict our cluster
        file (String) -> filename use

    Output:
        facet prediction: DataFrame sentence X facet filled with 0/1, also saved to
        '../data/g4_facet_<method_name>_prediction_on_<file>.csv'.
        The columns are a flat index of facet names.

    """

    facet_names = ['atmosphere', 'baggage', 'cabin_crew', 'comfort',
                   'empty', 'food', 'not_flight', 'price', 'punctuality']
    column_of = {name: position for position, name in enumerate(facet_names)}

    # One 0/1 row per cluster; a facet is flagged once however many words match it.
    cluster_flags = np.zeros((len(words_predict), len(facet_names)), dtype=int)
    for cluster, words in enumerate(words_predict):
        matched = [name for name in facet_names
                   if name != 'empty'
                   and any(word in dict_facets[name + '_facet'] for word in words)]
        for name in matched or ['empty']:
            cluster_flags[cluster, column_of[name]] = 1

    # Each sentence copies the row of its cluster (a single indexed lookup instead
    # of one .loc assignment per sentence).
    clusters = np.asarray(topic_predict, dtype=int)
    df = pd.DataFrame(cluster_flags[clusters], index=index, columns=facet_names)

    df.to_csv('../data/g4_facet_'+method_name+'_prediction_on_'+file+'.csv', sep=',')

    return df
    

In [5]:
def LDA_tfidf(nb_cluster, file, dict_facets):

    """

    This function allows you to cluster sentences with an LDA method.

    Input
        nb_cluster (int) -> Number of clusters
        file (String) -> filename use
        dict_facets (dico) -> list of words for each facets

    Output:
        top words for each cluster (15 per cluster, weakest first)
        a cluster prediction for each sentence
        facet prediction

    Side effects:
        writes '../data/LDA_top_word_for_<n>_cluster.csv' and
        '../data/LDA_cluster_sentence_<file>.csv'; facet_for_sentence writes the
        facet prediction file.

    """

    tf_idf = pd.read_csv('../data/g4_tfidf_'+file+'.csv',
                          sep=',', index_col=0)
    tf_idf = tf_idf.iloc[:, 1:]

    columns = tf_idf.columns

    LDA = LatentDirichletAllocation(learning_method='batch', n_components=nb_cluster,
                                    random_state=42)
    LDA.fit(tf_idf)

    # components_ has one weight per fitted column, so a position returned by
    # argsort indexes `columns` directly (no offset).
    top_words_topic = [[columns[position] for position in topic.argsort()[-15:]]
                       for topic in LDA.components_]

    top_words = pd.DataFrame(top_words_topic)
    top_words.to_csv('../data/LDA_top_word_for_'+str(len(top_words_topic))+'_cluster.csv')

    # Each sentence is assigned to its most probable topic.
    topic_values = LDA.transform(tf_idf)
    topic_predict = topic_values.argmax(axis=1)

    cluster_prediction = pd.DataFrame(topic_predict, index = tf_idf.index, columns = ['cluster'])

    cluster_prediction.to_csv('../data/LDA_cluster_sentence_'+file+'.csv')

    facets_predictions = facet_for_sentence(top_words_topic, tf_idf.index, dict_facets, topic_predict, 'LDA', file)

    return top_words, cluster_prediction, facets_predictions

In [6]:
def nb_doc_per_cluster(cluster: np.array):

    """
    Documentation:
        This function prints how many documents fall in each cluster.
        Every label present in `cluster` is reported, even when the labels are
        not the contiguous range 0..k-1.

    Parameters:
        cluster: array from model.labels_

    Out :
        print the nb of document in each cluster, then the total number of
        elements next to the sum of the per-cluster counts

    """

    # One pass gives the distinct labels (sorted) and their number of occurrences.
    labels, counts = np.unique(cluster, return_counts=True)
    for label, nb_element in zip(labels, counts):
        print('Cluster ' + str(label) + ' has : ' +
              str(nb_element) + ' documents')

    print('Total : ' + str(len(cluster)) +
          ' element on ' + str(counts.sum()))
    
    
def one_word_per_cluster(clusters: MiniBatchKMeans, matrix_tfidf: pd.DataFrame) -> pd.DataFrame:

    """
    Documentation:
        This function return a DataFrame which represent the weight of the words in each cluster.
        A word can only be in an unique cluster: it is kept only when its largest
        center weight is more than twice its second largest one, and its weight
        is then set to 0 in every other cluster.
        With a single cluster there is no second weight; it is taken as 0, so
        every word with a positive weight is kept.

    Parameters:
        clusters: element given by a clustering fonction (here MiniBatchKMeans);
                  only its cluster_centers_ attribute is read
        matrix_tfidf: a tf_idf DataFrame Doc X Word (only its columns are read)

    Out :
        cluster_center: a DataFrame cluster X Word with one score per word (a word is in an unique cluster)

    """

    cluster_center = pd.DataFrame(clusters.cluster_centers_)
    cluster_center.columns = matrix_tfidf.columns

    weights = cluster_center.to_numpy()
    # Sorting each column once gives its largest and second largest weight.
    ordered = np.sort(weights, axis=0)
    largest = ordered[-1]
    runner_up = ordered[-2] if weights.shape[0] > 1 else np.zeros_like(largest)

    # A word is representative when it clearly dominates in one cluster.
    dominant = largest > 2 * runner_up

    # Keep the dominant words and zero their weight outside the winning cluster.
    is_winner = weights[:, dominant] == largest[dominant]
    return cluster_center.loc[:, dominant].where(is_winner, 0)


def get_keyword(cluster_center_unique: pd.DataFrame, nb_keyword: int) -> list:

    """
    Documentation:
        This function gives, for every cluster, its nb_keyword most heavily
        weighted words, the heaviest one first.

    Parameters:
        cluster_center_unique: a DataFrame cluster X Word with one score per word
        nb_keyword: number of keywords wanted per cluster

    Out :
        list_key_word: one list of nb_keyword words per cluster (row)

    N.B. :
        The selection is purely by rank: a cluster with only one or two
        meaningful words still returns nb_keyword entries. Filtering on the
        weight itself would be a possible refinement.
    """

    vocabulary = cluster_center_unique.columns
    list_key_word = []

    for _, weights in cluster_center_unique.iterrows():
        # Column positions sorted by increasing weight; the tail holds the heaviest.
        heaviest_last = list(np.argsort(weights)[-nb_keyword:])
        list_key_word.append([vocabulary[position]
                              for position in reversed(heaviest_last)])

    return list_key_word

In [7]:
def attrib_facet_to_cluster(keyword_per_cluster: pd.DataFrame,atmosphere_facet,
                            baggage_facet, cabin_crew_facet, comfort_facet,
                            food_facet, not_flight_facet, price_facet,punctuality_facet
                            ) -> pd.DataFrame:
    """
    Documentation:
        This function attrib the facets to the clusters.
        It create a link between the keyword list and the facets_lists:
        a cluster gets a facet when at least one of its keywords belongs to the
        word list of that facet, and 'EMPTY' when no facet matches.

    Parameters:
        keyword_per_cluster: DataFrame with one row of keywords per cluster
        atmosphere_facet, baggage_facet, cabin_crew_facet, comfort_facet,
        food_facet, not_flight_facet, price_facet, punctuality_facet:
            list of the words referenced by each facet

    Out :
        pd.DataFrame cluster X facette (on column). Each facet appears at most
        once per cluster, always in the order BAGGAGE, ATMOSPHERE, CABIN_CREW,
        COMFORT, FOOD, NOT_FLIGHT, PRICE, PUNCTUALITY; shorter rows are padded.

    """

    # A fixed order keeps the output identical from one run to the next.
    facet_words = [
        ('BAGGAGE', baggage_facet),
        ('ATMOSPHERE', atmosphere_facet),
        ('CABIN_CREW', cabin_crew_facet),
        ('COMFORT', comfort_facet),
        ('FOOD', food_facet),
        ('NOT_FLIGHT', not_flight_facet),
        ('PRICE', price_facet),
        ('PUNCTUALITY', punctuality_facet),
    ]

    facets_list = []

    for i in keyword_per_cluster.index:
        keywords = list(keyword_per_cluster.loc[i])
        facette = [label for label, words in facet_words
                   if any(word in words for word in keywords)]
        facets_list.append(facette or ['EMPTY'])

    return pd.DataFrame(facets_list)




def attrib_facet_to_doc(pd_rep_facet_to_cluster: pd.DataFrame, list_facets: list,
                        index: list, rep_cluster: np.array) -> pd.DataFrame:
    """
    Documentation:
        This function attrib the facets to the documents.
        It create a link between the DataFrame return by attrib_facet_to_cluster
        and the repartition of the document into the clusters return by cluster.labels_:
        each document receives the facets of the cluster it belongs to.

    Parameters:
        pd_rep_facet_to_cluster: DataFrame cluster X facet names (padding cells are ignored)
        list_facets: names of the facets, used as columns of the result
        index: labels of the documents, in the same order as rep_cluster
        rep_cluster: cluster of each document

    Out :
        pd.DataFrame doc X facette (on column), filled with 0/1

    Raises:
        KeyError when a cluster carries a facet that is not in list_facets

    """

    column_of = {facet: position for position, facet in enumerate(list_facets)}
    flags = np.zeros((len(index), len(list_facets)), dtype=int)

    # The facet columns of a cluster are resolved once and reused for all its documents.
    columns_of_cluster = {}
    for row_position, cluster_doc in enumerate(rep_cluster):
        if cluster_doc not in columns_of_cluster:
            columns_of_cluster[cluster_doc] = [
                column_of[facet]
                for facet in pd_rep_facet_to_cluster.loc[cluster_doc]
                if pd.notna(facet)
            ]
        # The flag goes on the row of the document, not on the row named after the cluster.
        for column in columns_of_cluster[cluster_doc]:
            flags[row_position, column] = 1

    return pd.DataFrame(flags, index=index, columns=list_facets)



def MSE(nb_cluster_max: int, matrix_tfidf: pd.DataFrame) -> print:
    """
    Documentation:
        This function plots the inertia of MiniBatchKMeans against the number of
        clusters, to help choosing that number.

    Parameters:
        nb_cluster_max: cluster counts 1 .. nb_cluster_max - 1 are fitted
        matrix_tfidf: a tf_idf DataFrame Doc X Word

    Out :
        plot of the inertia of each fitted model (curve labelled 'MSE')

    """

    n_components = np.arange(1, nb_cluster_max)

    # One model per candidate cluster count, all with the same seed.
    inertias = []
    for n in n_components:
        model = MiniBatchKMeans(n_clusters=n, random_state=42).fit(matrix_tfidf)
        inertias.append(model.inertia_)

    plt.plot(n_components, inertias, label='MSE')
    plt.legend(loc='best')
    plt.xlabel('n_components')

# Purity

In [8]:
def purity(df, col_cluster, col_label):
    """Documentation

    Average, over the clusters, of the share held by the most frequent label
    inside the cluster.

    Parameters :
    df (DataFrame) : one row per comment, with a cluster column and label columns
    col_cluster (String) : Name of column of clusters
    col_label (list of String) : Names of the label columns

    Out : mean purity over the clusters
    """
    total = 0
    for cluster_id in np.unique(df[col_cluster]):
        # Label totals of the comments of this cluster.
        label_totals = df.loc[df[col_cluster] == cluster_id, col_label].sum(axis=0)
        dominant_total = label_totals.nlargest(n=1).sum()
        total += dominant_total / label_totals.sum()

    return total / len(df[col_cluster].unique())


def purity_LDA(matrix_tfidf: pd.DataFrame, nb_clusters: int) -> print:
    """
    Documentation:
        This function clusters the documents with LDA (each document goes to its
        most probable topic) and returns the purity of these clusters against
        the annotated facets.

    Parameters:
        matrix_tfidf: a tf_idf DataFrame Doc X Word
        nb_clusters: number of LDA topics

    Out :
        purity score computed by purity() for this number of clusters
    """

    facet_columns = ['atmosphere', 'baggage', 'cabin_crew', 'comfort', 'empty', 'food', 'not_flight', 'price', 'punctuality']

    # The first column of the annotation file is not used.
    annotations = pd.read_csv(
        "../data/ALL_DATA_ANNOTATE_new_facets.csv", sep=',').iloc[:, 1:]

    lda_model = LatentDirichletAllocation(learning_method='batch', n_components=nb_clusters,
                                          random_state=42)
    lda_model.fit(matrix_tfidf)
    dominant_topic = lda_model.transform(matrix_tfidf).argmax(axis=1)

    assignment = pd.DataFrame(dominant_topic, index=matrix_tfidf.index,
                              columns=['cluster'])

    # Keep the annotated rows of the clustered documents and attach their cluster.
    labelled = annotations.loc[assignment.index].assign(cluster=assignment["cluster"])

    return purity(labelled, 'cluster', facet_columns)


def purity_Kmeans_pp(matrix_tfidf: pd.DataFrame, nb_clusters: int) -> print:
    """
    Documentation:
        This function clusters the documents with MiniBatchKMeans and returns the
        purity of these clusters against the annotated facets.

    Parameters:
        matrix_tfidf: a tf_idf DataFrame Doc X Word
        nb_clusters: number of clusters

    Out :
        purity score computed by purity() for this number of clusters
    """

    facet_columns = ['atmosphere', 'baggage', 'cabin_crew', 'comfort', 'empty', 'food', 'not_flight', 'price', 'punctuality']

    # The first column of the annotation file is not used.
    annotations = pd.read_csv(
        "../data/ALL_DATA_ANNOTATE_new_facets.csv", sep=',').iloc[:, 1:]

    model = MiniBatchKMeans(n_clusters=nb_clusters, random_state=42)
    model = model.fit(matrix_tfidf)

    assignment = pd.DataFrame(model.labels_,
                              index=matrix_tfidf.index, columns=['cluster'])

    # Keep the annotated rows of the clustered documents and attach their cluster.
    labelled = annotations.loc[assignment.index].assign(cluster=assignment["cluster"])

    return purity(labelled, 'cluster', facet_columns)

def purity_Kmeans(data, nb_cluster, index):
    """
    Documentation:
        This function clusters the rows of `data` with KMeans and returns the
        purity of these clusters against the annotated facets.

    Parameters:
        data: matrix to cluster, one row per comment
        nb_cluster: number of clusters
        index: labels of the annotated rows matching the rows of `data`, in order
    Out :
        purity score computed by purity() for this number of clusters
    """
    facet_columns = ['atmosphere', 'baggage', 'cabin_crew', 'comfort', 'empty', 'food', 'not_flight', 'price', 'punctuality']

    # The first column of the annotation file is not used.
    annotations = pd.read_csv(
        "../data/ALL_DATA_ANNOTATE_new_facets.csv", sep=',').iloc[:, 1:]
    selected = annotations.loc[index]

    model = KMeans(n_clusters=nb_cluster, random_state=42, n_init=30)
    model = model.fit(data)

    # Labels are attached by position: row k of `data` goes with index[k].
    labelled = selected.assign(cluster=model.labels_)

    return purity(labelled, 'cluster', facet_columns)

# Silhouette

In [9]:
def silouhette_LDA(nb_clusters: int, matrix_tfidf: pd.DataFrame) -> print:

    """
    Documentation:
        This function calculates the silhouette score of an LDA clustering in
        which each document is assigned to its most probable topic.

    Parameters:
        nb_clusters : number of topics to fit
        matrix_tfidf: a tf_idf DataFrame Doc X Word

    Out :
        silhouette result

    """

    lda_model = LatentDirichletAllocation(learning_method='batch', n_components=nb_clusters,
                                          random_state=42)
    lda_model.fit(matrix_tfidf)

    dominant_topic = lda_model.transform(matrix_tfidf).argmax(axis=1)

    return metrics.silhouette_score(matrix_tfidf, dominant_topic)


def silouhette_Kmeans_pp(nb_clusters: int, matrix_tfidf: pd.DataFrame) -> print:

    """
    Documentation:
        This function calculates the silhouette score of a MiniBatchKMeans
        clustering.

    Parameters:
        nb_clusters : number of clusters to fit
        matrix_tfidf: a tf_idf DataFrame Doc X Word

    Out :
        silhouette result

    """

    model = MiniBatchKMeans(n_clusters=nb_clusters, random_state=42)
    model = model.fit(matrix_tfidf)
    labels = model.labels_

    return metrics.silhouette_score(matrix_tfidf, labels)



def silhouette_Kmeans(nb_clusters: int, matrix: pd.DataFrame) -> print:
    """Documentation

    Silhouette score of a KMeans clustering (30 initialisations, fixed seed).

    Parameters :
    nb_clusters (Int) : Numbers of clusters
    matrix (DataFrame) : Matrix of coordinates of comments

    Out : silhouette score of the fitted labels
    """
    model = KMeans(n_clusters=nb_clusters, random_state=42, n_init=30)
    model = model.fit(matrix)
    labels = model.labels_

    return metrics.silhouette_score(matrix, labels)


In [10]:
def database_read(file, weighted=False):
    """Documentation

    Load an embedding file and split it into features, index and comments.

    Parameters :
    file (String): Name inserted in the '../data/g4_embedding_<file>.csv' path
                   (and '../data/g4_weight_embedding_<file>.csv' when weighted)
    weighted (Bool): read the weighted embedding instead of the plain one

    Out : data, index, comments
    data (DataFrame) : embedding columns only, rows with missing values removed
    index : labels of the kept rows
    comments (Array) : comments of the kept rows
    """
    if weighted:
        # The second column of the weighted file is its index.
        data = pd.read_csv("../data/g4_weight_embedding_" +
                           file + ".csv", index_col=1)
        # Comments are only stored in the plain embedding file.
        data_for_comments = pd.read_csv(
            "../data/g4_embedding_" + file + ".csv")
        data = data.dropna()
        index = data.index
        data = data.iloc[:, 1:]
        comments = np.array(data_for_comments['commentaire'].loc[data.index])
        return data, index, comments

    raw = pd.read_csv("../data/g4_embedding_" +
                      file + ".csv")
    kept = raw.dropna()
    index = np.array(kept['index'].astype(int))
    comments = np.array(kept['commentaire'])
    # What remains once the two bookkeeping columns are removed is the embedding.
    data = kept.drop(columns=['index', 'commentaire'])

    return data, index, comments



def cluster_embedding(data, index, comments, nb_clusters):
    """Documentation

    Cluster the comment embeddings with KMeans, measure how far each comment is
    from the center of its cluster, and extract the top TF-IDF words of the
    10 comments closest to each center.

    Parameters :
    data (DataFrame) : Embedding of the comments, one row per comment
    index (Array) : Index of the comments
    comments (Array) : All clean comments
    nb_clusters (Int) : Number of clusters

    Out : matrix, words
    matrix (DataFrame) : Table with columns 'index', 'commentaire', 'cluster'
                         and 'distance'; each column keeps its own dtype
    words (list) : for each cluster, up to 10 words with the highest summed
                   TF-IDF weight over its 10 closest comments

    Side effect : writes the words to "../data/g4_words_clusters.xlsx"
    """

    kmeans = KMeans(n_clusters=nb_clusters, random_state=42,
                    n_init=20).fit(data)  # create clusters

    rep_cluster = kmeans.labels_  # list of clusters of comments
    centers = kmeans.cluster_centers_

    # Distance between each comment and the gravity center of its cluster.
    # A failure here is raised instead of being printed and replaced by the
    # distance of the previous comment.
    distances = [distance.euclidean(centers[label], np.array(data.iloc[i]))
                 for i, label in enumerate(rep_cluster)]

    # Built column by column so that numbers stay numbers whatever the dtype of
    # the comments is.
    matrix = pd.DataFrame({'index': np.asarray(index),
                           'commentaire': np.asarray(comments),
                           'cluster': rep_cluster,
                           'distance': distances})

    words = []

    for i in range(nb_clusters):

        # 10 comments of cluster i closest to its gravity center
        closest = matrix[matrix['cluster'] == i].sort_values(by='distance')[0:10]
        commentary = closest['commentaire'].dropna()

        vectorizer = TfidfVectorizer(stop_words="english", min_df=0.1)
        X = vectorizer.fit_transform(commentary)  # creating tf-idf matrix

        # get_feature_names() is missing from recent scikit-learn releases and
        # get_feature_names_out() from old ones: use whichever is available.
        if hasattr(vectorizer, 'get_feature_names_out'):
            feature_names = vectorizer.get_feature_names_out()
        else:
            feature_names = vectorizer.get_feature_names()

        # transforming in data frame
        M = pd.DataFrame(X.toarray(), columns=feature_names)

        words.append(M.sum().sort_values(ascending=False)[0:10].index)

    pd.DataFrame(words).to_excel("../data/g4_words_clusters.xlsx")

    return matrix, words







In [11]:
def features_labels_facet(tf_idf,facet,metric):
    """
    Documentation:
    this function builds the features and the labels of one facet:
    the label is 1 if the document talks about the facet and 0 if not.
    The dataframe given by the caller is left untouched.

    Parameters:
        tf_idf: dataframe tf_idf
        facet: one facet
        metric : dataframe holding a 0/1 column named after the facet

    Output:
        result: features et labels (two arrays, rows sorted by index)
    """

    # Work on a copy: the label column must not end up in the caller's frame,
    # where it would be used as a feature by the next call.
    tf_idf=tf_idf.copy()
    tf_idf[facet]=0

    # A document flagged in metric but absent from tf_idf is added as a new row
    # (filled with 0 below).
    for ind in metric.index[metric[facet]==1]:
        tf_idf.loc[ind,[facet]]=1

    # NOTE(review): astype(int) truncates every weight to an integer - confirm
    # that this is intended if tf_idf holds fractional weights.
    tf_idf=tf_idf.fillna(0).astype(int)
    tf_idf=tf_idf.sort_index(axis=0)

    labels=np.array(tf_idf[facet])
    tf_idf=tf_idf.drop(facet,axis=1)
    features= np.array (tf_idf)

    return features,labels

def facet_score(tf_idf,facet,metric):
    """
    Documentation:
    this function calculates the prediction score of a facet using the random forest classifier.
    The classifier is trained on 75% of the documents and scored on the
    remaining 25%, which it has never seen.

    Parameters:
        tf_idf: dataframe tf_idf
        facet: one facet
        metric : grouping facet and metric

    Output:
        result: predictive facet score (mean accuracy on the held-out documents)
    """

    features,labels=features_labels_facet(tf_idf,facet,metric)

    train_features, test_features, train_labels, test_labels = train_test_split (features, labels, test_size = 0.25, random_state = 42)

    rf = RandomForestClassifier (n_estimators = 100, random_state = 42)

    rf.fit (train_features, train_labels)

    # Scoring on the held-out split only: including the training rows would
    # inflate the score.
    return rf.score(test_features,test_labels)
    

In [12]:
def get_dict_cluster(tf_idf, n_clusters,n_terms):
    """
    Documentation:
        this function associates n_clusters clusters ,
        their n_terms terms of strong weight with the kmeans classification.
        A term is only eligible for the cluster in which its mean weight is the
        highest (the first such cluster when several are tied).

    Parameters:
        tf_idf: dataframe tf_idf
        n_clusters: the number of clusters.
        n_terms : the n terms with the highest cluster weight

    Output:
        result: dictionary keyed 1..n_clusters; each value lists at most n_terms
        terms of the cluster, the heaviest first
    """

    clusters = MiniBatchKMeans(n_clusters=n_clusters, random_state=42).fit_predict(tf_idf)

    # to_dense() only exists on the sparse frames of old pandas releases.
    dense = tf_idf.to_dense() if hasattr(tf_idf, 'to_dense') else tf_idf
    df = pd.DataFrame(dense).groupby(clusters).mean()

    dict_cluster = {i + 1: [] for i in range(n_clusters)}
    dict_max = {i + 1: [] for i in range(n_clusters)}

    # Owner of each term: first cluster reaching the maximum mean weight.
    # Assumes the cluster means hold no missing value.
    owner_rows = df.to_numpy().argmax(axis=0)
    for col, row_position in zip(df.columns, owner_rows):
        dict_max[df.index[row_position] + 1].append(col)

    for i, r in df.iterrows():
        owned = set(dict_max[i + 1])
        wanted = min(n_terms, len(dict_max[i + 1]))

        # Walk the terms from the heaviest to the lightest (sorted once per
        # cluster) and keep the ones owned by this cluster.
        for t in np.argsort(r.to_numpy())[::-1]:
            if len(dict_cluster[i + 1]) >= wanted:
                break
            term = tf_idf.columns[t]
            if term in owned:
                dict_cluster[i + 1].append(term)

    return dict_cluster

def transform_list_doc (comments,index_document):
    """
    Documentation:
        this function turns the entry of one document into a list of tokens.
        The entry is converted to its text representation, commas are removed,
        the text is split on whitespace, and the first token and the last four
        tokens are discarded.

    Parameters:
        comments: commentary from 'OTHER_DATA_ANNOTATE.xlsx'
        index_document: index of commentary

    Output:
        result: list of tokens

    NOTE(review): presumably `comments` is a one-column DataFrame, so that the
    discarded tokens are the column name and the 'Name: ..., dtype: ...' footer
    of the printed row - confirm against the caller. The printed form may
    shorten long texts.
    """
    as_text = str(comments.loc[index_document])
    tokens = as_text.replace(",", "").split()

    return tokens[1:-4]

def cluster_belong_document(dict_cluster,comments):
    """
    Documentation:
        this function associates each cluster with a document list:
        a document belongs to a cluster as soon as one of its tokens is one of
        the terms of the cluster.

    Parameters:
        dict_cluster: dictionary of cluster (cluster -> list of terms)
        comments: commentary from 'OTHER_DATA_ANNOTATE.xlsx'

    Output:
        result: dictionary cluster -> list of the indexes of its documents

    """
    dict_cluster_doc = {cluster: [] for cluster in dict_cluster.keys()}

    for ind in comments.index:
        tokens = transform_list_doc(comments, ind)
        for cluster in dict_cluster:
            terms = dict_cluster[cluster]
            # One shared token is enough; the document is listed once per cluster.
            if any(token in terms for token in tokens):
                dict_cluster_doc[cluster].append(ind)

    return dict_cluster_doc

def comment_surround_facet(facet,comments):
    """
    Documentation:
        this function returns the tokens found immediately before and after
        every occurrence of a facet in the comments.
        A token counts as an occurrence when it equals the lower-cased facet or,
        failing that, one of the words of the facet.

    Parameters:
        facet: one facet
        comments: comments from 'OTHER_DATA_ANNOTATE.xlsx'

    Output:
        result: list of the neighbouring tokens (previous one first), with
        repetitions

    """
    facet_words = facet.split()
    list_comment = []

    for doc_label in comments.index:
        tokens = transform_list_doc(comments, doc_label)
        last_position = len(tokens) - 1

        for position, token in enumerate(tokens):
            if token == facet.lower():
                occurrences = 1
            else:
                # Each word of the facet equal to the token counts separately.
                occurrences = facet_words.count(token)

            for _ in range(occurrences):
                if position > 0:
                    list_comment.append(tokens[position - 1])
                if position < last_position:
                    list_comment.append(tokens[position + 1])

    return list_comment


def comment_frequent_facet(facet,comments):
    """
    Documentation :
        this function returns the tokens that appear most often among the
        tokens surrounding the facet

    Parameters:
        facet: one facet (lower-cased before the search)
        comments: comments from 'OTHER_DATA_ANNOTATE.xlsx'

    Output:
        result: list of the most frequent surrounding tokens, in order of first
        appearance; all the tied tokens are returned, and the list is empty when
        the facet has no surrounding token

    """

    list_facet=comment_surround_facet(facet.lower(),comments)

    # Single pass count; the dictionary keeps the order of first appearance.
    dict_facet={}
    for s in list_facet:
        dict_facet[s]=dict_facet.get(s,0)+1

    if not dict_facet:
        return []

    # Computed once instead of once per token.
    highest=max(dict_facet.values())
    list_comment_frequent=[c for c,v in dict_facet.items() if v==highest]

    return list_comment_frequent


def cluster_belong_facet(dict_cluster,comments,facets,dict_facets_ref):

    """
    Documentation:
        this function associates each cluster with a facet list.
        A cluster receives a facet when one of its terms is the lower-cased
        facet itself, one of the most frequent tokens surrounding the facet in
        the comments, or one of the reference words of the facet.

    Parameters:
        dict_cluster: dictionary of cluster (cluster -> list of terms)
        comments: comments from 'OTHER_DATA_ANNOTATE.xlsx'
        facets : list of facet
        dict_facets_ref: words referring to each facet

    Output:
        result: dictionary whose keys are clusters and values the facets

    """
    dict_cluster_facet = {cluster: [] for cluster in dict_cluster.keys()}

    for facet in facets:
        frequent_neighbours = comment_frequent_facet(facet, comments)
        reference_words = dict_facets_ref[facet]
        facet_word = facet.lower()

        for cluster, terms in dict_cluster.items():
            # One matching term is enough; the facet is listed once per cluster.
            if any(term == facet_word
                   or term in frequent_neighbours
                   or term in reference_words
                   for term in terms):
                dict_cluster_facet[cluster].append(facet)

    return dict_cluster_facet

def df_doc_facet(facets,comments,dict_facets_ref,tf_idf,dict_cluster):
    """
    Documentation:
        this function creates the document and facet dataframe.
        Every document of a cluster receives 1 for every facet of that cluster.

    Parameters:
        facets: facets, used as columns
        comments: comments from 'OTHER_DATA_ANNOTATE.xlsx'
        dict_facets_ref: words referring to each facet
        tf_idf : dataframe tf_idf (gives the number of rows and the index)
        dict_cluster: dictionary of cluster (cluster -> list of terms)

    Output:
        result: dataframe document X facet filled with 0/1, with lower-case
        column names in which the multi-word facets use underscores

    """

    dict_cluster_facet=cluster_belong_facet(dict_cluster,comments,facets,dict_facets_ref)
    dict_cluster_doc=cluster_belong_document(dict_cluster,comments)

    empty_flags = np.zeros((tf_idf.shape[0], len(facets)), dtype='int')
    df_facet = pd.DataFrame(empty_flags, index=tf_idf.index, columns=facets)

    for cluster, cluster_facets in dict_cluster_facet.items():
        if cluster not in dict_cluster_doc:
            continue
        cluster_docs = dict_cluster_doc[cluster]
        for vf in cluster_facets:
            for vd in cluster_docs:
                df_facet.loc[vd,[vf]]=1

    new_names = {'CABIN CREW':'CABIN_CREW','LAVATORY SPACE':'LAVATORY_SPACE','LOST BAGGAGE':'LOST_BAGGAGE','CHECK IN':'CHECK_IN','ATTRACTIVE':'ATTRACTIVE_AIRCRAFT'}
    df_facet = df_facet.rename(columns=new_names)
    df_facet.columns = [name.lower() for name in df_facet.columns]
    return df_facet


def df_doc_new_facet(facets,comments,dict_facets_ref,tf_idf,dict_cluster):
    """
    Documentation:
        this function creates the document and news facets dataframe.
        Every document of a cluster receives 1 for every facet of that cluster.
        The 'EMPTY' column starts at 1 and is set to 0 for every document that
        receives at least one facet.

    Parameters:
        facets: facets, used as columns
        comments: comments from 'OTHER_DATA_ANNOTATE.xlsx'
        dict_facets_ref: words referring to each facet
        tf_idf : dataframe tf_idf (gives the number of rows and the index)
        dict_cluster: dictionary of cluster (cluster -> list of terms)

    Output:
        result: dataframe document X facet filled with 0/1, columns sorted by
        name and lower-cased

    """

    dict_cluster_facet=cluster_belong_facet(dict_cluster,comments,facets,dict_facets_ref)
    dict_cluster_doc=cluster_belong_document(dict_cluster,comments)


    dim = (tf_idf.shape[0], len(facets))
    df_facet = np.zeros(dim, dtype='int')
    df_facet = pd.DataFrame(df_facet, index=tf_idf.index, columns=facets)

    df_facet['EMPTY']=1

    for key_facet,val_facet in dict_cluster_facet.items():
        for key_doc, val_doc in dict_cluster_doc.items():
            if key_facet==key_doc:
                for vf in val_facet:
                    for vd in val_doc:
                        df_facet.loc[vd,[vf]]=1
                        df_facet.loc[vd,['EMPTY']]=0


    df_facet=df_facet.rename(columns={'CABIN CREW':'CABIN_CREW'})
    # Reorder the columns themselves: sorting only the labels would attach the
    # names to the data of other facets.
    df_facet=df_facet.sort_index(axis=1)
    df_facet.columns=[c.lower() for c in df_facet.columns]


    return df_facet


def create_dict_new_facet(facets):

    """
    Documentation:
        this function creates a dictionnary by regrouping the reference words of
        the old facets under the new facets

    Parameters:
        facets: not used - the old facets are listed inside the function

    Output:
        result: dictionnary new facet -> reference words of the old facets
        grouped under it

    NOTE(review): the names of the new facets are read from a variable
    `facets_new` that is not defined in this function; it has to exist at module
    level before the call. Confirm whether the `facets` argument was meant to
    carry that list.
    """

    # The old facets
    old_facets = ['SEAT','BED','IFE','FOOD','NOISE','TEMPERATURE','HUMIDITY','CABIN CREW','LAVATORY SPACE','PRICE',
                  'LOST BAGGAGE','CHECK IN','PUNCTUALITY','ATTRACTIVE','SAV','BOARDING','GENERAL']

    # words referring to each old facet
    dict_facets_ref={
        old_facets[0]:['foot','leg','legroom','room','window','space','seating','place','spacious'],
        old_facets[1]:['sleep','flat'],
        old_facets[2]:['screen','entertainment','movie'],
        old_facets[3]:['drink','meal','vegetarian','water','tray','table'],
        old_facets[4]:['bad','terrible','horrible'],
        old_facets[5]:['cold','heat','warm'],
        old_facets[6]:['moisture','damp','wet'],
        old_facets[7]:['crew','cabin','english','spanish','entertainment','class',],
        old_facets[8]:['aisle','toilet'],
        old_facets[9]:['poor','budget','pay','value','fee','expensive'],
        old_facets[10]:['lost','missing','bag','luggage','baggage'],
        old_facets[11]:['check','log','register'],
        old_facets[12]:['time','hour','minute','year','delay'],
        old_facets[13]:['good','great','wonderful','excellent','efficient','need','comfort','comfortable','fine','cool','sweet','quality','amazing','nice'],
        old_facets[14]:['service','information','communication'],
        old_facets[15]:['terminal','board','departure','departure','leave','gate'],
        old_facets[16]:['member','professional','airliner','chair','customer','trip','agent','company','travel','passenger','fly','aircraft','airplane','airport','plane','staff']
    }

    # Position of a new facet in facets_new -> positions of the old facets
    # merged into it, in the order in which their words are appended.
    regrouping = [
        (0, [0, 1, 2, 8]),
        (1, [10]),
        (2, [7]),
        (3, [4, 5, 6]),
        (4, [16]),
        (5, [3]),
        (6, [11, 15, 14]),
        (7, [9]),
        (8, [12]),
    ]

    dict_facets_new_ref = {}
    for new_facet in facets_new:
        dict_facets_new_ref[new_facet] = []

    for new_position, old_positions in regrouping:
        for old_position in old_positions:
            for word in dict_facets_ref[old_facets[old_position]]:
                dict_facets_new_ref[facets_new[new_position]].append(word)

    return dict_facets_new_ref

# END