In [1]:
"""
Created on Mon Jan 15 08:50 2020
Group 4
@authors: V.F and C.G.
"""

'\nCreated on Mon Jan 15 08:50 2020\nGroup 4\n@authors: V.F and C.G.\n'

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
new_facets = ['baggage_facet',
    'atmosphere_facet',
    'cabin_crew_facet',
    'comfort_facet',
    'food_facet',
    'not_flight_facet',
    'price_facet',
    'punctuality_facet'
              ]

In [3]:
dict_facets = {
    'baggage_facet': ['lose', 'let', 'bag', 'luggage', 'hand', 'people'],
    'atmosphere_facet': ['quiet', 'noise', 'calm', 'lounge', 'air'],
    'cabin_crew_facet': ['crew', 'cabin', 'helpful', 'professional', 'efficient', 'friendly'],
    'comfort_facet': ['seat', 'comfortable', 'legroom', 'room', 'leg', 'space', 'entertainment', 'uncomfortable'],
    'food_facet': ['food', 'meal', 'drink', 'snack', 'serve', 'beverage', 'tasty', 'wine', 'eat'],
    'not_flight_facet': ['check', 'boarding', 'customer', 'board', 'staff'],
    'price_facet': ['price', 'budget', 'cheap', 'cost', 'ticket', 'value', 'pay', 'reasonable'],
    'punctuality_facet': ['time', 'delay', 'arrival', 'arrive', 'late', 'schedule', 'departure']
}

In [4]:
glove_dict = {}
with open("../data/glove.twitter.27B.200d.txt") as file:
    # create a dictionnary that for each word gives its representation in a 200dimentionnal space
    for line in file:
        values = line.split(' ')
        glove_dict[values[0]] = np.array(values[1:]).astype('float')

In [30]:
def distance_top_word_facet(emb_dict: dict, new_facets: list, dict_facets: dict, top_word_df: pd.DataFrame):
    """Documentation

    Parameters : 
    emb_dict (dict): word as key and embedding as value
    new_facets (list): a list of new facets name
    dict_facets (dict): facet name as key and list of top describing words
    top_word_df (DataFrame): for each cluster gives the same number of top words

    Explanation:
    This function computes the cosine distance between each cluster and each facet.
    First we create a matrix in which for each top word in our vocabulary we save
    the mean of cosine distance with words in the lists describing the facets.
    The dataframe's shape is (nb_top_word, nb_facet).
    Then for each cluster we mean all the distances of the words describing it.

    Out:
    DataFrame with all the distance computed, its shape is (nb_cluster, nb_facet)


    """

    df_word_emb = pd.DataFrame()  # Dataframe with top words of facets at index,
    # embeddings dimension as columns plus a column for the facet associated
    for key, value in dict_facets.items():
        x = []
        for word in dict_facets[key]:
            x.append(emb_dict[word])
        df = pd.DataFrame(data=x, index=dict_facets[key])
        df_word_emb = df_word_emb.append(df)
        df_word_emb.loc[dict_facets[key], 'facets_label'] = key

    # all unique words in the top words lists of clusters
    voc = pd.unique(top_word_df.values.ravel('K'))

    # Dataframe with words at index and embeddings as columns
    word_emb_cluster = pd.DataFrame(
        index=[voc], columns=df_word_emb.columns[:-1])
    to_drop = []
    for word in voc:
        try:
            word_emb_cluster.loc[word, :] = glove_dict[word]
        except:
            to_drop += [word]
    word_emb_cluster.drop(to_drop, axis=0, inplace=True)

    # dataframe with cluster at index and facets's name as columns
    # and distance between them as values
    # this is the returned object
    df_dist = pd.DataFrame(
        columns=new_facets, index=range(len(top_word_df))).fillna(0)

    # Dataframe with the words in voc at index and facets at columns
    # and distance between them as values
    word_facet_dist = pd.DataFrame(
        columns=new_facets, index=voc).fillna(0).astype(float)

    for word in voc:  # calculate distances
        array = np.zeros((1, len(df_dist.columns)))
        for j in df_word_emb.index:
            try:
                vect = df_word_emb.loc[j].values[:-1]
                facet = df_word_emb.loc[j].values[-1]
                word_facet_dist.at[word, facet] += cosine_similarity(
                    [vect], word_emb_cluster.loc[word])[0][0]
            except:
                print('word not found')

    # normalize the distance by the number of word in each facet
    for facet in word_facet_dist.columns:
        word_facet_dist.loc[:, facet] = word_facet_dist.loc[:,
                                                            facet] / len(dict_facets[facet])
    
    # fill the dataframe that is returned
    for i in range(len(top_word_df.index)):
        array = np.zeros((1, len(df_dist.columns)))
        counter = 0
        for col in top_word_df.columns:
            try:
                word = top_word_df.loc[i, col]
                array = array + word_facet_dist.loc[word, :].values
                counter += 1
            except:
                print('word not found')

        try:
            # normalize by the number of word for which we have an embedding
            df_dist.loc[i, :] = array[0] / counter
        except:
            df_dist.loc[i, :] = -1
            print('all words not found')
    return df_dist

In [7]:
df_word_emb = pd.read_csv('emb_word_facet.csv', sep=',', index_col=0)

In [8]:
top_word_df = pd.read_csv('../data/g4_Kmean++_ALL_DATA_ANNOTATE_keyword_cluster.csv', sep=',', index_col=0)

In [29]:
distance_top_word_facet(glove_dict, new_facets, dict_facets, top_word_df)

  new_axis = axis.drop(labels, errors=errors)


a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a


a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a


a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a


a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a


a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a


Unnamed: 0,baggage_facet,atmosphere_facet,cabin_crew_facet,comfort_facet,food_facet,not_flight_facet,price_facet,punctuality_facet
0,5.451686,4.620694,4.542124,4.405324,4.592814,5.176321,5.077109,4.923107
1,6.300724,5.213376,5.111754,5.0499,6.828096,5.629395,5.981786,5.221892
2,6.025868,5.202011,4.817714,4.876532,5.261822,5.597331,5.407071,5.438359
3,4.973044,4.401879,4.28977,4.23571,4.130299,5.426676,5.20443,5.536078
4,5.59375,4.959468,4.869381,4.968164,4.731939,5.291621,5.352494,5.066746
5,5.871207,5.148326,5.219857,4.876137,4.979014,5.513393,5.493625,4.982257
6,5.637174,4.94863,4.369883,4.988104,4.516477,5.047021,4.978461,4.435452
7,7.205788,5.712275,5.285197,6.154499,6.146648,6.453192,6.602725,5.840295
8,5.756897,4.920598,4.574828,4.934683,4.885756,5.332568,5.279915,4.923244
9,6.961867,5.882378,5.628208,5.817566,5.715138,6.186265,6.269884,5.665463


In [None]:
tf_idf = pd.read_csv('../data/g4_Kmean++_ALL_DATA_ANNOTATE_tf_idf_unique.csv', sep=',', index_col=0)

In [None]:
tf_idf_mean = tf_idf.mean(axis=0)

In [None]:
# top_word_df = pd.read_excel('../data/g4_words_clusters.xlsx', index_col=0)

In [None]:
voc = pd.unique(top_word_df.values.ravel('K'))

In [None]:
word_emb_cluster = pd.DataFrame(index=[voc], columns=df_emb.columns[:-1])
to_drop = []
for word in voc:
    try:
        word_emb_cluster.loc[word, :] = glove_dict[word]
    except:
        to_drop += [word]
word_emb_cluster.drop(to_drop, axis=0, inplace=True)

In [None]:
df_dist = pd.DataFrame(
    columns=new_facets, index=range(len(top_word_df))).fillna(0)

In [None]:
word_facet_dist = pd.DataFrame(
    columns=new_facets, index=voc).fillna(0).astype(float)

In [None]:
for word in voc:
    array = np.zeros((1, len(df_dist.columns)))
    for j in df_word_emb.index:
        try:
            vect = df_word_emb.loc[j].values[:-1]
            facet = df_word_emb.loc[j][-1]
            word_facet_dist.at[word, facet] += cosine_similarity(
                [vect], word_emb_cluster.loc[word])[0][0] * (1.1 - tf_idf_mean[word] / max(tf_idf_mean))
        except:
            print('word not found')

In [None]:
for facet in word_facet_dist.columns:
    word_facet_dist.loc[:, facet] = word_facet_dist.loc[:, facet] / len(dict_facets[facet])

In [None]:
for i in range(len(top_word_df.index)):
    array = np.zeros((1,len(df_dist.columns)))
    counter = 0
    for col in top_word_df.columns:
        try:
            word = top_word_df.loc[i, col]
            array = array + word_facet_dist.loc[word, :].values
            counter +=1
        except:
            print('word not found')
    df_dist.loc[i, :] = array[0]