In [None]:
"""
Created on Mon Jan 15 08:50 2020
Group 4
@authors: Group 4
"""

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

import spacy

# Metric facet

In [2]:
def distance_top_word_facet(emb_dict: dict, new_facets: list, dict_facets: dict, top_word_df: pd.DataFrame):
    """Documentation

    Parameters : 
    emb_dict (dict): word as key and embedding as value
    new_facets (list): a list of new facets name
    dict_facets (dict): facet name as key and list of top describing words
    top_word_df (DataFrame): for each cluster gives the same number of top words

    Explanation:
    This function computes the cosine distance between each cluster and each facet.
    First we create a matrix in which for each top word in our vocabulary we save
    the mean of cosine distance with words in the lists describing the facets.
    The dataframe's shape is (nb_top_word, nb_facet).
    Then for each cluster we mean all the distances of the words describing it.

    Out:
    DataFrame with all the distance computed, its shape is (nb_cluster, nb_facet)


    """

    df_word_emb = pd.DataFrame()  # Dataframe with top words of facets at index,
    # embeddings dimension as columns plus a column for the facet associated
    for key, value in dict_facets.items():
        x = []
        for word in dict_facets[key]:
            x.append(emb_dict[word])
        df = pd.DataFrame(data=x, index=dict_facets[key])
        df_word_emb = df_word_emb.append(df)
        df_word_emb.loc[dict_facets[key], 'facets_label'] = key

    # all unique words in the top words lists of clusters
    voc = pd.unique(top_word_df.values.ravel('K'))

    # Dataframe with words at index and embeddings as columns
    word_emb_cluster = pd.DataFrame(
        index=[voc], columns=df_word_emb.columns[:-1])
    to_drop = []
    for word in voc:
        try:
            word_emb_cluster.loc[word, :] = glove_dict[word]
        except:
            to_drop += [word]
    word_emb_cluster.drop(to_drop, axis=0, inplace=True)

    # dataframe with cluster at index and facets's name as columns
    # and distance between them as values
    # this is the returned object
    df_dist = pd.DataFrame(
        columns=new_facets, index=range(len(top_word_df))).fillna(0)

    # Dataframe with the words in voc at index and facets at columns
    # and distance between them as values
    word_facet_dist = pd.DataFrame(
        columns=new_facets, index=voc).fillna(0).astype(float)

    for word in voc:  # calculate distances
        array = np.zeros((1, len(df_dist.columns)))
        for j in df_word_emb.index:
            try:
                vect = df_word_emb.loc[j].values[:-1]
                facet = df_word_emb.loc[j].values[-1]
                word_facet_dist.at[word, facet] += cosine_similarity(
                    [vect], word_emb_cluster.loc[word])[0][0]
            except:
                print('word not found')

    # normalize the distance by the number of word in each facet
    for facet in word_facet_dist.columns:
        word_facet_dist.loc[:, facet] = word_facet_dist.loc[:,
                                                            facet] / len(dict_facets[facet])
    
    # fill the dataframe that is returned
    for i in range(len(top_word_df.index)):
        array = np.zeros((1, len(df_dist.columns)))
        counter = 0
        for col in top_word_df.columns:
            try:
                word = top_word_df.loc[i, col]
                array = array + word_facet_dist.loc[word, :].values
                counter += 1
            except:
                print('word not found')

        try:
            # normalize by the number of word for which we have an embedding
            df_dist.loc[i, :] = array[0] / counter
        except:
            df_dist.loc[i, :] = -1
            print('all words not found')
    return df_dist

In [3]:
new_facets = ['baggage_facet',
    'atmosphere_facet',
    'cabin_crew_facet',
    'comfort_facet',
    'food_facet',
    'not_flight_facet',
    'price_facet',
    'punctuality_facet'
              ]

In [4]:
dict_facets = {
    'baggage_facet': ['lose', 'let', 'bag', 'luggage', 'hand', 'people'],
    'atmosphere_facet': ['quiet', 'noise', 'calm', 'lounge', 'air'],
    'cabin_crew_facet': ['crew', 'cabin', 'helpful', 'professional', 'efficient', 'friendly'],
    'comfort_facet': ['seat', 'comfortable', 'legroom', 'room', 'leg', 'space', 'entertainment', 'uncomfortable'],
    'food_facet': ['food', 'meal', 'drink', 'snack', 'serve', 'beverage', 'tasty', 'wine', 'eat'],
    'not_flight_facet': ['check', 'boarding', 'customer', 'board', 'staff'],
    'price_facet': ['price', 'budget', 'cheap', 'cost', 'ticket', 'value', 'pay', 'reasonable'],
    'punctuality_facet': ['time', 'delay', 'arrival', 'arrive', 'late', 'schedule', 'departure']
}

In [5]:
glove_dict = {}
with open("../data/glove.twitter.27B.200d.txt") as file:
    # create a dictionnary that for each word gives its representation in a 200dimentionnal space
    for line in file:
        values = line.split(' ')
        glove_dict[values[0]] = np.array(values[1:]).astype('float')

In [6]:
top_word_df = pd.read_csv('../data/g4_Kmean++_ALL_DATA_ANNOTATE_keyword_cluster.csv', sep=',', index_col=0)

distance_top_word_facet(glove_dict, new_facets, dict_facets, top_word_df) # exemple of how to call the function

  new_axis = axis.drop(labels, errors=errors)


word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not found
word not f

# Named entity research

In [None]:
# import csv files with entities that we want
airports = pd.read_csv('airport.csv', sep =',')
countries = pd.read_csv("countries.csv", sep = ',')
cities = pd.read_csv("cities.csv", sep = ',')
airlines_data = pd.read_csv('aircraft.csv', sep =',', index_col = 0)
aicrafts_data = pd.read_csv('airline.csv', sep =',', index_col = 0)

# we only keep the big airports
airports = airports[~airports.type.isin(['heliport', 'closed', 'ballonport', 'seaplane_base',
                                         'small_airport', 'medimum_airport'])]


In [None]:
data = pd.read_excel("../data//ALL_DATA_ANNOTATE.xlsx") # import data

In [None]:
# we use this spacy model to detect entity first
nlp = spacy.load('en_core_web_sm')

In [None]:
# we save the entities in list then we export them in a dataframe
cities_list = []
aircrafts = []
airlines = []
countries_list = []
airports_list = []

for i in range(len(data)):  # iterrate over all rows

    if comment != comment:
        continue
    comment = data["Sentence"].loc[i]
    # put Capital letters at the beginning of each words
    comment_title = comment.title()
    # put all the comment in capital letters
    comment_upper = comment.upper()

    entities = nlp(comment_title)
    cities_countries = []
    aeroports = []
    # iterrate over all entities found by the model
    for ent in entities.ents:

        if ent.label_ == 'GPE':  # GPE stand for geopolitical entity
            cities_countries.append(ent.text)

        if ent.label_ == 'FAC': # FAC stands for Facility
            aeroports.append(ent.text)
    airports_list.append(aeroports)

    # lookup in the csv files if the entities are present
    city = []
    country = []
    for entity in cities_countries:
        if entity in list(cities["name"]):
            city.append(entity)
            continue
        if entity in list(countries["name"]):
            country.append(entity)
            continue
    cities_list.append(city)
    countries_list.append(country)

    planes = []
    for plane in range(len(aicrafts_data)):
        if (aicrafts_data["name"].loc[plane]).upper() in comment_upper:
            planes.append(aicrafts_data["name"].loc[plane])
    aircrafts.append(planes)

    compagnies = []
    for compagnie in range(len(airlines_data)):
        if (airlines_data["name"].loc[compagnie]).upper() in comment_upper:
            compagnies.append(airlines_data["name"].loc[compagnie])
    airlines.append(compagnies)

In [None]:
# stock the results in a dataframe
pd.DataFrame(data=np.array([airports_list, cities_list, countries_list, aircrafts, airlines]).T,
             columns=['airport', 'city', 'country', 'aircraft', 'airlines']).to_csv('../Output/entite.csv', sep=',')