In [None]:
"""
    Created on 16 January 2020
    Group 4
    Authors : Facet group
"""

# Imports

In [1]:
import pandas as pd
import string
import numpy as np
import sklearn
import spacy
import re

import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_short
from gensim.parsing.preprocessing import remove_stopwords

import scipy

from langdetect import detect

from tqdm import tqdm_notebook

import time

In [None]:
# Module-level WordNet lemmatizer instance.
# NOTE(review): none of the functions visible in this notebook reference it
# (clean_data_apply lemmatizes through spaCy's token.lemma_) — confirm it is still needed.
lemmatizer = WordNetLemmatizer()

In [None]:
# Build `dico`, the word -> vector lookup used by calcul_coordinates() and coordinates().
# Each line of the file is split on single spaces: the first field is the token,
# the remaining fields are parsed as floats.
# NOTE(review): presumably 200 values per line (file name and the 200-wide arrays
# used further down) — confirm against the data file.
dico = {}
with open("../data/glove.twitter.27B.200d.txt", encoding="utf8") as file:
    for line in file:
        values = line.split(' ')
        # values[0] is the token; everything after it is the embedding
        dico[values[0]] = np.array(values[1:]).astype('float')

# Functions

In [None]:
#To read data

def database(file, column, extension='xlsx'):
    """Load a data file from ``../data`` and return its comment column.

    Parameters:
        file : base name of the file to import, without directory or extension
        column : name of the column holding the comments
        extension : 'xlsx' (default) or 'csv'

    Out:
        DataFrame with two columns: "index" (row position in the original
        file, assigned before de-duplication) and ``column``. Rows whose
        comment duplicates an earlier one are dropped (first occurrence kept).

    Raises:
        ValueError : if ``extension`` is neither 'xlsx' nor 'csv'
    """
    if extension == 'xlsx':
        data = pd.read_excel("../data/" + file + ".xlsx")  # import data
    elif extension == 'csv':
        data = pd.read_csv("../data/" + file + ".csv", sep=',')  # import data
    else:
        # Previously this branch only printed a message and then failed later
        # with an UnboundLocalError on `data`; fail early with a clear error.
        raise ValueError(
            "This function does not take into account this type of file, "
            "be sure to use a csv or xlsx format (got %r)" % (extension,)
        )

    # Remember each row's original position before duplicates are removed.
    data['index'] = range(len(data))
    data = data.drop_duplicates(subset=[column], keep='first')
    return data.loc[:, ["index", column]]

In [None]:
#To clean data

def delete_url(sentence):
    """
    Replace every URL found in a comment by a single space.

    Parameters:
        sentence : the comment; it is converted with str() before matching

    Out
        the comment as a string, with each URL replaced by ' '
    """
    url_pattern = r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*'
    text = str(sentence)
    return re.sub(url_pattern, ' ', text)

def delete_points(sentence):
    """
    Replace the French section markers 'Points positifs' and 'Points négatifs'
    by a space (they would otherwise bias the language detection).

    Parameters:
        sentence : the comment (string)

    Out
        the comment without those markers
    """
    for marker in ('Points positifs', 'Points négatifs'):
        sentence = sentence.replace(marker, ' ')
    return sentence

def delete_verified(sentence):
    """
    Replace the review-site badges 'Trip Verified', 'Not Verified' and
    'Verified Review' by a space.

    Parameters:
        sentence : the comment (string)

    Out
        the comment without those badges
    """
    # Order matters: 'Not Verified' must be handled before 'Verified Review'.
    badges = ('Trip Verified', 'Not Verified', 'Verified Review')
    for badge in badges:
        sentence = sentence.replace(badge, ' ')
    return sentence

def treating_hashtag(sentence):
    """
    Append the words that follow a '#' to the end of the comment, so that
    hashtag words appear twice.

    Parameters:
        sentence : the comment (string)

    Out
        the original comment, a space, then the hashtag words joined by spaces
        (the trailing space is added even when there is no hashtag)
    """
    tagged_words = re.findall(r"#(\w+)", str(sentence))
    suffix = ' '.join(tagged_words)
    return sentence + ' ' + suffix

def treating_at_sign(sentence):
    """
    Append the words that follow an '@' (user names / mentions) to the end of
    the comment, so that mentioned names appear twice.

    Parameters:
        sentence : the comment (string)

    Out
        the original comment, a space, then the mentioned names joined by
        spaces (the trailing space is added even when there is no mention)
    """
    # The pattern used to look for '#', which duplicated hashtags a second
    # time and never picked up any mention.
    mentions = re.findall(r"@(\w+)", str(sentence))
    return sentence + ' ' + ' '.join(mentions)

def clean_data_apply(sentence, nlp_en):
    """
    Clean one comment and return its lemmatized English form.

    Parameters:
        sentence : raw comment (string)
        nlp_en : callable turning a string into an iterable of tokens that
                 expose ``lemma_`` (a loaded spaCy pipeline in this notebook)

    Out
        the cleaned comment, or '' when the comment is empty, is not detected
        as English, or could not be processed
    """
    # Successive string clean-ups: boilerplate markers, URLs, hashtag/mention
    # duplication, then punctuation.
    cleaning_steps = (
        delete_points,
        delete_verified,
        delete_url,
        treating_hashtag,
        treating_at_sign,
        strip_punctuation,
    )
    for step in cleaning_steps:
        sentence = step(sentence)

    comments = nlp_en(sentence.lower())  # lower-cased, tokenized comment
    if len(comments) == 0:
        print('Comment empty')
        return ''

    clean_sentence = ''
    try:
        if detect(str(comments)) == 'en':  # keep English comments only
            lemmas = ''.join(token.lemma_ + ' ' for token in comments)
            lemmas = lemmas.replace('-PRON-', '')  # drop the pronoun placeholder lemma
            clean_sentence = remove_stopwords(strip_short(lemmas))
    except Exception:
        # `except Exception` instead of a bare `except:` so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print('Comment not English')

    return clean_sentence

In [2]:
def create_tfidf_apply(col_com, file, mindf = 5, maxdf = 0.7):
    """Compute the tf-idf matrix of cleaned comments and save it as csv.

    Parameters:
    col_com : pandas Series containing cleaned comments
    file : name used in the output path ../data/g4_tfidf_<file>.csv
    mindf : min_df passed to TfidfVectorizer (default 5)
    maxdf : max_df passed to TfidfVectorizer (default 0.7)

    Out:
    None; the matrix is written to disk, indexed like col_com
    """

    vectorizer = TfidfVectorizer(stop_words="english", min_df=mindf, max_df=maxdf)
    X = vectorizer.fit_transform(col_com)  # sparse tf-idf matrix

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocabulary = list(vectorizer.get_feature_names_out())
    else:
        vocabulary = vectorizer.get_feature_names()

    M = pd.DataFrame(X.toarray(), columns=vocabulary)  # dense data frame, one column per word
    M.set_index(col_com.index).to_csv("../data/g4_tfidf_" + file + ".csv", sep = ",")  # saving matrix

    print(file + ' TF-IDF Matrix saved')

    
def create_tf(sentences_clean, file):
    """Compute the term-frequency (count) matrix of cleaned comments and save it as csv.

    Parameters:
    sentences_clean : iterable of (index, cleaned comment) pairs
    file : name used in the output path ../data/g4_tf_<file>.csv

    Out:
    None; the file has an 'index' index column, a 'commentaire' column and
    one count column per retained word
    """

    comments = [pair[1] for pair in sentences_clean]
    index = [pair[0] for pair in sentences_clean]

    vectorizer = CountVectorizer(stop_words="english", min_df=5, max_df=0.7)
    X = vectorizer.fit_transform(comments)  # sparse count matrix

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocabulary = list(vectorizer.get_feature_names_out())
    else:
        vocabulary = list(vectorizer.get_feature_names())

    M = pd.DataFrame(X.toarray(), columns=vocabulary)

    # Put the index and the comment text in front of the counts.
    tf = np.concatenate((pd.DataFrame(index), pd.DataFrame(comments), M), axis=1)
    col = ['index', 'commentaire'] + vocabulary

    pd.DataFrame(tf, columns = col).set_index('index').to_csv("../data/g4_tf_" + file + ".csv", sep = ",")  # saving matrix

    print(file + ' TF Matrix saved')

In [None]:
def TF_IDF_apply(file, column = 'REVIEW', extension = 'xlsx'):
    """Clean every comment of a data file and save the cleaned comments as csv.

    Parameters:
        file : base name of the file to use (looked up in ../data)
        column : name of the column containing comments (default = REVIEW)
        extension : 'xlsx' (default) or 'csv', forwarded to database()

    Out:
        None; writes ../data/clean_sentences_<file>.csv with one
        'data_clean' column
    """
    # Work on a fresh frame rather than mutating the loader's result in place.
    sentences = database(file, column, extension).dropna(subset=[column]).copy()
    nlp_en = spacy.load('en_core_web_sm')
    # Series.apply (rather than a row-wise DataFrame.apply) also returns a
    # Series when the frame is empty, so the column assignment cannot fail.
    sentences['data_clean'] = sentences[column].apply(lambda text: clean_data_apply(text, nlp_en))
    sentences[['data_clean']].to_csv('../data/clean_sentences_'+file+'.csv', sep = ',')
    # The tf-idf step itself is currently disabled:
    #create_tfidf_apply(sentences['data_clean'], file)

    
def TF(file, column = 'REVIEW'):
    """Clean every comment of an xlsx file and save its term-frequency matrix.

    Parameters:
        file : base name of the xlsx file to use (looked up in ../data)
        column : name of the column containing comments (default = REVIEW)

    Out:
        None; create_tf() writes the resulting csv
    """
    sentences = database(file, column).dropna(subset=[column])
    # This function used to call `clean_data`, which is not defined anywhere
    # in this notebook (NameError). Build the (index, cleaned comment) pairs
    # that create_tf() expects with clean_data_apply() instead.
    # NOTE(review): confirm this matches what the removed helper returned.
    nlp_en = spacy.load('en_core_web_sm')
    sentences_clean = [
        (position, clean_data_apply(text, nlp_en))
        for position, text in zip(sentences['index'], sentences[column])
    ]
    create_tf(sentences_clean, file)

In [None]:
#To cut comment into sentences

def detect_sentences(comment):
    """Split a whole comment into its sentences.

    Parameters :
        comment : the comment to slice into sentences

    Out :
        what the NLTK English punkt tokenizer returns for the comment
        (the sentences of the comment)
    """
    punkt_english = nltk.data.load('tokenizers/punkt/english.pickle')
    return punkt_english.tokenize(comment)


def duplicate_row(df, column_name):
    """Repeat each comment once per sentence it contains.

    Parameters :
        df : DataFrame containing the comments; an 'index' column holding
             the row position is (re)written on it
        column_name : name of the column holding the comments

    Out :
        DataFrame with columns "Index" (row position of the original
        comment), ``column_name`` (the full comment) and "Sentence" (one
        sentence of that comment). A comment that cannot be split
        contributes one row with empty strings.
    """
    try:
        df['index'] = list(range(len(df)))
    except Exception:
        print("Index already exist")

    listSentence = []
    listeReview = []
    listeIndex = []

    for i in tqdm_notebook(range(len(df))):
        review = df[column_name].iloc[i]
        try:
            comment_sentences = detect_sentences(review)
        except Exception:
            # Keep a placeholder row so the comment stays visible in the
            # output even though it could not be tokenized.
            listSentence.append("")
            listeReview.append("")
            listeIndex.append(i)
            continue
        for sentence in comment_sentences:
            listSentence.append(sentence)
            listeReview.append(review)
            listeIndex.append(i)

    data = pd.DataFrame()
    data["Index"] = listeIndex
    data[column_name] = listeReview
    data["Sentence"] = listSentence

    # The original returned the undefined name `Data` (NameError).
    return data


def comments_into_sentences(name, column_name = 'REVIEW'):
    """Cut the comments of a spreadsheet into sentences and save them as csv.

    Parameters :
        name : base name of the xlsx spreadsheet (looked up in ../data)
        column_name (DEFAULT = REVIEW) : name of the column holding the comments

    Out :
        None; writes ../data/g4_<name>_sentences_v1.csv

    Raises :
        FileNotFoundError : if the spreadsheet does not exist
    """
    try:
        data = pd.read_excel("../data/" + name + ".xlsx")
    except FileNotFoundError:
        # Previously any error was printed as "not found" and execution
        # continued into an UnboundLocalError on `data`.
        print("file " + name + " not found")
        raise

    data_cut = duplicate_row(data, column_name)
    data_cut.to_csv("../data/g4_" + name + "_sentences_v1.csv")
    print("g4_" + name + "_sentences_v1.csv saved")

In [7]:
#Weighting of the word embedding by tfidf

def import_TFIDF(name):
    """Load a previously saved tf-idf matrix.

    Parameters :
    name (String) : name used when the matrix was saved; the file read is
                    ../data/g4_tfidf_<name>.csv

    Out :
    data : DataFrame of the tf-idf

    Raises :
    FileNotFoundError : if that file does not exist
    """
    try:
        data = pd.read_csv("../data/g4_tfidf_" + name + ".csv")  # import data of the tf-idf
    except FileNotFoundError:
        # Previously the message was printed and `return data` then failed
        # with an UnboundLocalError.
        print("No tf-idf found in directory 'data' for the name : " + name)
        raise

    return data


def calcul_coordinates(words):
    """Look up the embedding vector of each word in the global `dico`.

    Parameters :
    words : sequence of words to look up

    Out :
    (final_list, useless_words) :
        final_list : DataFrame whose first column holds the words found in
                     `dico` and whose remaining columns hold their vectors
        useless_words : list of the words that have no entry in `dico`
    """
    list_coordinates = []
    list_words = []
    useless_words = []

    for word in words:
        # Only a missing key means "no vector". A bare `except:` here used to
        # hide every other problem as well (for instance `dico` not being
        # loaded, which silently marked every word as useless).
        try:
            vector = dico[word]
        except KeyError:
            useless_words.append(word)
            continue
        list_coordinates.append(vector)
        list_words.append(word)

    final_list = pd.concat(
        [pd.DataFrame(list_words), pd.DataFrame(list_coordinates)],
        axis=1,
        ignore_index=True,
    )

    return (final_list, useless_words)


def weight_embedding (name):
    """Weight the word embeddings of each comment by its tf-idf values.

    For every row of the saved tf-idf matrix, computes the tf-idf-weighted
    average of the word vectors and saves the result as csv.

    Parameters:
        name : name used when the tf-idf matrix was saved

    Out :
        DataFrame with an "index" column (the tf-idf file's first column)
        followed by the weighted vector components; also written to
        ../data/g4_weight_embedding_<name>.csv
    """
    tf_idf = import_TFIDF(name)

    # The tf-idf csv has a single non-word column: the unnamed index column,
    # which read_csv labels "Unnamed: 0". The former `words[2:]` slice skipped
    # the first real word too, leaving it out of the weighted sum while it
    # still counted in the normalising denominator.
    index_column = "Unnamed: 0"
    words = [column for column in tf_idf.columns.tolist() if column != index_column]

    coordinates, useless_words = calcul_coordinates(words)

    tf_idf = tf_idf.drop(useless_words, axis=1)  # words without a vector cannot be weighted
    index = tf_idf[index_column]
    tf_idf = tf_idf.drop([index_column], axis=1)  # keep word columns only
    coordinates = coordinates.set_index([0])  # words become the row labels

    weighted_matrix = []
    for i in range(len(tf_idf)):
        weights = tf_idf.loc[i, :]
        # Rows of `coordinates` are aligned with `weights` on the word labels.
        weighted_matrix.append(coordinates.mul(weights, axis = 0).sum(axis = 0) / weights.sum())
    embedingPonderate = pd.DataFrame(weighted_matrix)
    embedingPonderate.insert(0, "index", index)

    embedingPonderate.to_csv("../data/g4_weight_embedding_" + name + '.csv', sep =',')

    return embedingPonderate


In [6]:
def database_embedding(file):
    """Pair each row of a saved tf-idf matrix with its sentence.

    Parameters :
    file (String) : name of the data set; reads ../data/g4_tfidf_<file>.csv
                    and the "Sentence" column of ../data/<file>.xlsx
                    (or ../data/<file>.csv when the xlsx cannot be read)

    Out :
    comments (DataFrame) : columns "index" (index of the tf-idf file) and
                           "commentaire" (matching sentence)
    """
    data = pd.read_csv("../data/g4_tfidf_" + file + ".csv", index_col=0)  # import data
    try:
        initial_file = pd.read_excel("../data/" + file + ".xlsx")
    except Exception:
        # Fall back to csv; `except Exception` rather than a bare `except:`
        # so that KeyboardInterrupt / SystemExit are not swallowed.
        initial_file = pd.read_csv("../data/" + file + ".csv")

    data['index'] = data.index
    # Look the sentences up by the labels stored in the tf-idf index.
    data['commentaire'] = initial_file["Sentence"].loc[data.index]

    comments = data.loc[:, ["index", "commentaire"]]
    return comments

def coordinates(comments):
    """Compute the mean word vector of every comment.

    Parameters :
    comments (DataFrame) : columns "index" (comment id) and "commentaire"
                           (cleaned comment text)

    Out :
    Array with one row per processed comment and 201 columns: the comment id
    followed by the mean of the `dico` vectors of its words. Comments whose
    text is not a string are skipped; a comment with no known word gets NaN
    components.
    """
    dimension = 200  # size of the vectors stored in `dico`
    rows = []
    total = str(len(comments))

    for i, text in zip(comments['index'], comments['commentaire']):
        if isinstance(text, str):
            vectors = [dico[word] for word in text.split() if word in dico]
            if vectors:
                mean_vector = np.sum(vectors, axis=0) / len(vectors)
            else:
                # Same value the former 0/0 division produced, without the warning.
                mean_vector = np.full(dimension, np.nan)
            rows.append(np.concatenate(([i], mean_vector)))
        print(str(i) + '/' + total, end="\r")

    # Rows are collected in a list and stacked once. The previous version
    # started from an all-zero row that was only replaced when the comment
    # with id 0 was processed, which left a phantom row otherwise.
    if not rows:
        return np.empty((0, dimension + 1))
    return np.array(rows, dtype=float)
            
def matrix(coordinates, comments):
    """Join each comment with its embedding coordinates.

    Parameters :
    coordinates (Array) : one row per comment, 201 columns: the comment id
                          followed by 200 vector components
    comments (DataFrame) : columns "index" (comment id) and "commentaire"

    Out :
    embedding (DataFrame) : inner join of the comments with their
                            coordinates; the two overlapping "index" columns
                            are suffixed "_caller" (comments) and "_other"
                            (coordinates)
    """
    liste = ['index'] + list(range(1, 201))  # column labels: id, then 1..200

    coordinates = pd.DataFrame(coordinates, columns = liste)
    # DataFrame.join(on=...) matches the caller's column against the *index*
    # of the other frame. That index used to be the plain row position, so
    # comments were paired with the wrong vectors as soon as the ids were not
    # 0..n-1. Key the vectors by their comment id instead.
    coordinates.index = coordinates['index'].astype(int).to_numpy()
    embedding = comments.join(coordinates, on=['index'], how='inner', lsuffix='_caller', rsuffix='_other')

    return embedding

def embedding(file):
    """Build and save the embedding matrix of a data set.

    Parameters :
    file (String) : name of the data set, forwarded to database_embedding()

    Out :
    None; writes ../data/g4_embedding_<file>.csv, indexed by 'index'
    """
    comments = database_embedding(file)
    vectors = coordinates(comments)
    joined = matrix(vectors, comments)
    # Remove the two suffixed copies of the join key.
    trimmed = joined.drop(columns=['index_caller', 'index_other'])
    output_path = "../data/g4_embedding_" + file + ".csv"
    pd.DataFrame(trimmed).set_index('index').to_csv(output_path, sep = ",")
    print('Matrice embedding de ' + file + ' enregistrée')

In [None]:
def _group_facets(data):
    """Collapse the fine-grained LABEL_* columns into nine 0/1 facet flags."""
    facet_sources = {
        'atmosphere': ['LABEL_TEMPERATURE', 'LABEL_HUMIDITY', 'LABEL_NOISE'],
        'baggage': ['LABEL_LOST BAGGAGE'],
        'cabin_crew': ['LABEL_CABIN CREW'],
        'comfort': ['LABEL_SEAT', 'LABEL_BED', 'LABEL_IFE', 'LABEL_LAVATORY SPACE'],
        'food': ['LABEL_FOOD'],
        'not_flight': ['LABEL_CHECK IN', 'LABEL_BOARDING', 'LABEL_SAV'],
        'price': ['LABEL_PRICE'],
        'punctuality': ['LABEL_PUNCTUALITY'],
    }
    label_columns = [label for labels in facet_sources.values() for label in labels]
    labels = data[label_columns]

    grouped = pd.DataFrame(index=labels.index)
    for facet, sources in facet_sources.items():
        # A facet is present when any of its source labels is strictly positive.
        grouped[facet] = (labels[sources] > 0).any(axis=1).astype(int)
    # 'empty' flags the rows whose labels sum to zero.
    grouped['empty'] = (labels.sum(axis=1) == 0).astype(int)

    return grouped[['atmosphere', 'baggage', 'cabin_crew', 'comfort', 'empty',
                    'food', 'not_flight', 'price', 'punctuality']]


def metric_purity(file):
    """
    Build the grouped-facet matrix used to evaluate our methods.

    Parameters :
    file (String) : base name of the labelled xlsx file (looked up in ../data)

    Out :
    None; saves ../data/<file>_new_facets.csv containing, for each sentence,
    a 0/1 flag per grouped facet
    """
    # `sep` is a csv option and not a read_excel parameter; current pandas
    # rejects it with a TypeError, so it is no longer passed.
    data = pd.read_excel('../data/' + file + '.xlsx')

    # Computed column-wise; the earlier row loop re-computed
    # data.sum(axis=1) for every single row.
    data_facets_group = _group_facets(data)

    data_facets_group.to_csv("../data/" + file + "_new_facets.csv", sep=',')

# <center> END </center>