In [1]:
# Import packages
import operator
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import re
import polyglot
from polyglot.downloader import downloader
from polyglot.text import Text
import wikipedia
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from googletrans import Translator
translator = Translator()
from polyglot.detect import Detector
from datetime import datetime, date
import requests
from bs4 import BeautifulSoup

In [85]:
class Vectors():
    """Loader for word-embedding text files, one table per language.

    ``directory`` is, despite its name, an iterable of embedding file names.
    ``paths`` loads every entry that exactly matches a name in
    ``LANGUAGE_FILES`` and ignores the rest.
    """

    # Recognised embedding file name -> language code used as key by ``paths``.
    LANGUAGE_FILES = {
        'wiki.multi.en.vec.txt': 'EN',
        'wiki.multi.es.vec.txt': 'ES',
        'wiki.multi.it.vec.txt': 'IT',
        'wiki.multi.et.vec.txt': 'ET',
        'wiki.multi.ru.vec.txt': 'RU',
        'wiki.multi.de.vec.txt': 'DE',
    }

    def __init__(self, directory):
        self.directory = directory

    def make_vectors(self, directory):
        """Read one embedding file into a ``{word: numpy vector}`` dictionary.

        The first line of the file is skipped. Every following line is split
        on its first space into the word and the space-separated vector
        components. Lines that do not contain both parts (for example blank
        lines) are skipped.

        Args:
            directory: path of the embedding file to read.

        Returns:
            dict mapping each word to a 1-D ``numpy`` array.
        """
        table = dict()
        with open(directory, 'r', encoding='utf-8', errors='ignore') as f:
            # Skip the first line; the default keeps an empty file from
            # raising StopIteration.
            next(f, None)
            for line in f:
                parts = line.rstrip().split(" ", 1)
                if len(parts) != 2:
                    # No vector part on this line; nothing to store.
                    continue
                word, vect = parts
                table[word] = np.fromstring(vect, sep=' ')
        return table

    def paths(self):
        """Load every recognised file listed in ``self.directory``.

        Returns:
            dict mapping a language code ('EN', 'ES', 'IT', 'ET', 'RU', 'DE')
            to the table produced by ``make_vectors`` for the matching file.
            Unrecognised entries are ignored without being opened.
        """
        dict_tables = {}
        for v in self.directory:
            language = self.LANGUAGE_FILES.get(v)
            if language is not None:
                dict_tables[language] = self.make_vectors(v)
        return dict_tables

In [86]:
class Keywords():
    """Builds the per-keyword dictionaries a bot needs for scoring."""

    def __init__(self, keywords):
        self.keywords = keywords

    def make_keywords(self, table_lang):
        """Return ``{keyword: table_lang[keyword]}`` for every stored keyword.

        Raises ``KeyError`` when a keyword is missing from ``table_lang``.
        """
        return {word: table_lang[word] for word in self.keywords}

    def make_empty_dictionaries(self):
        """Return ``{keyword: array([[0]])}`` with a separate array per keyword."""
        return {word: np.asarray([[0]]) for word in self.keywords}

    def empty_dictionaries(self, dict_table):
        """Return one fresh zero-similarity dictionary per key of ``dict_table``."""
        return {
            language: self.make_empty_dictionaries()
            for language, _table in dict_table.items()
        }

In [126]:
class MakeBot():
    """Cosine-similarity scoring between an input token and a bot's keyword vectors.

    Vectors are reshaped to ``(1, VECTOR_DIM)`` before being compared, and
    similarities are kept in the array form returned by ``cosine_similarity``.
    """

    # Size every embedding is reshaped to before scoring.
    VECTOR_DIM = 300
    # Languages scored after the mandatory 'EN'; a language that is not
    # available is skipped.
    OPTIONAL_LANGUAGES = ('ES', 'IT', 'RU', 'ET', 'DE')

    def highest_similarity_for_keyword(self, empty_dictionaries, dict_multi_vectors, key, value):
        """Keep the highest similarity seen so far for ``key`` in one language.

        Args:
            empty_dictionaries: ``{keyword: best similarity}`` for a single
                language; updated in place.
            dict_multi_vectors: the token's vector in that language.
            key: keyword being scored.
            value: keyword vector, already shaped ``(1, VECTOR_DIM)``.
        """
        last_status = empty_dictionaries[key]
        similarity = cosine_similarity(dict_multi_vectors.reshape(1, self.VECTOR_DIM), value)
        if similarity > last_status:
            empty_dictionaries[key] = similarity

    def highest_sim_over_language(self, empty_dictionaries, dict_multi_vectors, key, value, multibot_dict):
        """Score ``key`` in every language and rank the best keyword of each language.

        Args:
            empty_dictionaries: ``{language: {keyword: best similarity}}``;
                updated in place. Must contain 'EN'.
            dict_multi_vectors: ``{language: token vector}``. Must contain 'EN'.
            key: keyword being scored.
            value: keyword vector, already shaped ``(1, VECTOR_DIM)``.
            multibot_dict: ``{(keyword, language): similarity}``; receives the
                best keyword of every language, updated in place.

        Returns:
            The items of ``multibot_dict`` sorted by similarity, highest first.
        """
        # English is mandatory: a missing 'EN' entry raises KeyError.
        self.highest_similarity_for_keyword(empty_dictionaries['EN'], dict_multi_vectors['EN'], key, value)

        for lang in self.OPTIONAL_LANGUAGES:
            try:
                self.highest_similarity_for_keyword(
                    empty_dictionaries[lang], dict_multi_vectors[lang], key, value)
            except (KeyError, ValueError):
                # Language not available (KeyError) or a vector that cannot be
                # reshaped/compared (ValueError): leave this language untouched.
                continue

        # For each language keep only its best-scoring keyword.
        for lang, similarities in empty_dictionaries.items():
            ranked = sorted(similarities.items(), key=operator.itemgetter(1), reverse=True)
            best_keyword, best_similarity = ranked[0]
            multibot_dict[best_keyword, lang] = best_similarity

        return sorted(multibot_dict.items(), key=operator.itemgetter(1), reverse=True)

    def compute_highest_token(self, bot_dictionary, empty_dictionaries, cutoff,  dict_multi_vectors,
                              highest_token_dict, multibot_dict):
        """Return the language whose keyword is most similar to the token.

        Args:
            bot_dictionary: ``{keyword: vector}`` for the bot's keywords.
            empty_dictionaries: ``{language: {keyword: best similarity}}``;
                updated in place.
            cutoff: the best keyword is recorded only when its similarity
                is above this value.
            dict_multi_vectors: ``{language: token vector}``.
            highest_token_dict: receives ``{keyword: similarity}`` for the best
                keyword when it passes ``cutoff``; updated in place.
            multibot_dict: ``{(keyword, language): similarity}``; updated in place.

        Returns:
            The language code of the top-ranked ``(keyword, language)`` pair.

        Raises:
            ValueError: if ``bot_dictionary`` is empty, since there is nothing
                to rank.
        """
        if not bot_dictionary:
            raise ValueError('bot_dictionary must contain at least one keyword vector')

        for key, value in bot_dictionary.items():
            value = value.reshape(1, self.VECTOR_DIM)
            # The dictionaries accumulate across keywords, so the ranking
            # returned by the last call covers all of them.
            sorted_simil = self.highest_sim_over_language(
                empty_dictionaries, dict_multi_vectors, key, value, multibot_dict)

        (best_keyword, language), best_similarity = sorted_simil[0]
        score = best_similarity[0][0]
        if score > cutoff:
            highest_token_dict[best_keyword] = score

        return language

    def compute_confidence(self, language, bot_dictionary,  dict_multi_vectors, multibot_dict, confidence, cutoff, boost):
        """Append ``similarity + boost`` to ``confidence`` for every keyword above ``cutoff``.

        The similarity is measured between the token vector of ``language``
        and each keyword vector. ``multibot_dict`` is accepted but not used.
        """
        token_vector = dict_multi_vectors[language].reshape(1, self.VECTOR_DIM)
        for key, value in bot_dictionary.items():
            similarity = cosine_similarity(token_vector, value.reshape(1, self.VECTOR_DIM))
            if similarity[0][0] > cutoff:
                confidence.append(similarity[0][0] + boost)

    def token2multi_vectors(self, token, all_vectors):
        """Look ``token`` up in every language table.

        Args:
            token: the word to look up.
            all_vectors: ``{language: {word: vector}}``.

        Returns:
            ``{language: vector shaped (1, VECTOR_DIM)}``; a language that does
            not know the token (or holds a vector of another size) gets an
            all-zero vector.
        """
        dict_vect = dict()
        for lang, table in all_vectors.items():
            try:
                dict_vect[lang] = table[token].reshape(1, self.VECTOR_DIM)
            except (KeyError, ValueError):
                # A fresh zero vector per language avoids sharing one array.
                dict_vect[lang] = np.zeros((1, self.VECTOR_DIM))
        return dict_vect

In [127]:
class PolyBot():
    """A keyword-driven bot scored against multilingual word vectors.

    Args:
        keywords: words that trigger this bot.
        dict_CLWEs: ``{language: {word: vector}}`` embedding tables.
        kw_lang: language code of ``keywords`` (key of ``dict_CLWEs``).
        answer: stored as-is; not called by this class.
        cutoff: similarity a keyword must exceed to count towards confidence.
        boost: amount added to every similarity that passes ``cutoff``.
        all_lang: stored as-is; not read by this class.
        bigrams: optional list of ``(w1, w2)`` word pairs to detect.
        bigram_cutoff: combined similarity a bigram must exceed in a language
            other than ``kw_lang``.
        bigram_boost: confidence added when a bigram is detected.
    """

    # Combined similarity a bigram must exceed when matched in ``kw_lang``.
    # NOTE(review): equals 2 * the default bigram_cutoff, but unlike the other
    # languages it is not tied to ``bigram_cutoff`` - confirm this is intended.
    SOURCE_BIGRAM_CUTOFF = 1.6

    def __init__(self, keywords, dict_CLWEs, kw_lang, answer, cutoff=0.43, boost=0.5, all_lang=None, bigrams=None, 
                 bigram_cutoff = 0.8, bigram_boost = 1):
        self.keywords = keywords
        self.dict_CLWEs = dict_CLWEs
        self.kw_lang = kw_lang
        self.answer = answer
        self.bigrams = bigrams
        self.cutoff = cutoff
        self.boost = boost
        self.all_lang = all_lang
        self.bigram_cutoff = bigram_cutoff
        self.bigram_boost = bigram_boost
        self.k = Keywords(self.keywords)
        self.m = MakeBot()

    def prepare_bot(self):
        """Return fresh state: ``(empty_dictionaries, confidence, multibot_dict, highest_token)``."""
        empty_dictionaries = self.k.empty_dictionaries(self.dict_CLWEs)
        confidence = []
        multibot_dict = dict()
        highest_token = dict()
        return  empty_dictionaries, confidence, multibot_dict, highest_token

    def _keyword_vectors(self):
        """Return ``{keyword: vector}`` looked up in the keyword language table."""
        return self.k.make_keywords(self.dict_CLWEs[self.kw_lang])

    def language_identifier(self, empty_dictionaries, dict_multi_vectors, highest_token_dict, multibot_dict):
        """Return the language in which the token best matches one of the keywords."""
        return self.m.compute_highest_token(self._keyword_vectors(), empty_dictionaries, self.cutoff,
                                            dict_multi_vectors, highest_token_dict, multibot_dict)

    def get_confidence(self, dict_multivector, confidence, multibot_dict, language):
        """Append the boosted similarities of the matching keywords to ``confidence``."""
        self.m.compute_confidence(language, self._keyword_vectors(), dict_multivector, multibot_dict,
                                  confidence, self.cutoff, self.boost)

    def _split_by_language(self, vectors):
        """Split ``{language: vector}`` into (keyword-language vectors, other vectors)."""
        source, others = [], []
        for lang, vector in vectors.items():
            if lang == self.kw_lang:
                source.append(vector)
            else:
                others.append(vector)
        return source, others

    def else_language(self, dict_multi_vectors, dict_next_multivector):
        """Separate the current and next token vectors by language.

        Returns:
            ``(source_lang, else_lang, next_source_lang, next_else_lang)``: the
            vectors of the current token in ``kw_lang`` and in every other
            language, followed by the same two lists for the next token.
        """
        source_lang, else_lang = self._split_by_language(dict_multi_vectors)
        next_source_lang, next_else_lang = self._split_by_language(dict_next_multivector)
        return source_lang, else_lang, next_source_lang, next_else_lang

    def score_bigrams(self, w1,w2, source_lang, else_lang, next_source_lang, next_else_lang, confidence):
        """Detect the bigram ``(w1, w2)`` in two consecutive tokens.

        The similarity of ``w1`` to the current token and of ``w2`` to the next
        token are added up, language by language. On the first language whose
        sum passes its threshold, ``bigram_boost`` is appended to
        ``confidence``.

        Returns:
            ``True`` when the bigram was detected, ``False`` otherwise.
        """
        first = self.dict_CLWEs[self.kw_lang][str(w1)].reshape(1,300)
        second = self.dict_CLWEs[self.kw_lang][str(w2)].reshape(1,300)

        for current, following in zip(source_lang, next_source_lang):
            if cosine_similarity(first, current) + cosine_similarity(second, following) > self.SOURCE_BIGRAM_CUTOFF:
                confidence.append(self.bigram_boost)
                return True

        for current, following in zip(else_lang, next_else_lang):
            if cosine_similarity(first, current) + cosine_similarity(second, following) > self.bigram_cutoff:
                # The boost goes to the caller's list; the instance has no
                # ``confidence`` attribute.
                confidence.append(self.bigram_boost)
                return True

        return False

    def compute_bigrams(self, w1, w2, dict_multivector, dict_next_multivector, confidence):
        """Split both tokens' vectors by language and score the bigram ``(w1, w2)``."""
        source_lang, else_lang, next_source_lang, next_else_lang = self.else_language(
            dict_multivector, dict_next_multivector)
        return self.score_bigrams(w1, w2, source_lang, else_lang, next_source_lang, next_else_lang, confidence)

In [128]:
class Conversation():
    """Console helpers: read the user's input and extract entities, cities, time hints and language."""

    # Location of the ChromeDriver binary used by ``selenium``.
    # NOTE(review): machine-specific absolute path; override this attribute
    # when running elsewhere.
    CHROMEDRIVER_PATH = r'C:/Users/Jason/Downloads/chromedriver_win32/chromedriver.exe'

    def __init__(self):
        self.m = MakeBot()

    def selenium (self, url):
        """Open ``url`` in headless Chrome, follow its 'English' link and return a name from the page title.

        Capitalised words are extracted from the title of the page reached,
        the last one is dropped, and the first two remaining words (or the
        only one) are returned.

        Raises:
            IndexError: when no word is left after dropping the last one.
            Any exception raised by the browser is propagated; the driver is
            always shut down first.
        """
        chrome_options = Options() 
        chrome_options.add_argument("--headless") 
        driver = webdriver.Chrome(self.CHROMEDRIVER_PATH, options=chrome_options)
        try:
            driver.get(url)
            # NOTE(review): find_element_by_link_text was removed in recent
            # Selenium 4 releases - confirm the installed version.
            driver.find_element_by_link_text('English').click()
            name = driver.title
        finally:
            # Always release the browser process, even when navigation fails.
            driver.quit()

        LANGS = [
        'ÁáČčĎďÉéĚěÍíŇňÓóŘřŠšŤťÚúŮůÝýŽž',   # Czech
        'ÄäÖöÜüẞß',                         # German
        'ĄąĆćĘęŁłŃńÓóŚśŹźŻż',               # Polish
        'áéóíñü',                           # Spanish
        'àèòìù'                             #Italian
        ]
        pattern = re.compile(r'[A-Z][a-z{langs}]+'.format(langs=''.join(LANGS)))
        words = pattern.findall(name)[0:-1]
        if len(words) >= 2:
            return words[0] + " " + words[1]
        # Deliberately raises IndexError on an empty list: prep_city relies on
        # the failure to fall back to the translator.
        return words[0]

    def prep_city(self, from_phrase, parola, lang, all_vectors):
        """Ask the user for a city and return its name.

        Args:
            from_phrase: question printed to the user.
            parola: English word (key of ``all_vectors['EN']``) each input
                token is compared to - presumably a preposition such as
                "from"; verify against the caller.
            lang: ``'en'`` uses a 0.8 similarity threshold and returns the name
                as typed; any other value uses 0.3 and resolves the name
                through Wikipedia, falling back to the translator.
            all_vectors: ``{language: {word: vector}}``.

        Returns:
            The city name. When a token similar to ``parola`` was found, the
            first token is dropped and the next one or two are capitalised.
        """
        self.table_lang = all_vectors['EN']
        prep = 0
        print('BOT:', from_phrase) 
        city = input('YOU: ')
        tok = city.lower().split()
        threshold = 0.8 if lang == 'en' else 0.3
        for word in tok:
            for vector in self.m.token2multi_vectors(word, all_vectors).values():
                if cosine_similarity(vector, self.table_lang[parola].reshape(1,300)) > threshold:
                    prep += 1

        if prep > 0 and len(tok) >= 3:
            CITY = tok[1].capitalize() + " " + tok[2].capitalize()
        elif prep > 0 and len(tok) == 2:
            CITY = tok[1].capitalize()
        else:
            CITY = city.capitalize()

        if lang != 'en':
            try:
                website = 'https://{}.wikipedia.org/wiki/{}'.format(lang, CITY)
                CITY = self.selenium(website)
            except Exception:
                # Best effort: any browser or parsing failure falls back to
                # the translation service.
                CITY = translator.translate(CITY, dest=lang).text

        return CITY 

    def when_def(self,when_dict, conf_value, vect_dict):
        """Sum the similarities above ``conf_value`` between the token and the reference words.

        Args:
            when_dict: ``{word: vector}`` reference vectors.
            conf_value: similarity a pair must exceed to be counted.
            vect_dict: ``{language: token vector}``.

        Returns:
            ``0`` when no pair passes, otherwise the sum of the
            ``cosine_similarity`` results that did.
        """
        sim_today = []
        for key, value in when_dict.items():
            value = value.reshape(1,300)
            for vector in vect_dict.values():
                sim = cosine_similarity(vector, value)
                if sim > conf_value:
                    sim_today.append(sim)
        return sum(sim_today)

    def find_TodayTomorrow(self, vect_dict, all_vectors):
        """Tell whether the token refers to today/now or to tomorrow.

        Returns:
            ``(today, tomorrow)`` booleans; at most one of them is ``True``.
        """
        today_key = {'today':all_vectors['EN']['today'], 'now':all_vectors['EN']['now']}
        today_score = self.when_def(today_key, 0.5, vect_dict)
        tomorrow_key = {'tomorrow':all_vectors['EN']['tomorrow']}
        tomorrow_score = self.when_def(tomorrow_key, 0.45, vect_dict)
        today = bool(today_score > 0 and today_score > tomorrow_score)
        tomorrow = bool(tomorrow_score > 0 and tomorrow_score > today_score)
        return today, tomorrow

    def NER(self, human):
        """Collect the named entities of ``human`` by tag.

        Returns:
            ``(PER, LOC, ORG)`` lists holding the entities tagged 'I-PER',
            'I-LOC' and 'I-ORG'. If entity extraction fails, the entities
            gathered so far are returned.
        """
        PER = []
        LOC = []
        ORG = []
        by_tag = {'I-LOC': LOC, 'I-PER': PER, 'I-ORG': ORG}
        try:
            for sent in Text(human).sentences:
                for e in sent.entities:
                    bucket = by_tag.get(e.tag)
                    if bucket is not None:
                        bucket.append(e)
        except Exception:
            # Best effort: entity extraction is optional for the conversation.
            pass

        return PER, LOC, ORG

    @staticmethod
    def _entity_name(entities, index):
        """Return the first word, or the first two words joined, of ``entities[index]``; ``None`` if absent."""
        name = None
        try:
            words = entities[index]
            if words[0] is not None:
                name = words[0]
            if words[1] is not None:
                name = words[0] + " " + words[1]
        except (IndexError, KeyError, TypeError):
            # Missing entity or one-word entity: keep what was found so far.
            pass
        return name

    def find_city(self, LOC):
        """Return the names of the first two location entities, ``str.capitalize``-d, or ``None`` for each missing one."""
        CITY = self._entity_name(LOC, 0)
        if CITY is not None:
            CITY = CITY.capitalize()

        CITY2 = self._entity_name(LOC, 1)
        if CITY2 is not None:
            CITY2 = CITY2.capitalize()

        return CITY,CITY2

    def find_people(self, PER):
        """Return the name of the first person entity (one or two words), or ``None``."""
        return self._entity_name(PER, 0)

    def detector_language(self, detector, lang):
        """Resolve an Italian/Spanish conflict in favour of the detector.

        ``lang`` is returned unchanged unless the detector says 'Spanish' while
        ``lang`` is 'IT', or 'Italian' while ``lang`` is 'ES'.
        """
        if detector == 'Spanish' and lang == 'IT':
            lang = 'ES'
        elif detector == 'Italian' and lang == 'ES':
            lang = 'IT'
        return lang

    def check_bigram_city(self, CITY):
        """Capitalise each of the first two words of ``CITY``; ``None`` is returned unchanged.

        Words after the second one are dropped.
        """
        if CITY is None:
            return CITY
        words = CITY.split()
        if len(words) >= 2:
            return words[0].capitalize() + " " + words[1].capitalize()
        return "".join(words).capitalize()

    def talk(self):
        """Read one line from the user and prepare it for scoring.

        Returns:
            ``None`` when the user types 'bye'; otherwise ``(hum, NER_dict)``
            where ``hum`` is the list of whitespace-separated tokens stripped
            of non-word characters and terminated by 'EOS', and ``NER_dict``
            holds the entities ('people', 'cities', 'organizations'), the
            'leaving'/'arriving' placeholders (both ``None``) and the
            'detector' language name (``None`` when detection fails).
        """
        human = input('YOU: ')
        if human == 'bye':
            return None

        PER,LOC, ORG = self.NER(human)

        detector = None
        try:
            detector = Detector(human).language.name
        except Exception:
            # Best effort: the language name stays None when detection fails.
            pass

        hum = [re.sub(r'[^\w]', '', tok) for tok in human.split()]
        hum.append('EOS')

        NER_dict = dict()
        NER_dict['people'] = PER
        NER_dict['cities'] = LOC
        NER_dict['organizations'] = ORG
        NER_dict['leaving'] = None
        NER_dict['arriving'] = None
        NER_dict['detector'] = detector

        return hum, NER_dict

In [9]:
# Only the English and Italian embeddings are loaded here. The full set also
# includes 'wiki.multi.es.vec.txt', 'wiki.multi.de.vec.txt',
# 'wiki.multi.et.vec.txt' and 'wiki.multi.ru.vec.txt'.
CLWEs = Vectors(
    directory=[
        'wiki.multi.en.vec.txt',
        'wiki.multi.it.vec.txt',
    ]
)
all_vectors = CLWEs.paths()

In [129]:
# Shared helper instances; conv() reads both of them as module-level globals.
c = Conversation()
m = MakeBot()

In [130]:
def answer_weather(NER_dict):
    """Placeholder answer callback: print a greeting and return ``None``.

    ``NER_dict`` is accepted but not used.
    """
    print('HELLO!')
    return None

In [131]:
# Two demo bots sharing the same embeddings and answer callback; only the
# weather bot registers bigram patterns.
weatherbot = PolyBot(
    keywords=['sun', 'rain'],
    dict_CLWEs=all_vectors,
    kw_lang='EN',
    answer=answer_weather,
    bigrams=[('what', 'time'), ('what', 'is')],
)
travelbot = PolyBot(
    keywords=['travel', 'car'],
    dict_CLWEs=all_vectors,
    kw_lang='EN',
    answer=answer_weather,
)
bots = [weatherbot, travelbot]

In [132]:
# Inspect the bigram patterns registered on the weather bot.
weatherbot.bigrams

[('what', 'time'), ('what', 'is')]

In [135]:
def conv (bots):
    """Run the interactive loop: read a sentence, score it with every bot, print the ranking.

    For every user turn each bot gets fresh state from ``prepare_bot``, every
    token (paired with the token that follows it) is scored by every bot, and
    the summed confidences are printed sorted from highest to lowest. The loop
    ends when ``c.talk()`` returns ``None`` (the user typed 'bye') or the input
    stream is closed or interrupted.

    Relies on the module-level ``c`` (Conversation), ``m`` (MakeBot) and
    ``all_vectors``.

    Args:
        bots: list of bots exposing ``prepare_bot``, ``language_identifier``,
            ``get_confidence``, ``compute_bigrams`` and ``bigrams``.
    """
    while True:
        # Fresh per-turn state, one record per bot (same order as ``bots``).
        states = []
        for bot in bots:
            empty_dictionaries, confidence, multi_dict, highest_token = bot.prepare_bot()
            states.append({
                'confidence': confidence,
                'empty_dictionaries': empty_dictionaries,
                'multi_dict': multi_dict,
                'highest_token': highest_token,
                'bigram_hits': dict(),
            })

        # Only an ended session stops the loop; genuine errors are no longer
        # disguised as a goodbye.
        try:
            turn = c.talk()
        except (EOFError, KeyboardInterrupt):
            turn = None
        if turn is None:
            print('BOT: Have a nice day')
            break
        hum, NER_dict = turn

        # The last element of ``hum`` is the 'EOS' marker, so it is only ever
        # used as the "next" token.
        for position in range(len(hum) - 1):
            # normalize the input
            token = hum[position].lower()
            next_token = hum[position + 1].lower()

            # Assign the multilingual vectors to the input words
            vect_dict = m.token2multi_vectors(token, all_vectors)
            next_vect_dict = m.token2multi_vectors(next_token, all_vectors)
            # Computed for every token but not consumed yet.
            today, tomorrow = c.find_TodayTomorrow(vect_dict, all_vectors)

            for bot, state in zip(bots, states):
                language_bot = bot.language_identifier(
                    state['empty_dictionaries'], vect_dict, state['highest_token'], state['multi_dict'])
                bot.get_confidence(vect_dict, state['confidence'], state['multi_dict'], language_bot)
                state['language_bot'] = language_bot

                if bot.bigrams is not None:
                    for bigram in bot.bigrams:
                        w1 = bigram[0]
                        w2 = bigram[1]
                        state['bigram_hits'][w1 + '_' + w2] = bot.compute_bigrams(
                            w1, w2, vect_dict, next_vect_dict, state['confidence'])

        all_confs = dict()
        for index, state in enumerate(states):
            all_confs['conf' + str(index)] = sum(state['confidence'])

        sorted_confs = sorted(all_confs.items(), key=operator.itemgetter(1), reverse=True)

        print(sorted_confs, 'sorted CONFS')    
            
  

In [136]:
# Start the interactive session; typing 'bye' ends it.
conv(bots)

YOU: sun


Detector is not able to detect the language reliably.


[('conf0', 1.5), ('conf1', 0)] sorted CONFS
YOU: sun sun sun travel


Detector is not able to detect the language reliably.


[('conf0', 4.5), ('conf1', 1.5000000000000002)] sorted CONFS
YOU: bye
BOT: Have a nice day
