# 0 Data

In [11]:
def _loadTrainingColumn(name):
    """Return the first column of ``../training_data/<name>.csv`` as a 1-D array.

    The file is read with ``header=None``, so its first line is treated as
    data rather than as column names.
    """
    frame = pd.read_csv('../training_data/{}.csv'.format(name), header=None)
    # .values.T[0] selects the first column of the underlying 2-D array.
    return frame.values.T[0]


def topicData():
    """Load the first column of ``../training_data/topic.csv``."""
    return _loadTrainingColumn('topic')


def eventData():
    """Load the first column of ``../training_data/event.csv``."""
    return _loadTrainingColumn('event')


def callData():
    """Load the first column of ``../training_data/call.csv``."""
    return _loadTrainingColumn('call')


def waterData():
    """Load the first column of ``../training_data/water.csv``."""
    return _loadTrainingColumn('water')


def visitorData():
    """Load the first column of ``../training_data/visitor.csv``."""
    return _loadTrainingColumn('visitor')


def packageData():
    """Load the first column of ``../training_data/package.csv``."""
    return _loadTrainingColumn('package')

# 1 English vocab list

In [28]:
import pandas as pd
import numpy as np
from stemming.porter2 import stem
from nltk.stem import PorterStemmer
from numpy import linalg as LA

def getVocabList():
    """Load the full vocabulary from ``../vocab2.csv``.

    Returns:
        DataFrame with two columns, in this order:
        ``number`` - 1-based position of the word in the file,
        ``word``   - the vocabulary entry (taken from the CSV column ``'0'``).
    """
    # keep_default_na=False: by default pandas parses strings such as "null",
    # "nan" or "NA" as missing values, which would silently turn those
    # vocabulary words into NaN so that they can never match a token.
    vocabs = pd.read_csv('../vocab2.csv', keep_default_na=False)
    vocabs['number'] = vocabs.index + 1
    # Move column '0' to the end under the name 'word'.
    vocabs['word'] = vocabs.pop('0')
    return vocabs

In [35]:
# Quick sanity check of the Porter stemmer.
# NOTE(review): the original cell read `words` before any visible cell defined
# it, so it failed on a fresh kernel. The sample inputs below are a
# reconstruction chosen to reproduce the recorded output - confirm or replace.
ps = PorterStemmer()
sample_words = ['game', 'gaming', 'gamed', 'games']
words = [ps.stem(word) for word in sample_words]
words

['game', 'game', 'game', 'game']

In [3]:
# Load the vocabulary once and reuse it for both the shape and the preview,
# instead of parsing the CSV a second time.
vocab_preview = getVocabList()
print(vocab_preview.shape)
vocab_preview[0:10]

(223737, 2)


Unnamed: 0,number,word
0,1,a
1,2,aa
2,3,aaa
3,4,aah
4,5,aal
5,6,aalii
6,7,aam
7,8,aani
8,9,aardvark
9,10,aardwolf


# 2 Process text to feature vector

In [4]:
import re
from stemming.porter2 import stem

def processText(email_contents):
    """Normalise a text and map its stemmed words to full-vocabulary numbers.

    Steps, in order: lower-case, strip HTML tags, collapse whitespace
    characters to spaces, replace digit runs with ``number``, URLs with
    ``httpaddr``, e-mail addresses with ``emailaddr`` and ``$`` runs with
    ``dollar``, replace every remaining non-alphanumeric run with a space,
    Porter-stem each word, and finally look each stem up in the vocabulary
    returned by ``getVocabList()``.

    Args:
        email_contents: the raw text (str).

    Returns:
        List of vocabulary numbers (ints), one per recognised word, in text
        order. Words that are not in the vocabulary are dropped; repeated
        words yield repeated numbers.
    """
    # Load the vocabulary first so a missing vocab file fails before any work.
    vocabList = getVocabList()

    # ----- Normalise the text -----
    text = email_contents.lower()

    # Strip HTML: anything that starts with '<', ends with '>' and has no
    # '<' or '>' in between is replaced with a space. (The previous pattern
    # '[>,<,<*>]' was a character class that only deleted the characters
    # '>', ',', '<' and '*', leaving tag names in the text.)
    text = re.sub(r'<[^<>]+>', ' ', text)
    # \s is equivalent to the class [ \t\n\r\f\v]; turn each into a space.
    text = re.sub(r'\s', ' ', text)

    # Numbers: one or more characters between 0-9.
    text = re.sub(r'\d+', 'number', text)

    # URLs: strings starting with http:// or https://
    text = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        'httpaddr',
        text,
    )

    # E-mail addresses: strings with @ in the middle.
    text = re.sub(r'[\w\.-]+@[\w\.-]+', 'emailaddr', text)

    # $ sign
    text = re.sub(r'[$]+', 'dollar', text)

    # Any remaining run of non-alphanumeric characters becomes one space.
    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)

    # ----- Stem words -----
    ps = PorterStemmer()
    # Only letters, digits and spaces are left, so split() yields exactly the
    # non-empty space-separated tokens.
    words = [ps.stem(token) for token in text.split()]

    # ----- Convert to vocabulary numbers -----
    # Build the word -> number table once instead of scanning the whole
    # vocabulary column for every token. setdefault keeps the first
    # occurrence of a duplicated word, matching the old `.values[0]`.
    number_of = {}
    for vocab_word, number in zip(vocabList['word'], vocabList['number'].astype(int)):
        number_of.setdefault(vocab_word, number)

    word_indices = [number_of[w] for w in words if w in number_of]

    return word_indices

In [5]:
# Example: turn one sentence into its list of full-vocabulary word numbers.
processText("Send someone to repair my bathroom.")

[171879, 178439, 196596, 161447, 118766, 18448]

In [6]:
def textFeatures(word_indices):
    """Build a 0/1 indicator vector over the full vocabulary.

    Args:
        word_indices: iterable of vocabulary numbers (as produced by
            ``processText``).

    Returns:
        1-D numpy array with one entry per vocabulary row: 1 where that row's
        ``number`` appears in ``word_indices``, otherwise 0.
    """
    vocab_numbers = getVocabList()['number'].astype(int)
    # isin() gives a boolean mask; adding 0 turns True/False into 1/0.
    indicator = vocab_numbers.isin(word_indices) + 0
    return np.array(indicator)

In [7]:
# Example: indicator vector over the full vocabulary for the same sentence.
textFeatures(processText("Send someone to repair my bathroom."))

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

# 3 Initial feature vector for each menu

In [10]:
def getInitialFeatureVectoc():
    """Build one binary word-presence vector per menu from the training data.

    For every menu, the indicator vectors of all its training sentences are
    summed and then thresholded, so an entry is 1 when the word occurs in at
    least one sentence of that menu.

    Returns:
        2-D numpy int array with one row per menu, in the order
        topic, event, call, water, visitor, package, and one column per
        vocabulary entry.
    """
    # Load every data set up front so a missing file fails before any
    # feature extraction starts.
    datasets = [
        topicData(),
        eventData(),
        callData(),
        waterData(),
        visitorData(),
        packageData(),
    ]

    # The vocabulary size is the same for every menu; read it once.
    vocab_size = len(getVocabList())

    menu_vectors = []
    for sentences in datasets:
        counts = np.zeros(vocab_size)
        for sentence in sentences:
            counts = counts + textFeatures(processText(sentence))
        # Collapse occurrence counts to presence/absence.
        menu_vectors.append((counts >= 1).astype(int))

    return np.array(menu_vectors)

In [11]:
# Build the per-menu indicator vectors over the full vocabulary
# (rows: topic, event, call, water, visitor, package) and display them.
initial_features = getInitialFeatureVectoc()
initial_features

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# 4 Compare similarity

In [26]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

def most_similarity(initial_features, text_feature, threshold=0.3):
    """Pick the menu whose reference vector is most similar to a text vector.

    Args:
        initial_features: 2-D array, one reference vector per menu, in the
            order topic, event, call, water, visitor, package.
        text_feature: 1-D feature vector of the text to classify, same length
            as each reference vector.
        threshold: minimum cosine similarity required to accept the best
            menu; below it the text is labelled "other". Defaults to 0.3.

    Returns:
        One of "topic", "event", "call", "water", "visitor", "package",
        or "other".

    Side effects:
        Prints the full pairwise cosine-similarity matrix.
    """
    menu_labels = ("topic", "event", "call", "water", "visitor", "package")

    # Stack the text under the reference rows so that the last row/column of
    # the similarity matrix belongs to the text.
    A = np.vstack((initial_features, text_feature))
    similarities = cosine_similarity(sparse.csr_matrix(A))
    print('pairwise dense output:\n {}\n'.format(similarities))

    # Last row = text vs. everything; drop the final entry (text vs. itself).
    text_vs_initial = similarities[-1, :-1]
    prob_of_menu = np.max(text_vs_initial)
    if prob_of_menu < threshold:
        return "other"

    menu = np.argmax(text_vs_initial)
    # Any position past the known labels maps to the last label, which is
    # what the original if/elif ladder did.
    return menu_labels[min(menu, len(menu_labels) - 1)]


In [13]:
# Example: classify one sentence against the full-vocabulary menu vectors.
most_similarity(initial_features,textFeatures(processText("I have to send it offline.")))

pairwise dense output:
 [[ 1.          0.07715167  0.3678836   0.10101525  0.0805823   0.18898224
   0.21821789]
 [ 0.07715167  1.          0.06622662  0.          0.17407766  0.          0.        ]
 [ 0.3678836   0.06622662  1.          0.086711    0.13834289  0.16222142
   0.18731716]
 [ 0.10101525  0.          0.086711    1.          0.11396058  0.          0.        ]
 [ 0.0805823   0.17407766  0.13834289  0.11396058  1.          0.          0.        ]
 [ 0.18898224  0.          0.16222142  0.          0.          1.
   0.8660254 ]
 [ 0.21821789  0.          0.18731716  0.          0.          0.8660254
   1.        ]]



'package'

# 5 Optimization

### Vocab List

In [14]:
def createVocabOpt():
    """Write a reduced vocabulary containing only words seen in training data.

    Every training sentence of every menu is converted to an indicator vector
    over the full vocabulary; vocabulary rows that are hit at least once are
    kept and written to ``../vocab_opt.csv`` (without the index) with the
    word column named ``word``.

    Returns:
        The string "Create Done!".
    """
    # keep_default_na=False: stop pandas from parsing words such as "null"
    # or "nan" as missing values, which would write them out as blanks.
    vocabs = pd.read_csv('../vocab2.csv', keep_default_na=False)

    datasets = [
        topicData(),
        eventData(),
        callData(),
        waterData(),
        visitorData(),
        packageData(),
    ]

    # Total number of sentences (across all menus) that contain each word.
    all_words = np.zeros(len(vocabs))
    for sentences in datasets:
        for sentence in sentences:
            all_words = all_words + textFeatures(processText(sentence))

    # .copy() so the column edits below act on an independent frame rather
    # than on a slice of `vocabs` (avoids SettingWithCopyWarning).
    used_vocabs = vocabs.loc[all_words >= 1].copy()
    # Move column '0' to the end under the name 'word'.
    used_vocabs['word'] = used_vocabs.pop('0')

    used_vocabs.to_csv('../vocab_opt.csv', index=False)

    return "Create Done!"

In [15]:
# Generate ../vocab_opt.csv from the training data.
createVocabOpt()

'Create Done!'

# ------------------------------ END Train -----------------------------

In [1]:
def getVocabListOpt():
    """Load the reduced vocabulary from ``../vocab_opt.csv``.

    Returns:
        DataFrame with the columns read from the file plus a ``number``
        column holding the 1-based row position.
    """
    # keep_default_na=False: by default pandas parses strings such as "null",
    # "nan" or "NA" as missing values, which would turn those vocabulary
    # words into NaN so that they can never match a token.
    vocabs = pd.read_csv('../vocab_opt.csv', keep_default_na=False)
    vocabs['number'] = vocabs.index + 1
    return vocabs

In [4]:
# Load the reduced vocabulary once and reuse it for both the shape and the
# preview, instead of reading the CSV twice.
vocab_opt_preview = getVocabListOpt()
print(vocab_opt_preview.shape)
vocab_opt_preview.head()

(56, 2)


Unnamed: 0,word,number
0,a,1
1,activ,2
2,and,3
3,are,4
4,ask,5


In [5]:
import re
from stemming.porter2 import stem

def processTextOpt(email_contents):
    """Normalise a text and map its stemmed words to reduced-vocabulary numbers.

    Steps, in order: lower-case, strip HTML tags, collapse whitespace
    characters to spaces, replace digit runs with ``number``, URLs with
    ``httpaddr``, e-mail addresses with ``emailaddr`` and ``$`` runs with
    ``dollar``, replace every remaining non-alphanumeric run with a space,
    Porter-stem each word, and finally look each stem up in the vocabulary
    returned by ``getVocabListOpt()``.

    Args:
        email_contents: the raw text (str).

    Returns:
        List of vocabulary numbers (ints), one per recognised word, in text
        order. Words that are not in the vocabulary are dropped; repeated
        words yield repeated numbers.
    """
    # Load the vocabulary first so a missing vocab file fails before any work.
    vocabList = getVocabListOpt()

    # ----- Normalise the text -----
    text = email_contents.lower()

    # Strip HTML: anything that starts with '<', ends with '>' and has no
    # '<' or '>' in between is replaced with a space. (The previous pattern
    # '[>,<,<*>]' was a character class that only deleted the characters
    # '>', ',', '<' and '*', leaving tag names in the text.)
    text = re.sub(r'<[^<>]+>', ' ', text)
    # \s is equivalent to the class [ \t\n\r\f\v]; turn each into a space.
    text = re.sub(r'\s', ' ', text)

    # Numbers: one or more characters between 0-9.
    text = re.sub(r'\d+', 'number', text)

    # URLs: strings starting with http:// or https://
    text = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        'httpaddr',
        text,
    )

    # E-mail addresses: strings with @ in the middle.
    text = re.sub(r'[\w\.-]+@[\w\.-]+', 'emailaddr', text)

    # $ sign
    text = re.sub(r'[$]+', 'dollar', text)

    # Any remaining run of non-alphanumeric characters becomes one space.
    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)

    # ----- Stem words -----
    ps = PorterStemmer()
    # Only letters, digits and spaces are left, so split() yields exactly the
    # non-empty space-separated tokens.
    words = [ps.stem(token) for token in text.split()]

    # ----- Convert to vocabulary numbers -----
    # Build the word -> number table once instead of scanning the whole
    # vocabulary column for every token. setdefault keeps the first
    # occurrence of a duplicated word, matching the old `.values[0]`.
    number_of = {}
    for vocab_word, number in zip(vocabList['word'], vocabList['number'].astype(int)):
        number_of.setdefault(vocab_word, number)

    word_indices = [number_of[w] for w in words if w in number_of]

    return word_indices

In [6]:
# Example: the same sentence mapped to reduced-vocabulary word numbers.
processTextOpt("Send someone to repair my bathroom.")

[41, 44, 50, 40, 30, 6]

In [7]:
def textFeaturesOpt(word_indices):
    """Build a 0/1 indicator vector over the reduced vocabulary.

    Args:
        word_indices: iterable of vocabulary numbers (as produced by
            ``processTextOpt``).

    Returns:
        1-D numpy array with one entry per vocabulary row: 1 where that row's
        ``number`` appears in ``word_indices``, otherwise 0.
    """
    vocab_numbers = getVocabListOpt()['number'].astype(int)
    # isin() gives a boolean mask; adding 0 turns True/False into 1/0.
    indicator = vocab_numbers.isin(word_indices) + 0
    return np.array(indicator)

In [8]:
# Example: indicator vector over the reduced vocabulary for the same sentence.
textFeaturesOpt(processTextOpt("Send someone to repair my bathroom."))

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [35]:
def getInitialFeatureVectorOpt():
    """Build one unit-length word-count vector per menu from the training data.

    For every menu, the reduced-vocabulary indicator vectors of all its
    training sentences are summed (so an entry counts the sentences that
    contain the word) and the sum is scaled to unit Euclidean length.

    Returns:
        2-D numpy float array with one row per menu, in the order
        topic, event, call, water, visitor, package, and one column per
        reduced-vocabulary entry.
    """
    # Load every data set up front so a missing file fails before any
    # feature extraction starts.
    datasets = [
        topicData(),
        eventData(),
        callData(),
        waterData(),
        visitorData(),
        packageData(),
    ]

    # The vocabulary size is the same for every menu; read it once.
    vocab_size = len(getVocabListOpt())

    menu_vectors = []
    for sentences in datasets:
        counts = np.zeros(vocab_size)
        for sentence in sentences:
            counts = counts + textFeaturesOpt(processTextOpt(sentence))

        # Scale to a unit vector. A menu with no recognised words has norm 0;
        # leave it as all zeros instead of dividing by zero and producing NaN.
        norm = LA.norm(counts)
        menu_vectors.append(counts / norm if norm > 0 else counts)

    return np.array(menu_vectors)

In [36]:
# Build the unit-length per-menu vectors over the reduced vocabulary
# (rows: topic, event, call, water, visitor, package) and display them.
initial_features_opt = getInitialFeatureVectorOpt()
initial_features_opt

array([[ 0.1796053 ,  0.        ,  0.1796053 ,  0.        ,  0.        ,
         0.1796053 ,  0.3592106 ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.3592106 ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.1796053 ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.1796053 ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.53881591,
         0.3592106 ,  0.        ,  0.        ,  0.1796053 ,  0.1796053 ,
         0.1796053 ,  0.        ,  0.        ,  0.        ,  0.1796053 ,
         0.        ,  0.1796053 ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.40824829,  0.        ,  0.        ,  0.20412415,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.     

### Homecare
- ผนังห้องนอนร้าวส่งช่างมาซ่อมหน่อย --> The bedroom is cracked and repaired.

### Event / Activity
- สอบถามกิจกรรมดูหนัง --> View Forum Posts

### ค่าน้ำ
- ค่าน้ำเดือนที่เเล้วเท่าไหร่ --> How much water monthly?

### Call - Negative
- ไฟดับนานมาก ดับมา 2 ชั่วโมงแล้ว มาซ่อมสักที --> The power went out for 2 hours and then came back.

### สอบถามทั่วไป เกี่ยวกับโครงการ
- สอบถามโครงการเซนทริกรัชโยธินครับ --> Ask for the Centric Ratchayothin project.

In [37]:
# Example: classify one sentence against the reduced-vocabulary menu vectors.
most_similarity(initial_features_opt,textFeaturesOpt(processTextOpt("I have to send it offline.")))

pairwise dense output:
 [[ 1.          0.03666178  0.31933855  0.03666178  0.07184212  0.19050019
   0.21997067]
 [ 0.03666178  1.          0.02592379  0.          0.12247449  0.          0.        ]
 [ 0.31933855  0.02592379  1.          0.05184758  0.15240015  0.1796053
   0.20739034]
 [ 0.03666178  0.          0.05184758  1.          0.08164966  0.          0.        ]
 [ 0.07184212  0.12247449  0.15240015  0.08164966  1.          0.          0.        ]
 [ 0.19050019  0.          0.1796053   0.          0.          1.
   0.8660254 ]
 [ 0.21997067  0.          0.20739034  0.          0.          0.8660254
   1.        ]]



'package'

In [38]:
def menu_name(position):
    """Translate a menu row position into its label.

    Positions 0-5 map to "topic", "event", "call", "water", "visitor" and
    "package" respectively; any other position yields "other".
    """
    labels_by_position = {
        0: "topic",
        1: "event",
        2: "call",
        3: "water",
        4: "visitor",
        5: "package",
    }
    return labels_by_position.get(position, "other")

In [39]:
def second_largest(numbers):
    """Return the second-largest item of an iterable, counting duplicates.

    A repeated maximum counts twice, so ``[3, 3]`` gives 3. Returns None when
    the iterable holds fewer than two items.
    """
    best = runner_up = float('-inf')
    seen = 0
    for value in numbers:
        seen += 1
        if value > runner_up and value >= best:
            # New (or tied) maximum: the old maximum becomes the runner-up.
            best, runner_up = value, best
        elif value > runner_up:
            # Sits between the current runner-up and the maximum.
            runner_up = value
    if seen < 2:
        return None
    return runner_up

In [44]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

def most_similarity2(initial_features, text_feature, min_similarity=0.2, tie_margin=0.1):
    """Return the best menu for a text, plus the runner-up when it is close.

    Args:
        initial_features: 2-D array, one reference vector per menu (row
            positions are translated to labels by ``menu_name``).
        text_feature: 1-D feature vector of the text to classify, same length
            as each reference vector.
        min_similarity: minimum cosine similarity the best menu must reach;
            below it the result is ``["other"]``. Defaults to 0.2.
        tie_margin: if the best and second-best similarities differ by less
            than this, both menus are returned. Defaults to 0.1.

    Returns:
        ``["other"]``, ``[best_menu]`` or ``[best_menu, second_menu]``.

    Side effects:
        Prints the text-vs-menu similarities and, when the best score passes
        ``min_similarity``, the gap between the two best scores.
    """
    # Stack the text under the reference rows so that the last row/column of
    # the similarity matrix belongs to the text.
    A = np.vstack((initial_features, text_feature))
    similarities = cosine_similarity(sparse.csr_matrix(A))
    # Last row = text vs. everything; drop the final entry (text vs. itself).
    text_vs_initial = similarities[-1, :-1]
    print('pairwise dense output: {}\n'.format(text_vs_initial))

    first_prob = np.max(text_vs_initial)
    if first_prob < min_similarity:
        return ["other"]

    # Rank menus by descending similarity. The stable sort keeps tied menus
    # in row order, so the two best positions are always different rows.
    # (Looking the runner-up up by value with np.where returned the winner
    # again whenever the two best scores were equal.)
    ranking = np.argsort(-text_vs_initial, kind='stable')
    first_position = ranking[0]
    if len(ranking) < 2:
        # Only one menu to compare against: there is no runner-up.
        return [menu_name(first_position)]

    second_position = ranking[1]
    second_prob = text_vs_initial[second_position]
    print(first_prob - second_prob)
    if first_prob - second_prob < tie_margin:
        return [menu_name(first_position), menu_name(second_position)]
    return [menu_name(first_position)]

In [55]:
# Example: the two best menus are returned when their scores are close.
most_similarity2(initial_features_opt,textFeaturesOpt(processTextOpt("Want to see the movie.")))

pairwise dense output: [ 0.20739034  0.23570226  0.29329423  0.11785113  0.23094011  0.20412415]

0.0575919696472


['call', 'event']

In [67]:
def butler_menu(text):
    """Classify a raw text into menu label(s).

    The text is converted to a reduced-vocabulary indicator vector, scaled to
    unit length (left unchanged when it is all zeros) and compared against
    the freshly built per-menu vectors with ``most_similarity2``.

    Returns:
        Whatever ``most_similarity2`` returns for the text.
    """
    text_vector = textFeaturesOpt(processTextOpt(text))
    magnitude = LA.norm(text_vector)
    # A zero vector cannot be normalised; pass it through as is.
    text_unit_vector = text_vector if magnitude == 0 else text_vector / magnitude
    return most_similarity2(getInitialFeatureVectorOpt(), text_unit_vector)

In [68]:
# Example: end-to-end classification of a single-word text.
butler_menu("Want")

pairwise dense output: [ 0.  0.  0.  0.  0.  0.]



['other']