In [563]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### SP500 Data

In [603]:
# read from csv file
sp500_data = pd.read_excel('sp500.xlsx', header=None)

In [604]:
sp500_names = pd.DataFrame()
sp500_names['ticker'] = sp500_data.iloc[:,0]
sp500_names['name'] = sp500_data.iloc[:,1]

In [605]:
# names to lowercase
def lower(string):
    return string.lower()

sp500_names['name'] = sp500_names['name'].apply(lower)

In [606]:
sp500_names.head()

Unnamed: 0,ticker,name
0,MMM,3m company
1,ABT,abbott laboratories
2,ABBV,abbvie inc.
3,ACN,accenture plc
4,ATVI,activision blizzard


In [607]:
# freq analysis of names to see what to remove
split_names = pd.Series(sum([x.split(' ') for x in sp500_names['name']], []))

In [608]:
# remove common terms that appear more than freq times and get a 
freq = 3
term_count = split_names.value_counts()
remove_terms = list(term_count.loc[term_count>freq].index.values)

In [609]:
remove_terms.remove('a')
remove_terms.remove('co')
remove_terms.append('data')
remove_terms.append('price')
remove_terms.append('ge')
remove_terms.remove('brands')
remove_terms.remove('of')

In [610]:
clean_names = []
for x in sp500_names['name']:
    new_x = x
    for y in remove_terms:
        new_x = new_x.replace(y, '')
    clean_names.append(new_x.strip())

In [611]:
sp500_names['clean_name'] = clean_names
sp500_names.head()

Unnamed: 0,ticker,name,clean_name
0,MMM,3m company,3m
1,ABT,abbott laboratories,abbott laboratories
2,ABBV,abbvie inc.,abbvie
3,ACN,accenture plc,accenture
4,ATVI,activision blizzard,activision blizzard


### Keyword Approach

In [612]:
# removing stop words
from nltk.corpus import stopwords
import string

def remove_punctuation(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    return s


def to_words(review_text):
    # 2. Remove non-letters        
    #letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    letters_only = remove_punctuation(review_text)
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    
    letters = list(string.ascii_lowercase)
    words_without_letters = [w for w in meaningful_words if not w in letters]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    
    
    return( " ".join( words_without_letters )) 

# remove verbs
# stemming (without stemming the nouns)

In [613]:
to_words("I would like to get last year's data on Apples lab")

'would like get last years data apples lab'

In [618]:
from fuzzywuzzy import fuzz as fw
import re

a = 0.6
# getting the Name
# ticker matching too

def name_finder(text):
    """
    Inputs: string sentence with removed stopwords
    
    Returns: company ticker
    
    Selects ticker that has max average match of all words in the input with the
    company names and company tickers. If there is a perfect match return the ticker
    directly and stop.
    """
    words = text.split(' ')
    max_scores = []
    tickers = []
    max_scores2 = []
    for w in words:
        scores = 0
        ticker = ''
        for i, row in sp500_names.iterrows():
            #nms = row['clean_name'].split(' ')
            nms = re.findall(r"[\w']+", row['clean_name'])
            fuzzs = []
            try:
                for nm in nms:
                    fuzz = fw.ratio(nm, w)
                    #if fuzz == 100:
                        #print(nm, w)
                    #    return row['ticker']
                    fuzzs.append(fuzz)
                if scores <= max(fuzzs):
                    scores = max(fuzzs)
                    ticker = row['ticker']
            except:
                continue
        max_scores.append(scores)
        tickers.append(ticker)
    for tk in tickers:
        scores2 = 0
        for w in words:
            fuzz2 = fw.ratio(tk.lower(), w)
            if fuzz2 == 100:
                #print(tk, w)
                return tk
            if scores2 <= fuzz2:
                scores2 = fuzz2
        max_scores2.append(scores2)
    avg_score = a*np.array(max_scores) + (1.0-a)*np.array(max_scores2)
    df = pd.DataFrame()
    df['ticker'] = tickers
    df['name_score'] = max_scores
    df['ticker_score'] = max_scores2
    df['avg_score'] = avg_score
    #print(df.sort_values('avg_score', ascending=False))
    return tickers[np.argmax(avg_score)]

In [619]:
name_finder(to_words("Stock prices for apple and microsoft"))

u'AAPL'

In [620]:
sp500_names[sp500_names['ticker'] == 'MSFT']

Unnamed: 0,ticker,name,clean_name
315,MSFT,microsoft corp.,microsoft


### Testing

In [621]:
test_data = pd.read_csv('test_names.csv')

In [622]:
test_data.head()

Unnamed: 0,Training set,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,,,,for time: https://nlp.stanford.edu/projects/ti...,,,,
1,,,,,,,,
2,,,IGNORE,IGNORE,IGNORE,,,
3,Question,,Time (relational to today),Range,Metric,Name,,
4,Can you give me Apples stock for yesterday,,t-1,1,price,AAPL,,


In [623]:
test_data = test_data.loc[4:]

In [624]:
data_test = pd.DataFrame()
data_test['question'] = test_data['Training set']
data_test['name'] = test_data.iloc[:,5]

In [625]:
data_test.head()

Unnamed: 0,question,name
4,Can you give me Apples stock for yesterday,AAPL
5,How did Google do last year?,GOOGL
6,How well did Accenture stock do last month?,ACN
7,What was BlackRock's price yesterday,BLK
8,Give me Boeing's price yesterday,BA


In [626]:
# accuracy of keyword matching
data_test['clean_question'] = data_test['question'].apply(to_words)
data_test['score'] = data_test['clean_question'].apply(name_finder)

In [627]:
# include match
data_test['acc'] = [1 if x[0] == x[1] else 0
                    for x in zip(data_test['name'], data_test['score'])]

In [628]:
data_test.head()

Unnamed: 0,question,name,clean_question,score,acc
4,Can you give me Apples stock for yesterday,AAPL,give apples stock yesterday,AAPL,1
5,How did Google do last year?,GOOGL,google last year,GT,0
6,How well did Accenture stock do last month?,ACN,well accenture stock last month,ACN,1
7,What was BlackRock's price yesterday,BLK,blackrocks price yesterday,BLK,1
8,Give me Boeing's price yesterday,BA,give boeings price yesterday,BA,1


In [629]:
# accuracy
print('accuracy: ', np.sum(data_test['acc'])/float(data_test.shape[0]))

('accuracy: ', 0.796875)


In [630]:
data_test[data_test['acc'] == 0]['name'].value_counts()

MMM      2
AMZN     2
GE       2
GOOG     2
GOOGL    1
AT&T     1
ATT      1
EBAY     1
MSFT     1
Name: name, dtype: int64

In [592]:
data_test[data_test['acc'] == 0]

Unnamed: 0,question,name,clean_question,score,acc
5,How did Google do last year?,GOOGL,google last year,GT,0
20,What was Ebay's px yesterday,EBAY,ebays px yesterday,PX,0
28,Get GE's price 52w ago,GE,get ges price 52w ago,WFC,0
36,Give me Alphabet Inc. price 3M low,GOOG,give alphabet inc price 3m low,MMM,0
37,give me the share price of either Google share...,GOOG,give share price either google shares last done,SCHW,0
38,MMM price seven weeks ago,MMM,mmm price seven weeks ago,MGM,0
41,Tell me MMM price,MMM,tell mmm price,MGM,0
43,Request for AT&T price,ATT,request att price,T,0
46,Gather for me GE's price,GE,gather ges price,IT,0
47,Source for me AT&T's price,AT&T,source atts price,T,0
