# Imports and Get Data

In [2]:
import json
import pandas as pd
import re
import random
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
#from sklearn.metrics import roc_curve, auc, roc_auc_score
import sklearn.metrics # import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
import sklearn.model_selection as ms
import time

In [3]:
data = pd.read_json('./train_data/merchantSwipeDump_old.json')

In [4]:
data.sample(20)

Unnamed: 0,mapped_brand,mcc,merchant_string,network
264620,,5621.0,VICTORIA'S SECRET 0144,V
298064,,4121.0,UBER *TRIP QZ66O,V
42906,,,CASEYS 4301 STONE A,
143923,,7298.0,MAY FLOWER SPA,V
58126,,5814.0,CHIPOTLE 0666,V
247795,,7832.0,TWIN CREEK CONCESSIONS,V
54769,,7999.0,CHOCTAW C&R THE DISTRI,V
74995,,5812.0,HAPPY GUY CHINESE CUIS,V
280603,,5541.0,WINGATE CITGO FO,V
302548,,,UBER *TRIP WD7AB,


# Cleaning Functions

In [5]:
def preprocess(df):
    ## Remove ".0" from MCC column
    df['mcc']=df['mcc'].apply(lambda x: str(x).strip(".0"))
    ## Change mcc nan to NaN
    df['mcc'] = np.where(df['mcc']=='nan', np.nan, df['mcc'])
    df['merchant_string'] = df['merchant_string'].apply(lambda x: x.lower())

In [6]:
## Map by MCC as well
mcc_dict = {'6011': 'atm', '6010': 'atm', '7523':'parking'}

def mcc_dict_funct(df, col_origin, col_output, mcc_dict):
    for key, value in mcc_dict.items():
            df[col_output] = np.where(df[col_origin]==key, value, df[col_output])

In [7]:
def most_common_words(df, col):
    # Turn merchant string into list and flatten list of sublists
    words_merchant_string_2 = [elem for sublist in df[col].tolist() for elem in sublist]

    # Get DataFrame of words with their count
    wordcnt_df = pd.DataFrame.from_dict(dict(Counter(words_merchant_string_2)), orient='index')\
        .reset_index().rename(columns={"index": "keyword", 0: "cnt"})\
        .sort_values(by='cnt',ascending=False)
    most_common_words = list(wordcnt_df['keyword'][0:1000])
    most_common_words1 = ["\\" + x if x[0] == "*" else x for x in most_common_words]
    return wordcnt_df, most_common_words, most_common_words1

In [8]:
def dummify_data(df, most_common_words1):
    for keyword in most_common_words1:
        df[keyword] = np.where(df['merchant_string'].str.contains(keyword),1,0)

# Clean and Prepare Data

In [11]:
import sys
sys.path.insert(0, '../utility_files')
import cleanData

preprocess(data)
data['mapped_brand_response'] = ""
mcc_dict_funct(data, 'mcc', 'mapped_brand_response', mcc_dict)

remove_string = '^[0-9]*[0-9]$|^www$|^com$|^ave|^street$|^road$|^and$|^inc$|^at$|^drive$|^of$|^main$|^the$|^[ewns]$|^#|^[0-9]*th$|^3rd$|^2nd$|^1st$|^store$|^st$|^rd$|^blvd$|^hwy$|^dr$'
split_string = '[-./ ]'

cleanData.clean(data, old_col='merchant_string', col='merchant_string1',split_string=split_string,
  remove_string = remove_string,lowercase=False, remove_empty_strings_bi=True,
  join_mcc_bi=False,rejoin_col_strings_bi=False)

wordcnt_df, most_common_words, most_common_words1 = most_common_words(data,'merchant_string1')

data_dummified = data.copy()

In [12]:
# Dummify data (on most common words in merchant string cleaned)
start_time = time.time()
dummify_data(data_dummified, most_common_words1)  

# Dummify additional columns (mcc, network) and drop merchant string column
data_dummified=pd.get_dummies(data_dummified, prefix=['mcc', 'network'], columns=['mcc', 'network'])

#Delete merchant_string_columns
del_cols = ['mapped_brand_response']
for x in data_dummified.columns:
    if bool(re.search('^merchant_string',x)):
        del_cols.append(x)

#data_dummified.columns.contain(del_cols), axis = 1)
data_dummified =data_dummified.drop(del_cols, axis = 1)

print("--- %s seconds ---" % (time.time() - start_time))
#takes about 7 mins (417 sec)

--- 196.6222379207611 seconds ---


# Modelling Functions

In [13]:
# Train/Test/Holdout
def train_test_holdout(df):
    # Train and Test from original full dfset
    train_full = df[df.mapped_brand.notna()]
    test = df[df.mapped_brand.isna()]

    # Split into X and y for each set
    X_train_full = train_full.drop('mapped_brand', axis=1)
    y_train_full = train_full['mapped_brand']
    X_test = test.drop('mapped_brand', axis=1)
    y_test = test['mapped_brand']

    # Train/Holdout split
    X_train_wo_holdout, X_holdout, y_train_wo_holdout, y_holdout = train_test_split(
         X_train_full, y_train_full, test_size=0.2, random_state=42)

    # Combine X and Y columns for Holdout and Train wo Holdout
#     holdout = train_full[train_full.index.isin(X_holdout.index.values)]
#     train_wo_holdout = train_full[train_full.index.isin(X_train_wo_holdout.index.values)]
    return X_train_full, y_train_full, X_test, y_test, X_train_wo_holdout, X_holdout, y_train_wo_holdout, y_holdout

def multinomial_regression(X_train_wo_holdout, y_train_wo_holdout,X_holdout, y_holdout):
    multinomial = LogisticRegression(multi_class='multinomial', random_state = 42, solver='lbfgs', C=1e5, class_weight = 'balanced') 
    multinomial.fit(X_train_wo_holdout, y_train_wo_holdout)
    mapped_brand_proba = multinomial.predict_proba(X_holdout)
    
    probs = pd.DataFrame(mapped_brand_proba)
    probs['max_prob'] = probs.max(axis = 1)
    probs = probs['max_prob']
    
    mapped_brand_predicted = multinomial.predict(X_holdout)
    
    score = multinomial.score(X_holdout, y_holdout)
    
    mapped_brand_predicted1 = pd.DataFrame(mapped_brand_predicted)
    
    return multinomial, mapped_brand_proba, probs, mapped_brand_predicted, score, mapped_brand_predicted1

def multinomial_output_train(df, mapped_brand_predicted1, probs):
#    mapped_brand_predicted1 = pd.DataFrame(mapped_brand_predicted)
    X_holdout1 = df.assign(mapped_brand_predicted1=mapped_brand_predicted1.values)[['mapped_brand_predicted1']]
    X_holdout1 = X_holdout1.assign(probs = probs.values)
    
    output = data.join(X_holdout1, how = 'inner')
    output['mcc'] = output['mcc'].fillna(-1)
    output['network'] = output['network'].fillna(-1)
    output['mapped_brand_response'] = np.where(output['mapped_brand_response']=='', output['mapped_brand_predicted1'], output['mapped_brand_response'])
    #can add rule about replacing item in column when probability is above a certain threshold
    output.drop('mapped_brand_predicted1',axis=1, inplace = True)
    
    # Add flag on whether mapped brand and predicted mapped brand are same
    output['correct_flag'] = np.where(output['mapped_brand'] == output['mapped_brand_response'], 1, 0)

    correct_overall = output.agg(['sum','count','mean'])[['correct_flag']]
    
    correct_by_brand = output.groupby('mapped_brand').agg(['sum','count','mean'])['correct_flag'].reset_index()\
    .sort_values(by='count',ascending=False)\
    .rename(columns={'sum':'nbr_correct', 'count':'nbr_records', 'mean':'pct_correct'})
    
    return output, correct_overall, correct_by_brand

def multinomial_output_test(df, mapped_brand_predicted1, probs):
#    mapped_brand_predicted1 = pd.DataFrame(mapped_brand_predicted)
    X_holdout1 = df.assign(mapped_brand_predicted1=mapped_brand_predicted1.values)[['mapped_brand_predicted1']]
    X_holdout1 = X_holdout1.assign(probs = probs.values)
    
    output = data.join(X_holdout1, how = 'inner')
    output['mcc'] = output['mcc'].fillna(-1)
    output['network'] = output['network'].fillna(-1)
    output['mapped_brand_response'] = np.where(output['mapped_brand_response']=='', output['mapped_brand_predicted1'], output['mapped_brand_response'])
    #can add rule about replacing item in column when probability is above a certain threshold
    output.drop('mapped_brand_predicted1',axis=1, inplace = True)
    
    # Add flag on whether mapped brand and predicted mapped brand are same
    output['correct_flag'] = np.where(output['mapped_brand'] == output['mapped_brand_response'], 1, 0)
    
    return output


mapping_dict = {'vend': 'vendingmachine', #'usa': 'vendingmachine',
                'wal-mart': 'walmart', 'walmart': 'walmart', 'wm supercenter': 'walmart',
                'uber ': 'uber', # or ubereats
                'paypal': 'paypal',
                "mcdonald's": 'mcdonalds', 'mcdonalds': 'mcdonalds',
                'target t-': 'target', 'target.com': 'target',
                'cvs/pharm': 'cvs',
                'walgreens': 'walgreens',
                'starbucks': 'starbucks', 
                'chick-fil-a': 'chickfila',
                'gamestop': 'gamestop', 
                'google \*': 'googleplay', # or google,
                'kroger': 'kroger',
                'chipotle': 'chipotle',
                'apl\* itunes.com/bill': 'appleitunes', # needs a \ before *
                'dunkin': 'dunkindonuts',
                'amazon': 'amazon',
                'lyft': 'lyft',
                '7-eleven': 'seveneleven', '7 eleven': 'seveneleven',
                "victoria's secret": 'victoriassecret', 'victoriassecret.com':'victoriassecret',
                'etsy.com': 'etsy', 'etsy': 'etsy',
                'duane reade': 'duanereade',
                'taco bell': 'tacobell',
                'dollar-general': 'dollargeneral', 'dollar general': 'dollargeneral', 'dollar ge': 'dollargeneral',
                "wendy's": 'wendys', 'wendys': 'wendys',
                'amc ': 'amc',
                'safeway store': 'safeway', 'safeway': 'safeway',
                'panera bread': 'panerabread',
                'subway restaurant': 'subway',
                'sonic': 'sonic',
                'rite aid store': 'riteaidpharmacy',
                'chevron/': 'chevron',
                'forever 21': 'forever21',
                'dollar tr': 'dollartree',
                "claire's": 'claires',
                'dairy queen': 'dairyqueen',
                "sq \*tomy's": 'tomys', # needs a \ before *
                'qt ': 'quiktrip',
                'microsoft ': 'microsoft',
                'ulta.com': 'ultabeauty', 'ulta #': 'ultabeauty',
                'playstation network': 'playstation',
                'barnes an': 'barnesandnoble', 'barnes & noble': 'barnesandnoble', 'barnesnob': 'barnesandnoble',
                'burger king': 'burgerking',
                'riotgam\*': 'riotgames',
                'michaels stores': 'michaels',
                'sephora': 'sephora',
                'five guys': 'fiveguys', '5guys': 'fiveguys',
                'five below': 'fivebelow',
                'bath and body works': 'bathandbodyworks', 'bath & body works' : 'bathandbodyworks',
                'shake shack': 'shakeshack',
                'chopt': 'chopt',
                'urban-out': 'urbanoutfitters', 'urban out': 'urbanoutfitters',
                "domino's": 'dominos',
                'regal cinemas': 'regalcinemas', 'edwards':'regalcinemas',
                'circle k': 'circlek',
                'sweetgreen': 'sweetgreen',
                'wholefds': 'wholefoods',
                'coca cola': 'cocacola', 'coca-cola': 'cocacola',
                'nyctaxi': 'nyctaxi', 'nyc taxi': 'nyctaxi',
                'shell': 'shell',
                'pacsun': 'pacsun',
                'tjmaxx': 'tjmaxx', 't j maxx': 'tjmaxx', 'tj maxx': 'tjmaxx',
                'toys r us': 'toysrus',
                'lush us': 'lush', 'lush upper west': 'lush',
                'best buy': 'bestbuy',
                'steamgames.com': 'steam',
                'jamba juice': 'jambajuice',
                'jimmy johns': 'jimmyjohns'
               }
def mapping_dict_funct(df, col_origin, col_output, mapping_dict):
    for key, value in mapping_dict.items():
        df[col_output] = np.where(df[col_origin].str.contains(key), value, df[col_output])

# Modelling on Train

In [15]:
data_dummified3 = data_dummified[(data_dummified['mcc_6011'] != 1)]
data_dummified4 = data_dummified3[(data_dummified3['mcc_7523'] != 1)]

In [16]:
%%time
# Test/Train/Holdout split, saving off each df returned
X_train_full, y_train_full, X_test, y_test, X_train_wo_holdout, X_holdout, y_train_wo_holdout, y_holdout = \
train_test_holdout(data_dummified4) 

# can add input parameter for 80/20 split
#takes about 14 seconds to run

CPU times: user 3.02 s, sys: 3.31 s, total: 6.33 s
Wall time: 6.25 s


In [17]:
%%time
multinomial_train, mapped_brand_proba_train, probs_train, mapped_brand_predicted_train, score_train, mapped_brand_predicted1_train = multinomial_regression(X_train_wo_holdout, y_train_wo_holdout, X_holdout, y_holdout)

CPU times: user 1min 36s, sys: 3.51 s, total: 1min 39s
Wall time: 28.4 s


In [18]:
score_train

0.9525831564048125

In [16]:
output, correct_overall, correct_by_brand = multinomial_output_train(X_holdout, mapped_brand_predicted1_train, probs_train)

In [18]:
output.head(5)

Unnamed: 0,mapped_brand,mcc,merchant_string,network,mapped_brand_response,merchant_string1,probs,correct_flag
314731,starbucks,5814,starbucks store 17000,V,starbucks,[starbucks],1.0,1
314734,starbucks,5814,starbucks store 00885,V,starbucks,[starbucks],1.0,1
314737,starbucks,5814,starbucks store 47931,V,starbucks,[starbucks],1.0,1
314748,starbucks,5814,starbucks store 29856,V,starbucks,[starbucks],1.0,1
314750,starbucks,5814,starbucks store 21929,V,starbucks,[starbucks],1.0,1


In [21]:
len(output[(output['correct_flag'] == 1) & \
           (output['mcc'] !='6011') & \
           (output['mcc'] !='7523')])/len(output[(output['mcc'] !='6011') & (output['mcc'] !='7523')])

0.9418026969481902

# Modelling on Test

In [129]:
#y_train_full1 = y_train_full[(y_train_full['mcc'] !='6011') & (y_train_full['mcc'] !='7523')]
#y_train_full.head(5)

In [130]:
%%time
#THIS TAKES ABOUT 8 MINS TO RUN
y_test.fillna("Unknown", inplace = True)
#testing model on test using full train set
multinomial, mapped_brand_proba, probs, mapped_brand_predicted, score, mapped_brand_predicted1 = multinomial_regression(X_train_full, y_train_full, X_test, y_test)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


CPU times: user 3min 34s, sys: 1min 2s, total: 4min 37s
Wall time: 3min 8s


In [131]:
%%time
output = multinomial_output_test(X_test, mapped_brand_predicted1, probs)

CPU times: user 2.29 s, sys: 4.41 s, total: 6.7 s
Wall time: 4.98 s


In [132]:
new_merchant_col(output, 'merchant_string', 'merchant_string_dict') # so not cleaned
lowercase_col(output, 'merchant_string_dict') # dictionary below needs lowercase to work
output['mapped_brand_dict_3'] = ''

# Run dictionary functions on data
mapping_dict_funct(output, 'merchant_string_dict', 'mapped_brand_dict_3', mapping_dict)
mcc_dict_funct(output, 'mcc', 'mapped_brand_dict_3', mcc_dict)
# Create new column flags for predicted, correct, train/test, row count
# Can be used to summarize, if desired
output['predicted_flag'] = np.where(output['mapped_brand_dict_3'] != '',1,0)
output['equals_prediced_flag_1'] = np.where(output['mapped_brand_response'] == output['mapped_brand_dict_3'],1,0)

In [24]:
#test_model_output = output[(output['mcc'] !='6011') & (output['mcc'] !='7523') & (output['predicted_flag'] == 1)]

In [None]:
%%time
test_accuracy = test_model_output.mean()[['equals_prediced_flag_1']]
test_accuracy = test_model_output.mean()[['equals_predicted_flag']]
###TAKES 46 MINUTES TO RUN
####ACCURACY: 0.99127!!!!!!
#####Potentially faster ways to get accuracy
##1
#test_model_output['equals_prediced_flag_1'].mean()
##2
#incorrect_test_model_output = output[(output['mcc'] !='6011') & (output['mcc'] !='7523') & (output['predicted_flag'] != 1)]
#len(incorrect_test_model_output)/len(test_model_output)

In [28]:
test_accuracy

equals_prediced_flag_1    0.99127
dtype: float64

In [30]:
test_model_output[test_model_output['equals_predicted_flag_1'] == 0]
# ideas: remove ATM/parking, a number of best buy/burger kind wrong, maybe remove edwards or vend from dictionary, maybe stopword restaurant

In [140]:
#output[(output['mcc'] !='6011') & (output['mcc'] !='7523') & (output['predicted_flag'] != 1)].sample(30)

In [141]:
from sklearn.metrics import confusion_matrix

In [None]:
output[(output['mcc'] !='6011') & (output['mcc'] !='7523') & (output['predicted_flag'] == 1)].sample(5)

In [None]:
conf_mat = confusion_matrix(test_model_output['mapped_brand_dict_3'], test_model_output['mapped_brand_response'])
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=test_model_output['mapped_brand_response'].values, yticklabels=test_model_output['mapped_brand_dict_3'].values)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [143]:
test_model_output['mapped_brand_dict_3'].value_counts().head(20)

uber             24166
paypal           11552
mcdonalds         9155
starbucks         4566
dollargeneral     3947
walmart           3788
walgreens         3650
tacobell          2991
gamestop          2777
dunkindonuts      2743
cvs               2637
target            2629
dollartree        2410
wendys            2360
burgerking        2181
kroger            2023
sonic             1813
googleplay        1770
chickfila         1700
dominos           1508
Name: mapped_brand_dict_3, dtype: int64

In [None]:
pd.DataFrame(test_model_output['mapped_brand_response'].unique())

# Pickling

In [133]:
import pickle
import sys
def save_as_pickled_object(obj, filepath):
    max_bytes = 2**31 - 1
    bytes_out = pickle.dumps(obj)
    n_bytes = sys.getsizeof(bytes_out)
    with open(filepath, 'wb') as f_out:
        for idx in range(0, n_bytes, max_bytes):
            f_out.write(bytes_out[idx:idx+max_bytes])


save_as_pickled_object(multinomial,'final_ML_model.sav')

import os
def try_to_load_as_pickled_object_or_None(filepath):
            """
            This is a defensive way to write pickle.load, allowing for very large files on all platforms
            """
            max_bytes = 2**31 - 1

            input_size = os.path.getsize(filepath)
            bytes_in = bytearray(0)
            with open(filepath, 'rb') as f_in:
                for _ in range(0, input_size, max_bytes):
                    bytes_in += f_in.read(max_bytes)
            obj = pickle.loads(bytes_in)

            return obj

multinomial2 = try_to_load_as_pickled_object_or_None('final_ML_model.sav')

In [138]:
%%time
#len(multinomial2.predict(X_test))
#314152
#len(multinomial.predict(X_test))
#314152
sum(multinomial2.predict(X_test) != multinomial.predict(X_test))

CPU times: user 50.7 s, sys: 31.5 s, total: 1min 22s
Wall time: 1min 2s


0

In [None]:
#X_holdout1 = X_holdout.copy()
#X_holdout1['multinom_pickle'] = multinomial2.predict(X_holdout)
#X_holdout1['multinom'] = multinomial_train.predict(X_holdout)
#X_holdout1[X_holdout1['multinom_pickle'] != X_holdout1['multinom']]