# Spending Classifier 

### Objective is to train a model that will be able to classify my credit card purchases into 4 classes : Food, Entertainment, Gas, & Other


Classes are mapped to integers as follows: 

- Food : 1 
- Entertainment : 2 
- Gas : 3 
- Other : 4 


In [101]:
import pandas as pd 



In [102]:
df = pd.read_excel('spendingdata.xlsx')


# Clean Data 


## Remove credit rows 


In [103]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226 entries, 0 to 225
Data columns (total 11 columns):
Transaction Date       226 non-null datetime64[ns]
 Posting Date          226 non-null datetime64[ns]
 Billing Amount        226 non-null float64
 Merchant              226 non-null object
 Merchant City         217 non-null object
 Merchant State        210 non-null object
 Merchant Zip          210 non-null float64
 Reference Number      226 non-null object
 Debit/Credit Flag     226 non-null object
 SICMCC Code           226 non-null int64
Class                  226 non-null object
dtypes: datetime64[ns](2), float64(2), int64(1), object(6)
memory usage: 19.5+ KB


In [104]:
df = df.drop(df[df[' Debit/Credit Flag '] == 'C'].index)

In [105]:
df

Unnamed: 0,Transaction Date,Posting Date,Billing Amount,Merchant,Merchant City,Merchant State,Merchant Zip,Reference Number,Debit/Credit Flag,SICMCC Code,Class
1,2017-03-01,2017-03-02,35.76,CUMBERLAND FA 91187210,CHELMSFORD,MA,1824.0,"""24231687061837008824001""",D,5542,G
2,2017-02-28,2017-03-02,18.54,PRESSED CAFE,NASHUA,NH,3062.0,"""24692167060000888528350""",D,5814,F
3,2017-02-27,2017-03-01,10.64,CAFE SERVICES - THE TA,NASHUA,NH,3062.0,"""24269797059500575945460""",D,5814,F
4,2017-02-25,2017-02-27,28.00,SENMONOROM RESTAURANT,LOWELL,MA,1851.0,"""24055237056400058000347""",D,5812,F
5,2017-02-25,2017-02-27,115.17,WAMESIT LANES - BOWLIN,TEWKSBURY,MA,1876.0,"""24269797057100636824796""",D,7933,E
6,2017-02-25,2017-02-27,17.74,Dig Inn Season Market - B,New York,NY,10001.0,"""24342857057700086431357""",D,5812,F
7,2017-02-24,2017-02-27,8.76,MCDONALD'S F11790,LOWELL,MA,1851.0,"""24427337055720055111502""",D,5814,F
8,2017-02-23,2017-02-24,12.66,UNCHARTED GALLERY,Lowell,MA,1852.0,"""24828247054001347856155""",D,5812,E
9,2017-02-23,2017-02-27,10.28,CAFE SERVICES - THE TA,NASHUA,NH,3062.0,"""24269797055500621300706""",D,5814,F
10,2017-02-22,2017-02-24,19.63,PRESSED CAFE,NASHUA,NH,3062.0,"""24692167054000130707461""",D,5814,F


In [106]:
df.columns

Index(['Transaction Date', ' Posting Date', ' Billing Amount', ' Merchant',
       ' Merchant City ', ' Merchant State ', ' Merchant Zip ',
       ' Reference Number ', ' Debit/Credit Flag ', ' SICMCC Code', 'Class'],
      dtype='object')

### Strip whitespace from column names 

In [107]:
df.columns = df.columns.str.strip()

### Delete  Unnecessary Columns 


In [108]:
df = df.drop('Reference Number',1)

In [109]:
df = df.drop('SICMCC Code',1)


In [110]:
df = df.drop('Merchant Zip',1)


In [111]:
df = df.drop('Debit/Credit Flag',1)


In [112]:
df = df.drop('Posting Date',1)

In [113]:
df = df.drop('Merchant City',1)
df = df.drop('Merchant State',1)

In [114]:
df['Merchant']

1         CUMBERLAND FA 91187210
2                   PRESSED CAFE
3         CAFE SERVICES - THE TA
4          SENMONOROM RESTAURANT
5         WAMESIT LANES - BOWLIN
6      Dig Inn Season Market - B
7              MCDONALD'S F11790
8              UNCHARTED GALLERY
9         CAFE SERVICES - THE TA
10                  PRESSED CAFE
11                EGYPTIAN GRILL
13              WALGREENS #11726
14                  PRESSED CAFE
15        EXXONMOBIL    97535595
16          WWW.ADVANCEAUTOPARTS
17         PHO 88 RESTAURANT INC
18                PAYPAL *NEIVSA
19               CHIPOTLE ONLINE
20              SHOWCASE CINEMAS
21           CVS/PHARMACY #01056
22           0628 PLANET FITNESS
23             BON CHON - LOWELL
24           PHO DA LAT STORE #1
25            VALENTINO'S MARKET
26                   EAT24 *MAZA
27        CAFE SERVICES - THE TA
28        BOSTON PARKING TICKETS
29        ICI*FEE BOSTON WEB PMT
30             NOODLES & CO 7903
31                   PACE ENERGY
          

### Clean Merchant Data 

In [115]:
clean_data = df['Merchant'].str.strip()

In [116]:
import re 

In [117]:
clean_data = df['Merchant'].map(lambda x: re.sub(r'[^a-zA-Z\s]','', x))

In [118]:
df['Merchant'] = clean_data

In [119]:
df

Unnamed: 0,Transaction Date,Billing Amount,Merchant,Class
1,2017-03-01,35.76,CUMBERLAND FA,G
2,2017-02-28,18.54,PRESSED CAFE,F
3,2017-02-27,10.64,CAFE SERVICES THE TA,F
4,2017-02-25,28.00,SENMONOROM RESTAURANT,F
5,2017-02-25,115.17,WAMESIT LANES BOWLIN,E
6,2017-02-25,17.74,Dig Inn Season Market B,F
7,2017-02-24,8.76,MCDONALDS F,F
8,2017-02-23,12.66,UNCHARTED GALLERY,E
9,2017-02-23,10.28,CAFE SERVICES THE TA,F
10,2017-02-22,19.63,PRESSED CAFE,F


### Convert Class to Integer 


In [120]:
def convert(s):
    class_map = {'F':1,'E':2,'G':3,'O':4}
    if s in class_map:
        return class_map[s]
    else:
        return 'O'
   

In [121]:
converted_class = df['Class'].map(convert)

                                           

In [122]:
df['Class'] = converted_class


### Feature Extraction, Bag of Words Bi-gram


In [123]:
from sklearn.feature_extraction.text import CountVectorizer 


In [124]:
# list of words that we want to exclude from vocabulary 
exclude_words =['the','llc','and']
# Only include words that have atleast 3 characters into our vocabulary 
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                     token_pattern=r'\b\w{3,}\b', min_df=1, stop_words=exclude_words)

In [125]:
counts = bigram_vectorizer.fit_transform(df['Merchant'])

In [126]:
counts

<216x298 sparse matrix of type '<class 'numpy.int64'>'
	with 644 stored elements in Compressed Sparse Row format>

In [127]:
counts.toarray().astype(int)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [128]:
names = bigram_vectorizer.get_feature_names()

names

In [129]:
names

['alewife',
 'amazon',
 'amazon mktplace',
 'amazoncom',
 'angelinas',
 'angelinas pizzeria',
 'apple',
 'apple store',
 'aramark',
 'aramark umass',
 'aramark uml',
 'auto',
 'auto body',
 'axscombowery',
 'axscombowery prsnts',
 'bamboo',
 'bar',
 'bar salem',
 'barbershop',
 'basket',
 'bath',
 'bath body',
 'bgood',
 'bgood nashua',
 'bistro',
 'bit',
 'bit bar',
 'body',
 'body works',
 'bon',
 'bon chon',
 'boston',
 'boston common',
 'boston parking',
 'boston web',
 'boston yummy',
 'bostonwoburn',
 'bowlin',
 'bread',
 'brook',
 'brook market',
 'brunswick',
 'brunswick zone',
 'burlington',
 'cafe',
 'cafe services',
 'care',
 'care success',
 'castles',
 'chargefee',
 'chargepurchases',
 'cheesecake',
 'cheesecake burlington',
 'chicken',
 'chicken rice',
 'chilis',
 'chilis lowell',
 'china',
 'china pearl',
 'china star',
 'chipotle',
 'chipotle online',
 'chon',
 'chon lowell',
 'cinemas',
 'cinemas lowell',
 'comics',
 'common',
 'common frog',
 'copley',
 'cuc',
 'cuc g

In [130]:
print (counts.shape)

(216, 298)


In [131]:
print (counts[0])

  (0, 72)	1


In [132]:
print (names[84])
print (names[113])
print (names[83])

egyptian
google
eatery


Counts contains is a numpy array that has num of occurences for each word in the vocab for each row, this is our data set that needs to be split and trained. 


In [133]:
import numpy as np 

In [134]:
train_counts = counts[:150]

In [135]:
test_counts = counts[150:]

In [136]:
# The class label we are predicting 
train_target = df['Class'][:150]

In [137]:
test_target = df['Class'][150:]

### Data has been split 70% train & 30% test 


In [138]:
### Train model using random forest 

In [139]:
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( train_counts, train_target)

In [140]:
train_counts_predict = forest.predict(train_counts)

In [141]:
from sklearn import metrics

In [142]:
print ("Accuracy: {0:.4f}".format(metrics.accuracy_score(train_target,train_counts_predict)))

Accuracy: 1.0000


In [143]:
test_counts_predict = forest.predict(test_counts)

In [144]:
print ("Accuracy: {0:.4f}".format(metrics.accuracy_score(test_target,test_counts_predict)))

Accuracy: 0.8333


In [145]:
test_counts_predict

array([4, 1, 1, 3, 1, 1, 2, 2, 4, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 4, 4, 4,
       1, 4, 1, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1])

In [146]:
result = zip(df['Merchant'][150:],test_counts_predict)
    

In [147]:
def numToClass(num):
    class_map = {1:'Food',2:"Entertainment",3:"Gas",4:'Other'}
    return class_map[num]

for merchant, prediction in result:
    print(merchant, numToClass(prediction))
    

RPS OF LOWELL ONSTREET ME Other
EGYPTIAN GRILL Food
ANGELINAS PIZZERIA AND SU Food
SPEEDWAY  Gas
MARKET BASKET  Food
SENMONOROM RESTAURANT Food
SHOWCASE CINEMAS Entertainment
SHOWCASE CINEMAS LOWELL Entertainment
RPS OF LOWELL ONSTREET ME Other
DUNKIN      Q Food
WENDYS  Food
DOMINOS  Food
EGYPTIAN GRILL Food
CHIPOTLE  Food
RPS OF LOWELL ONSTREET ME Other
NAYAX LLC  Food
BOSTON YUMMY LLC Food
GAINSBOROUGH GARAGE Food
BON CHON  LOWELL Food
BON CHON  LOWELL Food
INTEREST CHARGEPURCHASES Other
INTEREST CHARGEFEE Other
LATE PAYMENT FEE Other
CHIPOTLE  Food
TARGET         Other
EGYPTIAN GRILL Food
CVSPHARMACY  Other
GULF OIL  Gas
DUNKIN      Q Food
CAFE SERVICES  THE TA Food
CAFE SERVICES  THE TA Food
CAFE SERVICES  THE TA Food
CAFE SERVICES  THE TA Food
  BRUNSWICK ZONE  Food
  BRUNSWICK ZONE  Food
SPIT BROOK MARKET Food
PRESSED CAFE Food
MARKET BASKET  Food
TAVERN IN THE SQUARE Food
PHO DA LAT STORE  Food
LUKOIL Food
CHINA STAR Food
BGOOD NASHUA Food
PRESSED CAFE Food
CAFE SERVICES  THE T

In [148]:
df[df['Class'] == 1]

Unnamed: 0,Transaction Date,Billing Amount,Merchant,Class
2,2017-02-28,18.54,PRESSED CAFE,1
3,2017-02-27,10.64,CAFE SERVICES THE TA,1
4,2017-02-25,28.00,SENMONOROM RESTAURANT,1
6,2017-02-25,17.74,Dig Inn Season Market B,1
7,2017-02-24,8.76,MCDONALDS F,1
9,2017-02-23,10.28,CAFE SERVICES THE TA,1
10,2017-02-22,19.63,PRESSED CAFE,1
11,2017-02-21,10.50,EGYPTIAN GRILL,1
14,2017-02-21,14.45,PRESSED CAFE,1
17,2017-02-20,39.00,PHO RESTAURANT INC,1
