# Spending Classifier 

### Objective: train a model that classifies my credit card purchases into four classes: Food, Entertainment, Gas, and Other


Classes are mapped to integers as follows: 

- Food : 1 
- Entertainment : 2 
- Gas : 3 
- Other : 4 


In [28]:
import pandas as pd 



In [29]:
df = pd.read_excel('spendingdata.xlsx')


# Clean Data 


## Remove credit rows 


In [30]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226 entries, 0 to 225
Data columns (total 11 columns):
Transaction Date       226 non-null datetime64[ns]
 Posting Date          226 non-null datetime64[ns]
 Billing Amount        226 non-null float64
 Merchant              226 non-null object
 Merchant City         217 non-null object
 Merchant State        210 non-null object
 Merchant Zip          210 non-null float64
 Reference Number      226 non-null object
 Debit/Credit Flag     226 non-null object
 SICMCC Code           226 non-null int64
Class                  226 non-null object
dtypes: datetime64[ns](2), float64(2), int64(1), object(6)
memory usage: 14.2+ KB


In [31]:
# Flag the rows whose Debit/Credit value is 'C' (credits), then drop them by index label.
# Note: the column names still carry their surrounding spaces at this point.
is_credit = df[' Debit/Credit Flag '] == 'C'
df = df.drop(df[is_credit].index)

In [32]:
df

Unnamed: 0,Transaction Date,Posting Date,Billing Amount,Merchant,Merchant City,Merchant State,Merchant Zip,Reference Number,Debit/Credit Flag,SICMCC Code,Class
1,2017-03-01,2017-03-02,35.76,CUMBERLAND FA 91187210,CHELMSFORD,MA,1824.0,"""24231687061837008824001""",D,5542,G
2,2017-02-28,2017-03-02,18.54,PRESSED CAFE,NASHUA,NH,3062.0,"""24692167060000888528350""",D,5814,F
3,2017-02-27,2017-03-01,10.64,CAFE SERVICES - THE TA,NASHUA,NH,3062.0,"""24269797059500575945460""",D,5814,F
4,2017-02-25,2017-02-27,28.00,SENMONOROM RESTAURANT,LOWELL,MA,1851.0,"""24055237056400058000347""",D,5812,F
5,2017-02-25,2017-02-27,115.17,WAMESIT LANES - BOWLIN,TEWKSBURY,MA,1876.0,"""24269797057100636824796""",D,7933,E
6,2017-02-25,2017-02-27,17.74,Dig Inn Season Market - B,New York,NY,10001.0,"""24342857057700086431357""",D,5812,F
7,2017-02-24,2017-02-27,8.76,MCDONALD'S F11790,LOWELL,MA,1851.0,"""24427337055720055111502""",D,5814,F
8,2017-02-23,2017-02-24,12.66,UNCHARTED GALLERY,Lowell,MA,1852.0,"""24828247054001347856155""",D,5812,E
9,2017-02-23,2017-02-27,10.28,CAFE SERVICES - THE TA,NASHUA,NH,3062.0,"""24269797055500621300706""",D,5814,F
10,2017-02-22,2017-02-24,19.63,PRESSED CAFE,NASHUA,NH,3062.0,"""24692167054000130707461""",D,5814,F


In [33]:
df.columns

Index(['Transaction Date', ' Posting Date', ' Billing Amount', ' Merchant',
       ' Merchant City ', ' Merchant State ', ' Merchant Zip ',
       ' Reference Number ', ' Debit/Credit Flag ', ' SICMCC Code', 'Class'],
      dtype='object')

### Strip whitespace from column names 

In [34]:
# Column headers come in padded with spaces (e.g. ' Merchant City '); trim each one.
df.columns = [column.strip() for column in df.columns]

### Delete Unnecessary Columns


In [35]:
# Pass axis by keyword: the positional form drop(label, 1) is rejected by newer pandas.
df = df.drop('Reference Number', axis=1)

In [36]:
# Pass axis by keyword: the positional form drop(label, 1) is rejected by newer pandas.
df = df.drop('SICMCC Code', axis=1)


In [37]:
# Pass axis by keyword: the positional form drop(label, 1) is rejected by newer pandas.
df = df.drop('Merchant Zip', axis=1)


In [38]:
# Pass axis by keyword: the positional form drop(label, 1) is rejected by newer pandas.
df = df.drop('Debit/Credit Flag', axis=1)


In [39]:
# Pass axis by keyword: the positional form drop(label, 1) is rejected by newer pandas.
df = df.drop('Posting Date', axis=1)

In [40]:
# Drop both merchant location columns in one call; axis is passed by keyword because
# the positional form drop(label, 1) is rejected by newer pandas.
df = df.drop(['Merchant City', 'Merchant State'], axis=1)

In [41]:
df['Merchant']

1         CUMBERLAND FA 91187210
2                   PRESSED CAFE
3         CAFE SERVICES - THE TA
4          SENMONOROM RESTAURANT
5         WAMESIT LANES - BOWLIN
6      Dig Inn Season Market - B
7              MCDONALD'S F11790
8              UNCHARTED GALLERY
9         CAFE SERVICES - THE TA
10                  PRESSED CAFE
11                EGYPTIAN GRILL
13              WALGREENS #11726
14                  PRESSED CAFE
15        EXXONMOBIL    97535595
16          WWW.ADVANCEAUTOPARTS
17         PHO 88 RESTAURANT INC
18                PAYPAL *NEIVSA
19               CHIPOTLE ONLINE
20              SHOWCASE CINEMAS
21           CVS/PHARMACY #01056
22           0628 PLANET FITNESS
23             BON CHON - LOWELL
24           PHO DA LAT STORE #1
25            VALENTINO'S MARKET
26                   EAT24 *MAZA
27        CAFE SERVICES - THE TA
28        BOSTON PARKING TICKETS
29        ICI*FEE BOSTON WEB PMT
30             NOODLES & CO 7903
31                   PACE ENERGY
          

### Clean Merchant Data 

In [42]:
clean_data = df['Merchant'].str.strip()

In [43]:
import re 

In [44]:
# Keep only letters and whitespace (drops store numbers and punctuation), then strip.
# The strip has to happen after the substitution: removing trailing digits leaves
# dangling spaces, and this cell rebuilds clean_data from df['Merchant'], so an
# earlier standalone .str.strip() result would otherwise be discarded.
clean_data = df['Merchant'].map(lambda x: re.sub(r'[^a-zA-Z\s]', '', x)).str.strip()

In [45]:
df['Merchant'] = clean_data

In [46]:
df

Unnamed: 0,Transaction Date,Billing Amount,Merchant,Class
1,2017-03-01,35.76,CUMBERLAND FA,G
2,2017-02-28,18.54,PRESSED CAFE,F
3,2017-02-27,10.64,CAFE SERVICES THE TA,F
4,2017-02-25,28.00,SENMONOROM RESTAURANT,F
5,2017-02-25,115.17,WAMESIT LANES BOWLIN,E
6,2017-02-25,17.74,Dig Inn Season Market B,F
7,2017-02-24,8.76,MCDONALDS F,F
8,2017-02-23,12.66,UNCHARTED GALLERY,E
9,2017-02-23,10.28,CAFE SERVICES THE TA,F
10,2017-02-22,19.63,PRESSED CAFE,F


### Convert Class to Integer 


In [47]:
def convert(s):
    """Map a single-letter class label to its integer code.

    'F' (Food) -> 1, 'E' (Entertainment) -> 2, 'G' (Gas) -> 3, 'O' (Other) -> 4.

    Any label that is not one of those four keys is treated as Other and
    returns 4, so the result is always an integer.
    """
    class_map = {'F': 1, 'E': 2, 'G': 3, 'O': 4}
    # Fall back to the integer code for Other rather than the letter 'O',
    # which would leave a mix of ints and strings in the converted column.
    return class_map.get(s, class_map['O'])
   

In [48]:
converted_class = df['Class'].map(convert)

                                           

In [49]:
df['Class'] = converted_class


### Feature Extraction: Bag of Words (Unigrams and Bigrams)


In [53]:
from sklearn.feature_extraction.text import CountVectorizer 


In [54]:
# ngram_range=(1, 2) counts both single words and adjacent word pairs.
# The token pattern \b\w+\b also accepts one-character tokens (e.g. 'a', 'b').
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    token_pattern=r'\b\w+\b',
    min_df=1,
)

In [55]:
counts = bigram_vectorizer.fit_transform(df['Merchant'])

In [56]:
counts

<216x356 sparse matrix of type '<class 'numpy.int64'>'
	with 834 stored elements in Compressed Sparse Row format>

In [57]:
counts.toarray().astype(int)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [58]:
# get_feature_names() is gone from newer scikit-learn releases; use its replacement
# get_feature_names_out() when it exists and fall back to the old method otherwise.
# .tolist() keeps `names` a plain Python list of str in both cases.
if hasattr(bigram_vectorizer, 'get_feature_names_out'):
    names = bigram_vectorizer.get_feature_names_out().tolist()
else:
    names = bigram_vectorizer.get_feature_names()

names

In [59]:
names

['a',
 'a wrap',
 'alewife',
 'amazon',
 'amazon mktplace',
 'amazoncom',
 'and',
 'and rice',
 'and su',
 'angelinas',
 'angelinas pizzeria',
 'apple',
 'apple store',
 'aramark',
 'aramark umass',
 'aramark uml',
 'auto',
 'auto body',
 'axscombowery',
 'axscombowery prsnts',
 'b',
 'bamboo',
 'bar',
 'bar llc',
 'bar salem',
 'barbershop',
 'basket',
 'bath',
 'bath body',
 'bgood',
 'bgood nashua',
 'bistro',
 'bit',
 'bit bar',
 'body',
 'body works',
 'bon',
 'bon chon',
 'boston',
 'boston common',
 'boston parking',
 'boston web',
 'boston yummy',
 'bostonwoburn',
 'bowlin',
 'bread',
 'brook',
 'brook market',
 'brunswick',
 'brunswick zone',
 'burlington',
 'c',
 'c p',
 'cafe',
 'cafe services',
 'care',
 'care to',
 'castles',
 'castles llc',
 'chargefee',
 'chargepurchases',
 'cheesecake',
 'cheesecake burlington',
 'chicken',
 'chicken and',
 'chilis',
 'chilis lowell',
 'china',
 'china pearl',
 'china star',
 'chipotle',
 'chipotle online',
 'chon',
 'chon lowell',
 'ci