# Train model

## Abstract

- I label my expenditure type such as 'Goods', 'Foods', 'Groceries'
- Manually labeling is difficult
- Use ML instead
- I chose [Extream Gradient Boosting](http://xgboost.readthedocs.io/en/latest/model.html)
    - it's faster to train yet its performance is great

# 1. Import library

- pandas for data frame
- numpy for data processing
- matplotlib for plotting
- sklearn for modelling
- xgboost for modelling

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing

import re
%matplotlib inline

# 2. Global variable

- add the previously generated summary file
    - call it 'train'
- add the newly generated expenditure lists
    - call it 'test'

In [57]:
data = {
    'train': 'data/data.csv',
    'test': 'data/Chase7899_Activity_20170211.CSV'    
}
dateparse = lambda x: pd.datetime.strptime(x, '%m/%d/%y')
train = pd.read_csv(data['train'], parse_dates=['Trans Date', 'Post Date'], date_parser=dateparse)
train.head(5)

Unnamed: 0,Type,Trans Date,Post Date,Description,Amount,Kind,Expense
0,Sale,2016-12-31,2017-01-01,PAYPAL *DIGIZONE,-2.17,Goods,1
1,Sale,2016-12-31,2017-01-01,SQ *COOL TEA BAR WAVERLY,-8.72,Foods,1
2,Sale,2016-12-30,2017-01-01,TARGET 00003202,-29.61,Goods,1
3,Sale,2016-12-30,2017-01-01,99 RANCH #1769,-47.59,Groceries,1
4,Sale,2016-12-30,2017-01-01,A&amp;A GAS,-23.62,Auto,1


# 3. Preprocessing

- Sum of categories have very few occurance
- Those are better to be dropped for performance

In [58]:
gb = train.groupby('Kind')
gb = gb.count()['Type']
#drop_quantile = gb.quantile(.1)
drop_index = gb[gb <= 5].index.tolist()
drop_index

['Apt',
 'Etc',
 'Flight',
 'Internet',
 'Membership',
 'Moving',
 'Music',
 'Parking',
 'Travel',
 'Unknown']

In [59]:
train = train[~train.Kind.isin(drop_index)]

# 4. Define helper methods

- text process and vectorize description
- one-hot encode dataframe

In [60]:
def preprocess_X(X:pd.DataFrame, processors=dict(), train=True):
    if not train and not processors:
        raise Exception("Processor should be given when train = False")
        
    def keep_only_ABCs_and_lower(text):
        except_letters = re.compile(r'(?![A-z]).')
        return except_letters.sub(" ", text.lower())

    def convert_to_single_space(text):
        multiple_whitespace = re.compile(r'\s+')
        assert isinstance(text, str), text
        return multiple_whitespace.sub(" ", text)

    def keep_only_length_greater_than_2(text):
        return " ".join(word for word in text.split(' ') if len(word) > 2)

    def text_process(text):
        text = keep_only_ABCs_and_lower(text)
        text = keep_only_length_greater_than_2(text)
        text = convert_to_single_space(text)
        return text
    
    scaler = processors.get('Scaler', preprocessing.StandardScaler())
    vectorizer = processors.get('Vectorizer', CountVectorizer())
    text_processor = processors.get('TextProcessor', text_process)
    
    features = ['Type', 'Description', 'Amount']
    assert all(feat in X.columns for feat in features), "X does not have correct headers"
    X_features = X[features]

    X_TYPE_ONE_HOT = pd.get_dummies(X_features['Type']) # Payment, Return, Sales \n 1, 0, 0 \n ....
    X_DESCRIPTION = X_features['Description'].apply(text_processor)
    if train:
        X_DESCRIPTION = vectorizer.fit_transform(X_DESCRIPTION).toarray()
    else:
        X_DESCRIPTION = vectorizer.transform(X_DESCRIPTION).toarray()
    
    X_AMOUNT = X_features['Amount'].values.reshape(-1, 1)
    
    X_processed = np.hstack((X_TYPE_ONE_HOT, X_DESCRIPTION, X_AMOUNT))
    
    if train:
        X_processed = scaler.fit_transform(X_processed)
    else:
        X_processed = scaler.transform(X_processed)
    
    processors = {
        'Scaler': scaler,
        'Vectorizer': vectorizer,
        'TextProcessor': text_processor
    }
    
    return X_processed, processors

def preprocess_Y(X:pd.DataFrame, name='Kind') -> (np.array, pd.Index):
    """ Encode str label to int
    """
    return pd.factorize(X[name])

In [61]:
X, processors = preprocess_X(train)
y, y_label = preprocess_Y(train)
X.shape, y.shape

((371, 242), (371,))

# 5. Train/Test Split

- there is no validation set because I'm going to ues a cross-validation in running-time

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [63]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((296, 242), (296,), (75, 242), (75,))

# 6. Model with XGBoost
- `RandomizedSearchCV`: to search the best hyperparameters
- I know test score is not great but it's good enough for me now
- train takes around 10 minutes

In [64]:
parameters = {'max_depth':[3,4,5], 'n_estimators':[100, 500, 1000], 'reg_lambda':[0.7, 1.0]}
gbm = xgb.XGBClassifier(objective='multi:softmax')
clf = RandomizedSearchCV(gbm, parameters)

In [65]:
clf.fit(X_train, y_train)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softmax', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'max_depth': [3, 4, 5], 'n_estimators': [100, 500, 1000], 'reg_lambda': [0.7, 1.0]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=0)

In [66]:
clf.score(X_test, y_test)

0.68000000000000005

# 7. Predict labels for test data

In [72]:
X_new, _ = preprocess_X(pd.read_csv(data['test']), processors, train=False)
y_new = clf.predict(X_new)
df = pd.read_csv(data['test'])
df['Kind'] = [y_label[y] for y in y_new]
df.head(10)

Unnamed: 0,Type,Trans Date,Post Date,Description,Amount,Kind
0,Sale,01/31/2017,02/02/2017,WALGREENS #6655,-22.8,Groceries
1,Sale,01/31/2017,02/01/2017,PAYPAL *AFFORDAMACL,-2.99,Goods
2,Return,01/30/2017,01/30/2017,Gilroy,4.48,Return
3,Sale,01/30/2017,01/31/2017,NAUTICA #73,-11.66,Foods
4,Return,01/30/2017,01/30/2017,Gilroy,4.48,Return
5,Sale,01/30/2017,01/31/2017,COSTCO GAS #0422,-25.0,Auto
6,Return,01/30/2017,01/30/2017,Gilroy,28.72,Goods
7,Sale,01/29/2017,01/31/2017,IN-N-OUT BURGER #100,-17.85,Foods
8,Sale,01/29/2017,01/31/2017,SKECHERS-USA #7,-73.22,Clothing
9,Sale,01/29/2017,01/30/2017,Gilroy,-24.25,Foods


# 8. Post processing

- Save to csv
- Fix incorrect labels manually using spreadsheet tools

In [None]:
assert 1==0 # Stop here: below it's a merge step

In [75]:
df.to_csv('data/data_processed.csv', index=False)

# 9. Merge step
- create a final processed ready-for-analysis file

In [85]:
df = pd.read_csv("data/data_processed.csv", parse_dates=['Trans Date', 'Post Date'], date_parser=dateparse)
df.head()

Unnamed: 0,Type,Trans Date,Post Date,Description,Amount,Kind
0,Sale,2017-01-31,2017-02-02,WALGREENS #6655,-22.8,Groceries
1,Sale,2017-01-31,2017-02-01,PAYPAL *AFFORDAMACL,-2.99,Goods
2,Return,2017-01-30,2017-01-30,Gilroy,4.48,Return
3,Sale,2017-01-30,2017-01-31,NAUTICA #73,-11.66,Clothing
4,Return,2017-01-30,2017-01-30,Gilroy,4.48,Return


In [86]:
col_names = ['Type', 'Trans Date', 'Post Date', 'Description', 'Amount', 'Kind']

In [88]:
merged = train[col_names].append(df, ignore_index=True)

In [90]:
merged.to_csv("data/final/merged_feb_2016.csv", index=False)