# Train a Model

In this script, I'm going to train a model that predicts which category each of my card transactions belongs to.

Some categories include:

* Goods
* Foods
* Groceries
* Coffee
* and more

## Dependencies

In [1]:
import pandas as pd
import re
import numpy as np

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer

## Datasets

There are two files

* My past credit card usage summary -> train
* This month credit card usage summary -> test

In [2]:
# Locations of the labelled history (train) and this month's statement (test).
data = {
    'train': 'data/final/merged_april_2017.csv',
    'test': 'data/chase_may_7899.CSV'
}


def dateparse(x):
    """Parse ISO-formatted (YYYY-MM-DD) date strings into datetimes."""
    # `pd.datetime` no longer exists in current pandas and `date_parser` is
    # deprecated, so the conversion is done with `pd.to_datetime` instead.
    return pd.to_datetime(x, format='%Y-%m-%d')


train = pd.read_csv(data['train'])
for date_col in ['Trans Date', 'Post Date']:
    train[date_col] = dateparse(train[date_col])
train.head(5)

Unnamed: 0,Type,Trans Date,Post Date,Description,Amount,Kind
0,Sale,2016-12-31,2017-01-01,PAYPAL *DIGIZONE,-2.17,Goods
1,Sale,2016-12-31,2017-01-01,SQ *COOL TEA BAR WAVERLY,-8.72,Foods
2,Sale,2016-12-30,2017-01-01,TARGET 00003202,-29.61,Goods
3,Sale,2016-12-30,2017-01-01,99 RANCH #1769,-47.59,Groceries
4,Sale,2016-12-30,2017-01-01,A&amp;A GAS,-23.62,Auto


In [3]:
# Load this month's statement; its dates are left as raw strings here.
test = pd.read_csv(filepath_or_buffer=data['test'])
test.head(n=5)

Unnamed: 0,Type,Trans Date,Post Date,Description,Amount
0,Sale,05/31/2017,06/02/2017,SAFEWAY STORE 00026062,-5.04
1,Sale,05/30/2017,05/31/2017,SICHUAN CHONG QING CUISIN,-30.4
2,Sale,05/30/2017,05/31/2017,ADIDAS ONLINE STORE,-43.4
3,Sale,05/29/2017,05/30/2017,Amazon.com,-57.42
4,Sale,05/28/2017,05/29/2017,ADIDAS ONLINE STORE,-43.5


## Preprocessing

- Some categories have very few occurrences
- Those are better dropped, since the model cannot learn them reliably

In [4]:
# Number of rows (non-null 'Type' values) per category.
gb = train.groupby('Kind')['Type'].count()

# Categories with five or fewer rows are marked for removal.
drop_index = gb.index[(gb <= 5).values].tolist()
drop_index

['Internet']

In [5]:
# Keep only the rows whose category is not in the rare list.
train = train.loc[~train['Kind'].isin(drop_index)]

In [6]:
# Preview the first five remaining rows.
train.head(n=5)

Unnamed: 0,Type,Trans Date,Post Date,Description,Amount,Kind
0,Sale,2016-12-31,2017-01-01,PAYPAL *DIGIZONE,-2.17,Goods
1,Sale,2016-12-31,2017-01-01,SQ *COOL TEA BAR WAVERLY,-8.72,Foods
2,Sale,2016-12-30,2017-01-01,TARGET 00003202,-29.61,Goods
3,Sale,2016-12-30,2017-01-01,99 RANCH #1769,-47.59,Groceries
4,Sale,2016-12-30,2017-01-01,A&amp;A GAS,-23.62,Auto


## Define helper methods

- text process and vectorize description
- one-hot encode dataframe

In [7]:
def preprocess_X(X:pd.DataFrame, processors=None, train=True):
    """Convert raw transaction rows into a scaled numeric feature matrix.

    The matrix is the column-wise concatenation of the one-hot encoded
    'Type' column, the bag-of-words counts of the cleaned 'Description'
    column, and the raw 'Amount' column; the whole matrix is then scaled.

    Args:
        X: DataFrame containing at least 'Type', 'Description' and 'Amount'.
        processors: Optional dict of processors keyed by 'Scaler',
            'Vectorizer', 'TextProcessor', 'LabelEncoder' and
            'OneHotEncoder'. Missing keys fall back to freshly created
            defaults. Must be non-empty when ``train`` is False.
        train: When True the processors are fitted on ``X`` before
            transforming; when False they are only applied.

    Returns:
        ``(X_processed, processors)`` where ``X_processed`` is a 2-D array
        and ``processors`` is a dict holding every processor that was used,
        suitable for passing back in with ``train=False``.

    Raises:
        ValueError: If ``train`` is False and no processors are given.
        TypeError: If the default text processor meets a non-string value.
    """
    # A `dict()` default would be one object shared by every call.
    processors = {} if processors is None else processors
    if not train and not processors:
        raise ValueError("Processor should be given when train = False")

    # The text is lower-cased first, so anything outside a-z is noise.
    # (The range `[A-z]` would also let `[ \ ] ^ _` and the backtick through.)
    non_letters = re.compile(r'[^a-z]')

    def text_process(text):
        """Lower-case, keep letters only, and drop words of 1-2 letters."""
        if not isinstance(text, str):
            raise TypeError(
                "Description must be a string, got {!r}".format(text))
        letters_only = non_letters.sub(" ", text.lower())
        return " ".join(
            word for word in letters_only.split() if len(word) > 2)

    def make_onehot_encoder():
        """Build a dense-output OneHotEncoder across scikit-learn versions."""
        try:
            return preprocessing.OneHotEncoder(sparse_output=False)
        except TypeError:
            # Older scikit-learn releases only know the `sparse` keyword.
            return preprocessing.OneHotEncoder(sparse=False)

    def get_processor(key, factory):
        """Return the supplied processor, building the default only if absent."""
        return processors[key] if key in processors else factory()

    scaler = get_processor('Scaler', lambda: preprocessing.StandardScaler())
    vectorizer = get_processor('Vectorizer', lambda: CountVectorizer())
    text_processor = get_processor('TextProcessor', lambda: text_process)
    label_encoder = get_processor(
        "LabelEncoder", lambda: preprocessing.LabelEncoder())
    onehot_encoder = get_processor("OneHotEncoder", make_onehot_encoder)

    features = ['Type', 'Description', 'Amount']
    assert all(feat in X.columns for feat in features), "X does not have correct headers"
    X_features = X[features]

    descriptions = X_features['Description'].apply(text_processor)

    if train:
        type_codes = label_encoder.fit_transform(X_features['Type']).reshape(-1, 1)
        X_TYPE_ONE_HOT = onehot_encoder.fit_transform(type_codes)
        X_DESCRIPTION = vectorizer.fit_transform(descriptions).toarray()
    else:
        type_codes = label_encoder.transform(X_features['Type']).reshape(-1, 1)
        X_TYPE_ONE_HOT = onehot_encoder.transform(type_codes)
        X_DESCRIPTION = vectorizer.transform(descriptions).toarray()

    X_AMOUNT = X_features['Amount'].values.reshape(-1, 1)
    X_processed = np.concatenate((X_TYPE_ONE_HOT, X_DESCRIPTION, X_AMOUNT), axis=1)

    if train:
        X_processed = scaler.fit_transform(X_processed)
    else:
        X_processed = scaler.transform(X_processed)

    processors = {
        'Scaler': scaler,
        'Vectorizer': vectorizer,
        'TextProcessor': text_processor,
        "LabelEncoder": label_encoder,
        "OneHotEncoder": onehot_encoder,
    }

    return X_processed, processors

def preprocess_Y(X:pd.DataFrame, name='Kind') -> (np.array, pd.Index):
    """Map the string labels in column ``name`` to integer codes.

    Returns a pair ``(codes, uniques)``: ``codes`` holds one integer per
    row and ``uniques`` holds the distinct labels, so that
    ``uniques[codes[i]]`` is the label of row ``i``.
    """
    label_column = X[name]
    codes, uniques = pd.factorize(label_column)
    return codes, uniques

In [8]:
# Fit the preprocessing on the training rows and encode the labels.
X, processors = preprocess_X(train)
y, y_label = preprocess_Y(train)
tuple(arr.shape for arr in (X, y))

((538, 301), (538,))

## Train/Test Split

- There is no validation set
- Instead, I'm going to use a K-Fold Cross Validation

In [9]:
# Hold out 20% of the labelled rows for the final score.
splits = train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = splits

In [10]:
# Shapes of the four split arrays, in train-then-test order.
tuple(arr.shape for arr in (X_train, y_train, X_test, y_test))

((430, 301), (430,), (108, 301), (108,))

## Modeling with Sklearn

- `GridSearchCV`: exhaustively searches the parameter grid for the best hyperparameters
- I know the test score is not great, but it's good enough for me for now
- Training takes around 10 minutes

In [11]:
# Every combination of these values is tried by the grid search.
parameters = dict(
    max_depth=[None, 3, 5, 10],
    min_samples_split=[5, 10, 50, 100],
    n_estimators=[100, 500, 50],
)
tree = RandomForestClassifier()
clf = GridSearchCV(estimator=tree, param_grid=parameters, n_jobs=8)

In [12]:
# Run the grid search on the training split.
clf.fit(X=X_train, y=y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=8,
       param_grid={'max_depth': [None, 3, 5, 10], 'min_samples_split': [5, 10, 50, 100], 'n_estimators': [100, 500, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [13]:
# Hyperparameter combination selected by the grid search.
clf.best_params_

{'max_depth': None, 'min_samples_split': 10, 'n_estimators': 500}

In [14]:
# Score the fitted search object on the held-out split.
clf.score(X=X_test, y=y_test)

0.80555555555555558

## Predict labels for test data

In [15]:
# Read this month's statement once and reuse the same frame for both the
# feature extraction and the output table.
df = pd.read_csv(data['test'])

# Apply the processors fitted on the training data; nothing is re-fitted.
X_new, _ = preprocess_X(df, processors, train=False)
y_new = clf.predict(X_new)

# Map the predicted integer codes back to their category names.
df['Kind'] = [y_label[code] for code in y_new]
df.head(10)

Unnamed: 0,Type,Trans Date,Post Date,Description,Amount,Kind
0,Sale,05/31/2017,06/02/2017,SAFEWAY STORE 00026062,-5.04,Groceries
1,Sale,05/30/2017,05/31/2017,SICHUAN CHONG QING CUISIN,-30.4,Foods
2,Sale,05/30/2017,05/31/2017,ADIDAS ONLINE STORE,-43.4,Goods
3,Sale,05/29/2017,05/30/2017,Amazon.com,-57.42,Goods
4,Sale,05/28/2017,05/29/2017,ADIDAS ONLINE STORE,-43.5,Goods
5,Sale,05/28/2017,05/30/2017,STARBUCKS STORE 06677,-3.45,Coffee
6,Sale,05/27/2017,05/29/2017,KUKJE SUPER MARKET,-53.03,Groceries
7,Sale,05/26/2017,05/28/2017,GEICO *AUTO,-68.2,Auto
8,Sale,05/25/2017,05/26/2017,UDACITY INC.,-800.0,Study
9,Sale,05/24/2017,05/26/2017,CCSF MTA PARKING METER P,-11.25,Auto


## Post processing

- Save to csv
- Fix incorrect labels manually using spreadsheet tools

In [16]:
# Deliberately fails so that running the notebook top to bottom halts here,
# before the save/merge cells below are executed.
assert 1==0 # Stop here: below it's a merge step

AssertionError: 

In [17]:
# Write the predictions out (without the index) for manual correction.
df.to_csv(path_or_buf='data/data_processed.csv', index=False)

## Merge step
- create a final processed ready-for-analysis file

In [18]:
def dateparse(x):
    """Parse US-formatted (MM/DD/YYYY) date strings into datetimes."""
    # `pd.datetime` no longer exists in current pandas and `date_parser` is
    # deprecated, so the conversion is done with `pd.to_datetime` instead.
    return pd.to_datetime(x, format='%m/%d/%Y')


df = pd.read_csv("data/data_processed.csv")
for date_col in ['Trans Date', 'Post Date']:
    df[date_col] = dateparse(df[date_col])
df.head()

Unnamed: 0,Type,Trans Date,Post Date,Description,Amount,Kind
0,Sale,2017-05-31,2017-06-02,SAFEWAY STORE 00026062,-5.04,Groceries
1,Sale,2017-05-30,2017-05-31,SICHUAN CHONG QING CUISIN,-30.4,Foods
2,Sale,2017-05-30,2017-05-31,ADIDAS ONLINE STORE,-43.4,Goods
3,Sale,2017-05-29,2017-05-30,Amazon.com,-57.42,Goods
4,Sale,2017-05-28,2017-05-29,ADIDAS ONLINE STORE,-43.5,Goods


In [19]:
# Columns carried over from the training data into the merged file.
col_names = [
    'Type',
    'Trans Date',
    'Post Date',
    'Description',
    'Amount',
    'Kind',
]

In [20]:
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the rows
# the same way and renumbers the index from zero.
merged = pd.concat([train[col_names], df], ignore_index=True)

In [21]:
# Write the combined history out (without the index) for next month's run.
merged.to_csv(path_or_buf="data/final/merged_may_2017.csv", index=False)