In [481]:
import os
import time
from collections import Counter
import logging
import argparse

# helper modules
from ml_utils import pickle_classifier, load_classifier, ExtractFeature, \
                     precision_recall_matrix, get_classifier_results
from nlp_helper import CleanTextTransformer, tokenize_text
from query_events import execute_query

import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


from scipy.sparse import coo_matrix, hstack

In [477]:
logname = 'log/event_classifier_log'
logging.basicConfig(filename=logname,
                    filemode='a',
                    format='%(asctime)s -  %(name)s - %(levelname)s - %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.DEBUG)
logger = logging.getLogger(__name__)


parser = argparse.ArgumentParser(description='This is the event classifier program.')
parser.add_argument('--level', help='Level of classification, \
        usage: --level primary', choices=["primary", "secondary", "tertiary"], required=True)
parser.add_argument('--retrain', help='Retrain classifier. usage: --retrain F',
                    choices=['T', 'F'], required=True)
parser.add_argument('--load_clf', help='Load existing classifier. \
        usage: --load_clf classifiers/SVM_06202017121413.pkl', nargs='+', required=False)
parser.add_argument('--event_ids', help='Enter events to classify. If blank \
        then query will fetch all training data. usage: --event_ids 998746 \
        33384956 114992', nargs='+', required=False)
#filepath = os.path.dirname(__file__)
CLASSIFIER_DIR = os.path.join('/Users/iman/code/event_classifier/model/classifiers/')

# Getting event text for classifier
QUERY_ALL = """

SELECT event_id
, p_class
, s_class
, t_class
, event_name as event_name
, event_type as event_type
, event_host as event_host
, event_subject as event_subject
, text_paper as event_text
, created
FROM (
    SELECT ce.event_id
    , CASE WHEN ce.p_class = 'skip' THEN 'other' ELSE ce.p_class END
    , CASE WHEN ce.s_class = 'skip' THEN 'other' ELSE ce.s_class END
    , CASE WHEN ce.t_class = 'skip' THEN 'other' ELSE ce.t_class END
    , e.name as event_name
    , e.type as event_type
    , e.host as event_host
    , e.subject as event_subject
    , listagg(TRIM(lower(cat.text))) as text_paper
    , ce.created_at as created
    FROM event_training_selections ce
    JOIN events e ON e.id = ce.event_id
    JOIN cards c ON c.event_id = ce.event_id
    JOIN card_sides cs ON cs.card_id = c.id
        AND cs.side_type_id = 0
    LEFT JOIN card_assets cat ON cat.card_side_id = cs.id
        AND cat.asset_type_id = 9
    WHERE ce.is_confirmed
    GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 10
    )
WHERE len(trim(event_name || ' ' || event_host || ' ' || event_subject ||
        ' ' || text_paper))
ORDER BY random()

"""

QUERY_EVENTS = """

SELECT event_id
, event_name as event_name
, event_type as event_type
, event_host as event_host
, event_subject as event_subject
, text_paper as event_text
, created
FROM (
    SELECT e.id as event_id
    , e.name as event_name
    , e.type as event_type
    , e.host as event_host
    , e.subject as event_subject
    , listagg(TRIM(lower(cat.text))) as text_paper
    , e.created_at as created
    FROM events e
    JOIN cards c ON c.event_id = e.id
    JOIN card_sides cs ON cs.card_id = c.id
        AND cs.side_type_id = 0
    LEFT JOIN card_assets cat ON cat.card_side_id = cs.id
        AND cat.asset_type_id = 9
    WHERE e.id LIKE {0}
    GROUP BY 1, 2, 3, 4, 5, 7
    )
WHERE len(trim(event_name || ' ' || event_host || ' ' || event_subject ||
        ' ' || text_paper))
ORDER BY random()

"""


In [569]:
def most_common(lst):
    """
    Using Counter find most common element in list
    Possible results:
        mc = [(val1,3)]
        mc = [(val1,2),(val2,1)]
        mc = [(val1,1),(val2,1),(val3,1)]
        ...
    No majority exists only when there is a tie, otherwise the first value of
    the list is the most common (because Counter sorts automatically in
    descending order by value)
    """

    # check if top two most common predictions are the same
    mc = Counter(lst).most_common(2)

    if len(mc) > 1 and mc[0][1] == mc[1][1]:
        return "no_majority"

    return mc[0][0]


def get_ensemble_prediction(results, classes):
    """
    if majority of classifiers choose same category, that's the winner.
    if majority does not exist, then select class with highest probability
    """

    print("Getting ensemble predictions...")
    num_clfs = len(results)

    # combine all classifier predictions and probabilities
    all_preds = np.array([v[0] for k, v in results.items()]).T
    all_probs = np.sum(np.array([v[1] for k, v in results.items()]), axis=0)
    all_probs_normalize = all_probs/num_clfs

    # the function most_common returns majority class or "no_majority"
    majority = np.array(list(map(most_common, all_preds)))
    no_majority_index = np.where(majority == 'no_majority')

    # for those where a majority doesn't exist, sum the probabilities for each
    # class
    no_majority_sum = all_probs_normalize[no_majority_index]

    # ensure no new probabilities were added that shouldn't be
    assert np.allclose(np.sum(no_majority_sum), len(no_majority_index[0])), (
           "probability sum is greater than expected for no_majority")

    # replace the "no_majority" samples with the class that resulted in the
    # largest probability
    majority[no_majority_index] = classes[np.argmax(no_majority_sum, axis=1)]

    return majority, np.max(all_probs_normalize, axis=1)

In [11]:
def load_classifier_list(clf):
    """
    Load a classifier
    Classifier is stored as list object
    Returns list of classifiers and their names
    """

    print("Loading classifier list...")
    # clean up input: remove "classifier/" and ".pkl"
    clf_id = clf[clf.find('/')+1:-4]

    clf_list, clf_names = [], []
    # this is a list of classifiers
    loaded_clf = load_classifier(CLASSIFIER_DIR + clf_id)
    for classifier in loaded_clf:
        clf_list.append(classifier)

        # get name via class structure
        clf_class = str(classifier.named_steps['clf'].__class__)

        # some basic cleaning of class name
        clf_name_indx = clf_class.find('.')
        clf_name = clf_class[clf_name_indx+1:-2]
        clf_names.append(clf_name)

    return clf_list, clf_names

In [None]:
def check_event_types(df):
    
    acceptable_event_types = np.array([
        'BasicAnnouncement', 'DatedAnnouncement', 'GreetingCard', 'LinkAway', 'RsvpEvent'])
    
    for i in df.event_type.values:
        assert (i in acceptable_event_types) == True, "Found event type that doesn't exist"

In [None]:
def check_primary_classes(y):
    
    acceptable_primary_classes = np.array([
        'birthday_celebration', 'greetings', 'organizations', 'other',
       'personal', 'wedding_related'])
    
    for i in y.primary.values:
        assert (i in acceptable_primary_classes) == True, "Found event type that doesn't exist"

In [None]:
def check_null(df):
    """
    Ensure no NULL values
    """
    
    assert np.sum(pd.isnull(df).values) == 0, "Some NULL values exist"

In [None]:
def clean_df(df):
    """
    Perform any necessary cleaning
    
    1) Remove other class types
    2) ensure no NULL values
    3) ensure event_types are corrects
    4) ensure primary classes are correct
    
    """
    
    df[~(df.s_class == 'other')].copy()
    
    
    check_null(df)
    check_event_types(df)
    check_primary_classes(y)
    
    return df
    

In [36]:
def get_X_and_y(df):
    """
    Input: dataframe based on query
    Output: X and y (type dataframe)
    """

    df = clean_df(df)
    
    X = pd.DataFrame([df.event_subject, df.event_text, df.event_type]).T
    X.columns = ['subject', 'text', 'event_type']

    y = pd.DataFrame([df.p_class, df.s_class, df.t_class]).T
    y.columns = ['primary', 'secondary', 'tertiary']

    assert X.shape[0] == y.shape[0], 'X and y must be of same dimension'

    
    return X, y

In [548]:
def get_primary_training_features(X, y, FEATURE_PIPELINE):
    """
    Combines training features
    Using fit_transform on the pipeline object for each feature
    Performs sparse matrix concatenation
    """
    
    print("Beginning pipeline fit_transform to training data...")
    
    subject_matrix = FEATURE_PIPELINE['subject_pipe'].fit_transform(X.subject, y)
    text_matrix = FEATURE_PIPELINE['text_pipe'].fit_transform(X.text, y)
    event_type_matrix = FEATURE_PIPELINE['event_type_pipe'].fit_transform(X.event_type, y)
    
    X = hstack([subject_matrix, text_matrix, event_type_matrix])
    
    print("Completed fit_transform")
    
    check_dimensions(X, subject_matrix, text_matrix, event_type_matrix)
    print("Training set dimension:", X.shape)
    
    return X, pipes
    

In [566]:
def save_features(feature_pipeline, level):
    """
    Save training features
    Need to create an id_num to know which classifier to load - must be the same as the feature id number
    """
    
    id_num = np.random.randint(1000)
    logger.info("Saving fitted feature pipeline id {}...".format(id_num))
    pickle_classifier(feature_pipeline, CLASSIFIER_DIR + level + "_pipeline_" + str(id_num))


In [568]:
def save_classifier(clf, level, id_num):
    """
    Save trained classifier
    """

    logger.info("Saving trained classifier id {}...".format(id_num))
    pickle_classifier(clf, CLASSIFIER_DIR + level + "_classifier_" + str(id_num))


In [554]:
def get_primary_testing_features(X, FEATURE_PIPELINE):
    """
    Combines training features
    Key difference between this and training_features
    is that pipeline is transforming x_test not fit_transforming
    """
    
    print("Beginning transform of test set...")
    subject_matrix = FEATURE_PIPELINE['subject_pipe'].transform(X.subject)
    text_matrix = FEATURE_PIPELINE['text_pipe'].transform(X.text)
    event_type_matrix = FEATURE_PIPELINE['event_type_pipe'].transform(X.event_type)
    
    X = hstack([subject_matrix, text_matrix, event_type_matrix])
    
    print("Completed transform of test set.")
    
    check_dimensions(X, subject_matrix, text_matrix, event_type_matrix)
    print("Testing set dimension:", X.shape)
    
    return X
 

In [541]:
FEATURE_PIPELINE = dict({
    'subject_pipe':Pipeline([
                    ('cleanText', CleanTextTransformer()),
                    ('vectorizer', CountVectorizer(tokenizer=tokenize_text,ngram_range=(1, 1))),
                    ('tfidf', TfidfTransformer())
                    ]),
    'text_pipe': Pipeline([
                    ('cleanText', CleanTextTransformer()),
                    ('vectorizer', CountVectorizer(tokenizer=tokenize_text,ngram_range=(1, 1))),
                    ('tfidf', TfidfTransformer())
                    ]),
    'event_type_pipe': Pipeline([('vectorizer', CountVectorizer())])
})

In [None]:
# parameters for gridsearch
# using SVM currently
CLASSIFIER_PIPELINE = dict({
    'SVM': {
        'classifier': Pipeline([
            ("clf", SVC(probability=True)),
        ]),
        'params': {
            'clf__C': [1],
            'clf__kernel': ['linear']
        }
     },
})

In [536]:
def check_dimensions(X, *features):
    
    assert (x_train_matrix.shape[1] == subject_matrix.shape[1] + text_matrix.shape[1] + event_type_matrix.shape[1]), \
    ("Number of x_train features doesn't match sum of component features")
    
    assert (x_train_matrix.shape[0] == subject_matrix.shape[0]),\
    ("Number of x_train samples doesn't match subject_matrix samples")

    assert (x_train_matrix.shape[0] == text_matrix.shape[0]), \
    ("Number of x_train samples doesn't match body text samples")
    
    assert (x_train_matrix.shape[0] == event_type_matrix.shape[0]),\
    ("Number of x_train samples doesn't match event_type_matrix samples")
    
    

In [485]:
def grid_search(X, y, gridsearch_pipeline):
    """
    Perform a Grid Search over the space of classifiers and their associated
    parameter space
    Inputs: X and y training sets
    Output: A list of the best classifiers from each classifier category
    """

    logger.info("starting Gridsearch...")

    best_classifiers = []
    names = []

    for v in gridsearch_pipeline.items():
        gs = GridSearchCV(v[1]['classifier'], v[1]['params'], verbose=2, cv=3, n_jobs=4)
        gs = gs.fit(X, y)
        names.append(v[0])
        logger.info("{} finished".format(v[0]))
        logger.info("Best scoring classifier: {}".format(gs.best_score_))
        best_classifiers.append(gs.best_estimator_)

    return best_classifiers, names

In [604]:
df = execute_query(QUERY_ALL, event_id=False)
hierarchy = df[['p_class','s_class','t_class']].drop_duplicates()

In [607]:
hierarchy

Unnamed: 0,p_class,s_class,t_class
0,organizations,business_or_nonprofits,formal_reception_dinner_or_party
1,personal,adult_or_family_events,graduation
2,birthday_celebration,adult_birthday,adult_milestone_birthday_party
3,greetings,seasonal_holiday_cards,holiday
4,greetings,other_holiday_cards,4th_of_july
5,greetings,everyday_greetings,love_or_just_because
6,personal,adult_or_family_events,general_party_food_and_drinks
7,greetings,everyday_greetings,anniversary
8,greetings,everyday_greetings,thank_you
10,organizations,business_or_nonprofits,reception_or_party


In [609]:
y_pred

array(['greetings', 'birthday_celebration', 'greetings', ..., 'greetings',
       'organizations', 'birthday_celebration'],
      dtype='<U20')

In [38]:
#df = execute_query(QUERY_ALL, event_id=False)

X, y = get_X_and_y(df)
print("Size of dataset:", X.shape)

Size of dataset: (46082, 3)


In [83]:
x_train, x_test, y_train, y_test, idx1, idx2 = train_test_split(X, y, X.index, test_size=0.4)

In [579]:
x_train_matrix, feature_pipeline = get_primary_training_features(x_train, y_train, FEATURE_PIPELINE)
x_test_matrix = get_primary_testing_features(x_test, feature_pipeline)
id_num = save_features(feature_pipeline, 'primary')

NameError: name 'feature_pipeline' is not defined

In [573]:
y_train_primary = y_train['primary']

In [574]:
clf_list, clf_names = grid_search(x_train_matrix, y_train_primary, CLASSIFIER_PIPELINE)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] clf__C=1, clf__kernel=linear ....................................
[CV] clf__C=1, clf__kernel=linear ....................................
[CV] clf__C=1, clf__kernel=linear ....................................
[CV] clf__C=3, clf__kernel=linear ....................................
[CV] ..................... clf__C=1, clf__kernel=linear, total= 4.9min
[CV] clf__C=3, clf__kernel=linear ....................................
[CV] ..................... clf__C=1, clf__kernel=linear, total= 4.9min
[CV] clf__C=3, clf__kernel=linear ....................................
[CV] ..................... clf__C=1, clf__kernel=linear, total= 4.9min


[Parallel(n_jobs=4)]: Done   3 out of   6 | elapsed:  5.6min remaining:  5.6min


[CV] ..................... clf__C=3, clf__kernel=linear, total= 5.1min
[CV] ..................... clf__C=3, clf__kernel=linear, total= 4.3min
[CV] ..................... clf__C=3, clf__kernel=linear, total= 4.3min


[Parallel(n_jobs=4)]: Done   6 out of   6 | elapsed: 10.6min finished


In [575]:
y_test_primary = y_test['primary']

In [577]:
results, classes = get_classifier_results(clf_list, clf_names, x_test_matrix, y_test_primary)

Getting classifier results...


In [580]:
save_classifier(clf_list, 'primary', 193)

Saving classifier: /Users/iman/code/event_classifier/model/classifiers/primary_classifier193...


In [584]:
y_pred, y_score = get_ensemble_prediction(results, classes)
check_prediction_dimensions(y_test_primary, y_pred, y_score)

Getting ensemble predictions...


In [583]:
def check_prediction_dimensions(y_test, y_pred, y_score):
    """
    Ensure that prediction dimensions are correct
    """
    
    assert y_pred.shape[0] ==  y_test.shape[0], ("Ensure class prediction vector is same length as test set")
    assert y_score.shape[0] ==  y_test.shape[0], ("Ensure score prediction vector is same length as test set")


In [585]:
# Calculate the accuracy of the model
logger.info("------------------------------------------------")
logger.info("Overall Accuracy Primary:{}".format(accuracy_score(y_test_primary, y_pred)))
print(precision_recall_matrix(y_test_primary, y_pred, classes))

         classification  precision    recall
0  birthday_celebration   0.978728  0.958557
1             greetings   0.941595  0.974131
2         organizations   0.816908  0.767748
3                 other   0.791822  0.678344
4              personal   0.833678  0.882754
5       wedding_related   0.940847  0.869873
