In [290]:
import os
import time
from collections import Counter
import logging
import argparse

# helper modules
from ml_utils import pickle_classifier, load_classifier, ExtractFeature, \
                     precision_recall_matrix, get_classifier_results
from nlp_helper import CleanTextTransformer, tokenize_text
from query_events import execute_query

import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


from scipy.sparse import coo_matrix, hstack

In [291]:
# Logging setup: records at DEBUG level and above are appended (filemode='a')
# to a file whose path is relative to the current working directory.
# NOTE(review): the 'log/' directory presumably has to exist beforehand — confirm.
logname = 'log/event_classifier_log'
logging.basicConfig(filename=logname,
                    filemode='a',
                    format='%(asctime)s -  %(name)s - %(levelname)s - %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.DEBUG)
logger = logging.getLogger(__name__)


# Command-line interface definition.
#   --level     required, one of primary / secondary / tertiary
#   --retrain   required, the literal string 'T' or 'F'
#   --load_clf  optional, one or more values (nargs='+')
#   --event_ids optional, one or more values (nargs='+')
# NOTE(review): parser.parse_args() is not called in the visible part of this
# notebook, so these options are only declared here.
parser = argparse.ArgumentParser(description='This is the event classifier program.')
parser.add_argument('--level', help='Level of classification, \
        usage: --level primary', choices=["primary", "secondary", "tertiary"], required=True)
parser.add_argument('--retrain', help='Retrain classifier. usage: --retrain F',
                    choices=['T', 'F'], required=True)
parser.add_argument('--load_clf', help='Load existing classifier. \
        usage: --load_clf classifiers/SVM_06202017121413.pkl', nargs='+', required=False)
parser.add_argument('--event_ids', help='Enter events to classify. If blank \
        then query will fetch all training data. usage: --event_ids 998746 \
        33384956 114992', nargs='+', required=False)
#filepath = os.path.dirname(__file__)
# Directories that pickled classifiers and fitted feature pipelines are written
# to / read from. os.path.join with a single argument returns it unchanged.
# NOTE(review): these are absolute, user-specific paths; the notebook will not
# run on another machine without editing them.
CLASSIFIER_DIR = os.path.join('/Users/iman/code/event_classifier/model/pickles/classifiers/')
FEATURE_DIR = os.path.join('/Users/iman/code/event_classifier/model/pickles/feature_pipelines/')

# Getting event text for classifier
# SQL that returns one row per confirmed entry of event_training_selections:
# the three class labels (p_class / s_class / t_class, with 'skip' rewritten to
# 'other'), event name/type/host/subject, the lower-cased, trimmed card text
# aggregated with listagg, and the selection's creation timestamp.
# Rows come back in random order (ORDER BY random()).
# NOTE(review): the CASE expressions in the inner SELECT have no alias while the
# outer SELECT refers to p_class / s_class / t_class — confirm the target
# database names those columns as expected.
# NOTE(review): the outer WHERE uses a bare len(...) as its predicate — confirm
# the SQL dialect accepts a numeric condition here.
QUERY_ALL = """

SELECT event_id
, p_class
, s_class
, t_class
, event_name as event_name
, event_type as event_type
, event_host as event_host
, event_subject as event_subject
, text_paper as event_text
, created
FROM (
    SELECT ce.event_id
    , CASE WHEN ce.p_class = 'skip' THEN 'other' ELSE ce.p_class END
    , CASE WHEN ce.s_class = 'skip' THEN 'other' ELSE ce.s_class END
    , CASE WHEN ce.t_class = 'skip' THEN 'other' ELSE ce.t_class END
    , e.name as event_name
    , e.type as event_type
    , e.host as event_host
    , e.subject as event_subject
    , listagg(TRIM(lower(cat.text))) as text_paper
    , ce.created_at as created
    FROM event_training_selections ce
    JOIN events e ON e.id = ce.event_id
    JOIN cards c ON c.event_id = ce.event_id
    JOIN card_sides cs ON cs.card_id = c.id
        AND cs.side_type_id = 0
    LEFT JOIN card_assets cat ON cat.card_side_id = cs.id
        AND cat.asset_type_id = 9
    WHERE ce.is_confirmed
    GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 10
    )
WHERE len(trim(event_name || ' ' || event_host || ' ' || event_subject ||
        ' ' || text_paper))
ORDER BY random()

"""

# SQL template that returns the same text/metadata columns as QUERY_ALL but
# without class labels, reading straight from the events table and filtering
# on e.id.
# NOTE(review): `{0}` is a placeholder, presumably filled with str.format before
# execution — verify against the caller. The value is spliced into the SQL text
# unquoted and unescaped, so it must never come from untrusted input; prefer a
# bound query parameter if the database driver supports one.
QUERY_EVENTS = """

SELECT event_id
, event_name as event_name
, event_type as event_type
, event_host as event_host
, event_subject as event_subject
, text_paper as event_text
, created
FROM (
    SELECT e.id as event_id
    , e.name as event_name
    , e.type as event_type
    , e.host as event_host
    , e.subject as event_subject
    , listagg(TRIM(lower(cat.text))) as text_paper
    , e.created_at as created
    FROM events e
    JOIN cards c ON c.event_id = e.id
    JOIN card_sides cs ON cs.card_id = c.id
        AND cs.side_type_id = 0
    LEFT JOIN card_assets cat ON cat.card_side_id = cs.id
        AND cat.asset_type_id = 9
    WHERE e.id LIKE {0}
    GROUP BY 1, 2, 3, 4, 5, 7
    )
WHERE len(trim(event_name || ' ' || event_host || ' ' || event_subject ||
        ' ' || text_paper))
ORDER BY random()

"""


In [30]:
def most_common(lst):
    """
    Return the single most frequent element of ``lst``.

    Only the two highest counts are inspected. When they are equal there is
    no unique winner and the sentinel string "no_majority" is returned.
    Examples of the ranking that is inspected and the resulting answer:
        [(val1, 3)]             -> val1
        [(val1, 2), (val2, 1)]  -> val1
        [(val1, 1), (val2, 1)]  -> "no_majority"
    """

    ranked = Counter(lst).most_common(2)
    leader, leader_count = ranked[0]

    # only one distinct value was seen, so it wins automatically
    if len(ranked) == 1:
        return leader

    runner_up_count = ranked[1][1]
    return "no_majority" if leader_count == runner_up_count else leader


def get_ensemble_prediction(results, classes):
    """
    Combine the outputs of several classifiers into one prediction per sample.

    If one class receives more votes than any other, that class wins.
    If the top two vote counts tie, the class with the highest probability
    averaged over all classifiers is selected instead.

    Inputs:
        results: mapping whose values are indexable; element 0 holds that
                 classifier's predicted labels and element 1 its class
                 probabilities (assumed to be shaped (n_samples, n_classes)
                 with columns ordered like ``classes`` — confirm against
                 get_classifier_results)
        classes: array of class labels, indexed by probability column
    Output:
        (labels, scores) where labels is a string array with one label per
        sample and scores is the highest averaged probability per sample
    """

    print("Getting ensemble predictions...")
    num_clfs = len(results)
    classes = np.asarray(classes)

    # combine all classifier predictions and probabilities
    all_preds = np.array([v[0] for v in results.values()]).T
    all_probs = np.sum(np.array([v[1] for v in results.values()]), axis=0)
    all_probs_normalize = all_probs/num_clfs

    # the function most_common returns majority class or "no_majority".
    # dtype=object is deliberate: a fixed-width numpy string array would
    # silently truncate any tie-break label assigned below that is longer
    # than the longest label already present in the array.
    majority = np.array([most_common(row) for row in all_preds], dtype=object)
    no_majority_index = np.where(majority == 'no_majority')

    # for those where a majority doesn't exist, take the averaged
    # probabilities for each class
    no_majority_sum = all_probs_normalize[no_majority_index]

    # ensure no new probabilities were added that shouldn't be
    # (each selected row of averaged probabilities must sum to 1)
    assert np.allclose(np.sum(no_majority_sum), len(no_majority_index[0])), (
           "probability sum is greater than expected for no_majority")

    # replace the "no_majority" samples with the class that resulted in the
    # largest probability
    majority[no_majority_index] = classes[np.argmax(no_majority_sum, axis=1)]

    # hand back a string array, as before, now sized to the longest label
    return majority.astype(str), np.max(all_probs_normalize, axis=1)

In [29]:
def load_classifier_list(clf):
    """
    Load a pickled list of classifiers.

    Inputs:
        clf: identifier such as "classifiers/SVM_06202017121413.pkl"; the
             text up to the first "/" and a trailing ".pkl" are dropped and
             the remainder is appended to CLASSIFIER_DIR
    Output:
        (clf_list, clf_names): the loaded classifiers and a name for each,
        derived from the class of the pipeline step called 'clf'

    NOTE(review): load_classifier presumably unpickles the file — only load
    files from a trusted location.
    """

    print("Loading classifier list...")
    # clean up input: remove "classifier/" and ".pkl"
    clf_id = clf[clf.find('/')+1:]
    # Strip the extension only when it is really there; slicing off the last
    # four characters unconditionally mangled ids passed without ".pkl".
    if clf_id.endswith('.pkl'):
        clf_id = clf_id[:-len('.pkl')]

    clf_list, clf_names = [], []
    # this is a list of classifiers
    loaded_clf = load_classifier(CLASSIFIER_DIR + clf_id)
    for classifier in loaded_clf:
        clf_list.append(classifier)

        # get name via class structure, e.g. "<class 'pkg.module.Name'>"
        clf_class = str(classifier.named_steps['clf'].__class__)

        # some basic cleaning of class name: keep what follows the first "."
        # and drop the closing "'>"
        clf_name_indx = clf_class.find('.')
        clf_names.append(clf_class[clf_name_indx+1:-2])

    return clf_list, clf_names

In [27]:
def save_tertiary_features(feature_pipeline, level, id_num):
    """
    Persist the fitted feature pipeline.

    The file name is built from FEATURE_DIR, ``level``, an underscore and
    ``id_num``. The same id_num has to be used for the classifier that is
    trained on these features so that both can be loaded together later.
    Returns id_num.
    """

    message = "Saving tertiary fitted feature pipeline id {}...".format(id_num)
    logger.info(message)

    destination = ''.join((FEATURE_DIR, level, "_", str(id_num)))
    pickle_classifier(feature_pipeline, destination)

    return id_num

In [28]:
def save_classifier(clf, level, id_num):
    """
    Persist a trained classifier under CLASSIFIER_DIR.

    The file name is ``level`` immediately followed by ``id_num`` (no
    separator is inserted between them).
    """

    message = "Saving trained classifier id {}...".format(id_num)
    logger.info(message)

    destination = ''.join((CLASSIFIER_DIR, level, str(id_num)))
    pickle_classifier(clf, destination)


In [26]:
# Search space consumed by grid_search: one entry per classifier family, each
# with the pipeline to tune ('classifier') and its parameter grid ('params').
# Only an SVM is configured at the moment, with a single candidate
# (C=1, linear kernel); probability=True enables probability estimates.
CLASSIFIER_PIPELINE = {
    'SVM': dict(
        classifier=Pipeline(steps=[("clf", SVC(probability=True))]),
        params={'clf__C': [1], 'clf__kernel': ['linear']},
    ),
}

In [69]:
def check_tertiary_dimensions(X, subject_matrix, text_matrix, event_type_matrix, predicted_primary_matrix, predicted_secondary_matrix):
    """
    Sanity-check a combined feature matrix against its component matrices.

    Inputs:
        X: the horizontally stacked feature matrix
        remaining arguments: the component matrices, in stacking order
    Raises:
        AssertionError if the column count of X differs from the summed
        column counts of the components, or if any component has a different
        number of rows than X. The message names the offending component.
    """

    # label used in the assertion message -> component matrix
    components = (
        ("subject_matrix", subject_matrix),
        ("body text", text_matrix),
        ("event_type_matrix", event_type_matrix),
        ("predicted_primary_matrix", predicted_primary_matrix),
        ("predicted_secondary_matrix", predicted_secondary_matrix),
    )

    expected_features = sum(matrix.shape[1] for _, matrix in components)
    assert X.shape[1] == expected_features, (
        "Number of x_train features doesn't match sum of component features")

    for label, matrix in components:
        assert X.shape[0] == matrix.shape[0], (
            "Number of x_train samples doesn't match {} samples".format(label))
    

In [23]:
def grid_search(X, y, gridsearch_pipeline):
    """
    Run a 3-fold cross-validated grid search for every entry of
    ``gridsearch_pipeline`` and keep the best estimator of each.

    Inputs:
        X, y: training features and labels
        gridsearch_pipeline: mapping of name -> {'classifier': estimator,
                             'params': parameter grid}
    Output:
        (best_classifiers, names): the best estimator per entry and the
        matching entry names, in iteration order
    """

    logger.info("starting Gridsearch...")

    names, best_classifiers = [], []

    for clf_name, spec in gridsearch_pipeline.items():
        search = GridSearchCV(spec['classifier'], spec['params'], verbose=2, cv=3, n_jobs=4)
        fitted = search.fit(X, y)

        names.append(clf_name)
        logger.info("{} finished".format(clf_name))
        # note: the value logged here is the best cross-validation score
        logger.info("Best scoring classifier: {}".format(fitted.best_score_))
        best_classifiers.append(fitted.best_estimator_)

    return best_classifiers, names

In [262]:
# One feature-extraction pipeline per input column. The two free-text columns
# (subject, text) are cleaned, turned into unigram counts and TF-IDF weighted;
# the categorical columns (event type and the labels predicted by the earlier
# stages) are only count-vectorized.
FEATURE_PIPELINE = {
    'subject_pipe': Pipeline(steps=[
        ('cleanText', CleanTextTransformer()),
        ('vectorizer', CountVectorizer(tokenizer=tokenize_text, ngram_range=(1, 1))),
        ('tfidf', TfidfTransformer()),
    ]),
    'text_pipe': Pipeline(steps=[
        ('cleanText', CleanTextTransformer()),
        ('vectorizer', CountVectorizer(tokenizer=tokenize_text, ngram_range=(1, 1))),
        ('tfidf', TfidfTransformer()),
    ]),
    'event_type_pipe': Pipeline(steps=[('vectorizer', CountVectorizer())]),
    'predicted_primary': Pipeline(steps=[('vectorizer', CountVectorizer())]),
    'predicted_secondary': Pipeline(steps=[('vectorizer', CountVectorizer())]),
}

In [263]:
def get_tertiary_training_features(X, y, pipes):
    """
    Build the training feature matrix.

    Each pipeline in ``pipes`` is fitted on (and applied to) its own column
    of ``X``; the resulting sparse matrices are concatenated column-wise.
    The pipelines are fitted in place, so the returned ``pipes`` is the same
    object that was passed in.
    Output: (combined feature matrix, fitted pipes)
    """

    print("Beginning pipeline fit_transform to training data...")

    # (pipeline key, attribute of X) in the column order of the final matrix
    sources = (
        ('subject_pipe', 'subject'),
        ('text_pipe', 'text'),
        ('event_type_pipe', 'event_type'),
        ('predicted_primary', 'final_primary'),
        ('predicted_secondary', 'final_secondary'),
    )
    blocks = [pipes[key].fit_transform(getattr(X, column), y)
              for key, column in sources]

    combined = hstack(blocks)

    print("Completed fit_transform")

    check_tertiary_dimensions(combined, *blocks)
    print("Training set dimension:", combined.shape)

    return combined, pipes

In [264]:
def get_tertiary_testing_features(X, pipes):
    """
    Build the testing feature matrix.

    Mirrors get_tertiary_training_features, except that the already fitted
    pipelines are only applied (transform) and never re-fitted, so the
    columns line up with those of the training matrix.
    Output: combined feature matrix
    """

    print("Beginning transform of test set...")

    # (pipeline key, attribute of X) in the column order of the final matrix
    sources = (
        ('subject_pipe', 'subject'),
        ('text_pipe', 'text'),
        ('event_type_pipe', 'event_type'),
        ('predicted_primary', 'final_primary'),
        ('predicted_secondary', 'final_secondary'),
    )
    blocks = [pipes[key].transform(getattr(X, column))
              for key, column in sources]

    combined = hstack(blocks)

    print("Completed transform of test set.")

    check_tertiary_dimensions(combined, *blocks)
    print("Testing set dimension:", combined.shape)

    return combined

In [265]:
def _check_prediction_dimensions(y_test, y_pred, y_score):
    """
    Ensure that prediction dimensions are correct
    """

    assert y_pred.shape[0] == y_test.shape[0], (
        "Ensure class prediction vector is same length as test set")
    assert y_score.shape[0] == y_test.shape[0], (
        "Ensure score prediction vector is same length as test set")


In [266]:
import os
import time
from collections import Counter
import logging
import argparse

# helper modules
from ml_utils import pickle_classifier, load_classifier, ExtractFeature, \
                     precision_recall_matrix, get_classifier_results
from nlp_helper import CleanTextTransformer, tokenize_text
from query_events import execute_query

import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


from scipy.sparse import coo_matrix, hstack

In [303]:
# load dataset
# NOTE(review): judging by its path this is the output of the secondary stage —
# confirm. Unpickling can execute arbitrary code; only read trusted files.
df = pd.read_pickle('pickles/secondary_output/dataset_367.pkl')

In [304]:
# Frequency of each final_primary label in the loaded dataset.
df.final_primary.value_counts()

personal                630
birthday_celebration    581
greetings               520
organizations           272
wedding_related         215
other                     4
Name: final_primary, dtype: int64

In [355]:
# Target: the tertiary label. Features: the raw text columns plus the labels
# and scores carried in the final_primary / final_secondary / pred_*_score
# columns of the dataset.
y = df['true_tertiary_class']
X = df[['subject','text','event_type','final_primary','final_secondary','pred_primary_score','pred_secondary_score']]

In [310]:
# Identifier passed to save_tertiary_features when the fitted pipeline is saved;
# it matches the number in the dataset file name loaded above.
id_num = 367

In [356]:
# split training, testing
# random_state pins the shuffle so that a fresh Restart & Run All reproduces
# the same 60/40 split, and therefore the same fitted vocabularies and scores.
x_train, x_test, y_train, y_test, idx1, idx2 = train_test_split(X, y, X.index, test_size=0.4, random_state=42)

In [357]:
# Fit the feature pipelines on the training rows, then apply the fitted
# pipelines to the test rows. FEATURE_PIPELINE is fitted in place, so
# feature_pipeline refers to that same (now fitted) object.
x_train_matrix, feature_pipeline = get_tertiary_training_features(x_train, y_train, FEATURE_PIPELINE)
x_test_matrix = get_tertiary_testing_features(x_test, feature_pipeline)

Beginning pipeline fit_transform to training data...
Completed fit_transform
Training set dimension: (1333, 8517)
Beginning transform of test set...
Completed transform of test set.
Testing set dimension: (889, 8517)


In [359]:
# Peek at the first training labels.
y_train.head()

6370    little_kids_2_7_birthday_party
4315    adult_milestone_birthday_party
517                 reception_or_party
9402                 wedding_thank_you
6041                  engagement_party
Name: true_tertiary_class, dtype: object

In [316]:
# Persist the fitted feature pipeline under the 'tertiary' level and id_num.
save_tertiary_features(feature_pipeline, 'tertiary', id_num)

Saving classifier: /Users/iman/code/event_classifier/model/pickles/feature_pipelines/tertiary_367...


367

In [360]:
# Train: grid-search every entry of CLASSIFIER_PIPELINE on the training matrix
# and keep the best estimator of each.
clf_list, clf_names = grid_search(x_train_matrix, y_train, CLASSIFIER_PIPELINE)

Fitting 3 folds for each of 1 candidates, totalling 3 fits




[CV] clf__C=1, clf__kernel=linear ....................................
[CV] clf__C=1, clf__kernel=linear ....................................
[CV] clf__C=1, clf__kernel=linear ....................................
[CV] ..................... clf__C=1, clf__kernel=linear, total=   2.6s
[CV] ..................... clf__C=1, clf__kernel=linear, total=   3.0s
[CV] ..................... clf__C=1, clf__kernel=linear, total=   3.0s


[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    3.4s finished


In [361]:
# Evaluate the trained classifiers on the test matrix.
# NOTE(review): get_classifier_results lives in ml_utils; from its use in
# get_ensemble_prediction, `results` presumably maps each classifier to
# (predictions, probabilities) and `classes` is the label array — confirm.
results, classes = get_classifier_results(clf_list, clf_names, x_test_matrix, y_test)

Getting classifier results...


In [129]:
#save_classifier(clf_list, 'tertiary', id_num)

Saving classifier: /Users/iman/code/event_classifier/model/pickles/classifiers/secondary_classifier_674...


In [362]:
# Collapse the per-classifier outputs into one label and one score per test
# sample, then check that both have one entry per test row.
y_pred, y_score = get_ensemble_prediction(results, classes)
_check_prediction_dimensions(y_test, y_pred, y_score)

Getting ensemble predictions...


In [363]:
def _decorate_with_tertiary(X, y_test, y_pred, y_score):
    """
    Adds two columns to x_test
        predicted primary class
        predicted primary score
    Used as input to secondary classifier
    """

    # uses output of primary model
    X['true_tertiary_class'] = y_test
    X['pred_tertiary_class'] = y_pred
    X['pred_tertiary_score'] = y_score

    return X

In [364]:
# Adds the true/predicted tertiary columns to x_test in place; the returned
# frame (the same object) is what gets displayed below.
_decorate_with_tertiary(x_test, y_test, y_pred, y_score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,subject,text,event_type,final_primary,final_secondary,pred_primary_score,pred_secondary_score,true_tertiary_class,pred_tertiary_class,pred_tertiary_score
264,Hillman Scholars Holiday Break Dinner Invite,jennifer & tom hillman\r invite you\rto an eve...,RsvpEvent,personal,seasonal_holiday_events,0.526091,0.266389,general_party_food_and_drinks,holiday_party,0.202715
11770,Steel & Lacquer Grand Opening,,RsvpEvent,organizations,business_or_nonprofits,0.720366,0.712480,grand_opening_or_launch,reception_or_party,0.189758
4204,Carolyn Kim's Baby Sprinkle,please join us\rfor a baby sprinkle\rin honor ...,RsvpEvent,personal,celebrating_baby_kids_or_parents_to_be,0.875800,0.711040,baby_shower_or_new_baby_related_event,baby_shower_or_new_baby_related_event,0.587541
1257,Andie's Tea for Two Birthday Party,tea for two\randie is turning two \r&\rwe’re h...,RsvpEvent,birthday_celebration,kids_birthday,0.942905,0.710637,little_kids_2_7_birthday_party,little_kids_2_7_birthday_party,0.627751
9388,Beth & Devon Berry Moving Celebration,save the date\rfor an intimate gathering \r ...,RsvpEvent,organizations,business_or_nonprofits,0.333730,0.687706,farewell_or_moving,reception_or_party,0.295725
3204,Elisha's Graduation Celebration,mr. and mrs. john tran\rinvite you to join the...,RsvpEvent,personal,adult_or_family_events,0.898763,0.708765,graduation,graduation,0.089326
5937,Summer Party,summer party!\rlet's get together for no speci...,RsvpEvent,personal,adult_or_family_events,0.619851,0.790516,general_party_food_and_drinks,general_party_food_and_drinks,0.324084
12020,Annual Fund Kick-Off Dinner - Chairs,"please join us\ron\rwednesday, november 11th\r...",RsvpEvent,organizations,business_or_nonprofits,0.744873,0.688900,fundraiser_or_charity_event,reception_or_party,0.368597
608,Linnea's Graduation Luncheon,"this fall, linnea will be attending smith coll...",RsvpEvent,personal,adult_or_family_events,0.711789,0.730994,graduation,graduation,0.130704
5648,Dinner party invite,john & shara\rinvite you to\r\rcocktails & din...,RsvpEvent,personal,adult_or_family_events,0.616328,0.600318,general_party_food_and_drinks,general_party_food_and_drinks,0.302011


In [365]:
# Display x_test to confirm the tertiary columns were added in place.
x_test

Unnamed: 0,subject,text,event_type,final_primary,final_secondary,pred_primary_score,pred_secondary_score,true_tertiary_class,pred_tertiary_class,pred_tertiary_score
264,Hillman Scholars Holiday Break Dinner Invite,jennifer & tom hillman\r invite you\rto an eve...,RsvpEvent,personal,seasonal_holiday_events,0.526091,0.266389,general_party_food_and_drinks,holiday_party,0.202715
11770,Steel & Lacquer Grand Opening,,RsvpEvent,organizations,business_or_nonprofits,0.720366,0.712480,grand_opening_or_launch,reception_or_party,0.189758
4204,Carolyn Kim's Baby Sprinkle,please join us\rfor a baby sprinkle\rin honor ...,RsvpEvent,personal,celebrating_baby_kids_or_parents_to_be,0.875800,0.711040,baby_shower_or_new_baby_related_event,baby_shower_or_new_baby_related_event,0.587541
1257,Andie's Tea for Two Birthday Party,tea for two\randie is turning two \r&\rwe’re h...,RsvpEvent,birthday_celebration,kids_birthday,0.942905,0.710637,little_kids_2_7_birthday_party,little_kids_2_7_birthday_party,0.627751
9388,Beth & Devon Berry Moving Celebration,save the date\rfor an intimate gathering \r ...,RsvpEvent,organizations,business_or_nonprofits,0.333730,0.687706,farewell_or_moving,reception_or_party,0.295725
3204,Elisha's Graduation Celebration,mr. and mrs. john tran\rinvite you to join the...,RsvpEvent,personal,adult_or_family_events,0.898763,0.708765,graduation,graduation,0.089326
5937,Summer Party,summer party!\rlet's get together for no speci...,RsvpEvent,personal,adult_or_family_events,0.619851,0.790516,general_party_food_and_drinks,general_party_food_and_drinks,0.324084
12020,Annual Fund Kick-Off Dinner - Chairs,"please join us\ron\rwednesday, november 11th\r...",RsvpEvent,organizations,business_or_nonprofits,0.744873,0.688900,fundraiser_or_charity_event,reception_or_party,0.368597
608,Linnea's Graduation Luncheon,"this fall, linnea will be attending smith coll...",RsvpEvent,personal,adult_or_family_events,0.711789,0.730994,graduation,graduation,0.130704
5648,Dinner party invite,john & shara\rinvite you to\r\rcocktails & din...,RsvpEvent,personal,adult_or_family_events,0.616328,0.600318,general_party_food_and_drinks,general_party_food_and_drinks,0.302011


In [332]:
# Class hierarchy table (p_class / s_class / t_class).
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with index_col=0 and parse_dates=True reproduces its defaults.
hierarchy = pd.read_csv('pickles/hierarchy.csv', index_col=0, parse_dates=True)

In [333]:
# First rows of the hierarchy table.
hierarchy.head()

Unnamed: 0,p_class,s_class,t_class
0,birthday_celebration,kids_birthday,tween_8_12_birthday_party
1,birthday_celebration,adult_birthday,other_adult_birthday_party
2,organizations,business_or_nonprofits,reception_or_party
3,personal,adult_or_family_events,retirement
4,personal,adult_or_family_events,other_general_party


In [334]:
# Inspect the first 50 test rows before the hierarchy is enforced.
x_test.head(50)

Unnamed: 0,subject,text,event_type,final_primary,final_secondary,pred_primary_score,pred_secondary_score,pred_tertiary_class,pred_tertiary_score
11133,Wendy's Birthday Drinks,an invitation to\r\rwendy's birthday\r\rwednes...,RsvpEvent,birthday_celebration,adult_birthday,0.876633,0.666389,other_adult_birthday_party,0.441714
1953,2015 BPS Holiday Party,"since we all work so hard all year, \rlet's ge...",RsvpEvent,personal,seasonal_holiday_events,0.513731,0.639787,holiday,0.461749
1658,Inaugural Kentucky Derby Party,join me for a day at the races!\r\rmint juleps...,RsvpEvent,personal,adult_or_family_events,0.740655,0.599806,sports_or_viewing_party,0.104092
1831,Mason's First Birthday,join us for mason's\rvery first birthday!\rsun...,RsvpEvent,birthday_celebration,kids_birthday,0.95663,0.693456,little_kids_2_7_birthday_party,0.634613
8718,Global Entry Appointment Day for STC clients,global entry appointments\rfor sanders travel ...,RsvpEvent,organizations,business_or_nonprofits,0.522908,0.699406,reception_or_party,0.273134
2356,CLS Summer Happy Hour,community legal services\rinvites you to \r\rs...,RsvpEvent,organizations,business_or_nonprofits,0.594089,0.593582,reception_or_party,0.265692
5134,Wine Tasting with Integrity Wines,“the\rbest wines\rare the ones\rwe drink with\...,RsvpEvent,personal,adult_or_family_events,0.473305,0.68311,general_party_food_and_drinks,0.349371
7520,Heather Colman's Birthday Party,heather is 60\r\rlet's celebrate\r\r5pm 19th s...,RsvpEvent,birthday_celebration,kids_birthday,0.909117,0.482692,little_kids_2_7_birthday_party,0.211799
8531,Lucy's Dirty Do,upload,RsvpEvent,personal,adult_or_family_events,0.304685,0.308988,general_party_food_and_drinks,0.216216
4131,Ryan and Kenzie's Second Annual Derby Party,join us for a day at the races!\r\rmint juleps...,RsvpEvent,personal,adult_or_family_events,0.748257,0.394268,sports_or_viewing_party,0.143368


In [343]:
def _enforce_tertiary_hierarchy(row):
    
    primary, secondary, s_score, tertiary, t_score = row['final_primary'], row['final_secondary'], row['pred_secondary_score'], row['pred_tertiary_class'], row['pred_tertiary_score']
    print(secondary)
    expected_secondary = np.unique(hierarchy[hierarchy.t_class == tertiary].s_class)
            
    if secondary in expected_secondary:
        "tertiary hierarchy enforced"
        return secondary, tertiary

    else:
        
        if s_score > t_score:  # primary wins
            return secondary, 'unknown'
        else:
            if len(expected_secondary) == 2:
                return 'unknown', tertiary
            else:
                return expected_secondary[0], tertiary

In [344]:
def add_final_classes(X):
    """
    Reconcile every row's secondary and tertiary predictions with the class
    hierarchy and store the outcome on ``X``.

    'final_secondary' is overwritten and 'final_tertiary' is added; ``X`` is
    modified in place and also returned.
    """

    print("Adding final classifications to dataframe...")
    resolved = X.apply(_enforce_tertiary_hierarchy, axis=1)

    # one (secondary, tertiary) pair per row -> 2-column array
    pairs = np.array([np.array([pair[0], pair[1]]) for pair in resolved])
    secondary_col, tertiary_col = pairs[:, 0], pairs[:, 1]

    X.loc[:, 'final_secondary'] = secondary_col
    X.loc[:, 'final_tertiary'] = tertiary_col

    return X

In [345]:
# Enforce the hierarchy on the test predictions. add_final_classes mutates
# x_test in place, so final_x and x_test refer to the same DataFrame.
final_x = add_final_classes(x_test)

Adding final classifications to dataframe...
adult_birthday
seasonal_holiday_events
adult_or_family_events
kids_birthday
business_or_nonprofits
business_or_nonprofits
adult_or_family_events
kids_birthday
adult_or_family_events
adult_or_family_events
adult_birthday
everyday_greetings
everyday_greetings
parties_and_showers_in_honor_of_the_bride_&_groom
kids_birthday
business_or_nonprofits
everyday_greetings
seasonal_holiday_events
everyday_greetings
everyday_greetings
kids_birthday
kids_birthday
celebrating_baby_kids_or_parents_to_be
kids_birthday
business_or_nonprofits
everyday_greetings
business_or_nonprofits
adult_or_family_events
wedding_events_hosted_by_bride_groom_or_family
adult_or_family_events
celebrating_baby_kids_or_parents_to_be
kids_birthday
business_or_nonprofits
business_or_nonprofits
kids_birthday
parties_and_showers_in_honor_of_the_bride_&_groom
celebrating_baby_kids_or_parents_to_be
business_or_nonprofits
adult_birthday
parties_and_showers_in_honor_of_the_bride_&_groom


kids_birthday
business_or_nonprofits
kids_birthday
['adult_birthday']
adult_birthday
kids_birthday
adult_or_family_events
adult_or_family_events
kids_birthday
adult_birthday
adult_or_family_events
everyday_greetings
celebrating_baby_kids_or_parents_to_be
seasonal_holiday_cards
adult_birthday
adult_or_family_events
seasonal_holiday_cards
kids_birthday
kids_birthday
celebrating_baby_kids_or_parents_to_be
adult_or_family_events
parties_and_showers_in_honor_of_the_bride_&_groom
other_holiday_events
everyday_greetings
everyday_greetings
other_holiday_cards
kids_birthday
kids_birthday
adult_or_family_events
everyday_greetings
everyday_greetings
other_holiday_events
everyday_greetings
everyday_greetings
wedding_events_hosted_by_bride_groom_or_family
kids_birthday
parties_and_showers_in_honor_of_the_bride_&_groom
seasonal_holiday_cards
adult_or_family_events
everyday_greetings
everyday_greetings
adult_or_family_events
adult_or_family_events
seasonal_holiday_events
kids_birthday
everyday_greeti

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [354]:
# Per-class precision and recall of the ensemble predictions on the test set.
# Note this scores y_pred as produced by the ensemble, i.e. before
# add_final_classes applied the hierarchy.
precision_recall_matrix(y_test, y_pred, classes)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Unnamed: 0,classification,precision,recall
0,1st_birthday_party,0.941176,0.551724
1,4th_of_july,0.000000,0.000000
2,address_collection,0.000000,0.000000
3,adult_milestone_birthday_party,0.815789,0.704545
4,anniversary,0.000000,0.000000
5,apology,0.000000,0.000000
6,baby_shower_or_new_baby_related_event,0.944444,0.894737
7,bachelor_or_stag_party,0.000000,0.000000
8,bachelorette_or_hen_party,1.000000,0.666667
9,bar_or_bat_mitzvah,1.000000,0.750000
