In [1]:
import os
import time
from collections import Counter
import logging
import argparse

# helper modules
from ml_utils import pickle_classifier, load_classifier, ExtractFeature, \
                     precision_recall_matrix, get_classifier_results
from nlp_helper import CleanTextTransformer, tokenize_text
from query_events import execute_query

import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


from scipy.sparse import coo_matrix, hstack

In [2]:
logname = 'log/event_classifier_log'
logging.basicConfig(filename=logname,
                    filemode='a',
                    format='%(asctime)s -  %(name)s - %(levelname)s - %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.DEBUG)
logger = logging.getLogger(__name__)


parser = argparse.ArgumentParser(description='This is the event classifier program.')
parser.add_argument('--level', help='Level of classification, \
        usage: --level primary', choices=["primary", "secondary", "tertiary"], required=True)
parser.add_argument('--retrain', help='Retrain classifier. usage: --retrain F',
                    choices=['T', 'F'], required=True)
parser.add_argument('--load_clf', help='Load existing classifier. \
        usage: --load_clf classifiers/SVM_06202017121413.pkl', nargs='+', required=False)
parser.add_argument('--event_ids', help='Enter events to classify. If blank \
        then query will fetch all training data. usage: --event_ids 998746 \
        33384956 114992', nargs='+', required=False)
#filepath = os.path.dirname(__file__)
CLASSIFIER_DIR = os.path.join('/Users/iman/code/event_classifier/model/pickles/classifiers/')

# Getting event text for classifier
QUERY_ALL = """

SELECT event_id
, p_class
, s_class
, t_class
, event_name as event_name
, event_type as event_type
, event_host as event_host
, event_subject as event_subject
, text_paper as event_text
, created
FROM (
    SELECT ce.event_id
    , CASE WHEN ce.p_class = 'skip' THEN 'other' ELSE ce.p_class END
    , CASE WHEN ce.s_class = 'skip' THEN 'other' ELSE ce.s_class END
    , CASE WHEN ce.t_class = 'skip' THEN 'other' ELSE ce.t_class END
    , e.name as event_name
    , e.type as event_type
    , e.host as event_host
    , e.subject as event_subject
    , listagg(TRIM(lower(cat.text))) as text_paper
    , ce.created_at as created
    FROM event_training_selections ce
    JOIN events e ON e.id = ce.event_id
    JOIN cards c ON c.event_id = ce.event_id
    JOIN card_sides cs ON cs.card_id = c.id
        AND cs.side_type_id = 0
    LEFT JOIN card_assets cat ON cat.card_side_id = cs.id
        AND cat.asset_type_id = 9
    WHERE ce.is_confirmed
    GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 10
    )
WHERE len(trim(event_name || ' ' || event_host || ' ' || event_subject ||
        ' ' || text_paper))
ORDER BY random()

"""

QUERY_EVENTS = """

SELECT event_id
, event_name as event_name
, event_type as event_type
, event_host as event_host
, event_subject as event_subject
, text_paper as event_text
, created
FROM (
    SELECT e.id as event_id
    , e.name as event_name
    , e.type as event_type
    , e.host as event_host
    , e.subject as event_subject
    , listagg(TRIM(lower(cat.text))) as text_paper
    , e.created_at as created
    FROM events e
    JOIN cards c ON c.event_id = e.id
    JOIN card_sides cs ON cs.card_id = c.id
        AND cs.side_type_id = 0
    LEFT JOIN card_assets cat ON cat.card_side_id = cs.id
        AND cat.asset_type_id = 9
    WHERE e.id LIKE {0}
    GROUP BY 1, 2, 3, 4, 5, 7
    )
WHERE len(trim(event_name || ' ' || event_host || ' ' || event_subject ||
        ' ' || text_paper))
ORDER BY random()

"""


In [3]:
def most_common(lst):
    """
    Using Counter find most common element in list
    Possible results:
        mc = [(val1,3)]
        mc = [(val1,2),(val2,1)]
        mc = [(val1,1),(val2,1),(val3,1)]
        ...
    No majority exists only when there is a tie, otherwise the first value of
    the list is the most common (because Counter sorts automatically in
    descending order by value)
    """

    # check if top two most common predictions are the same
    mc = Counter(lst).most_common(2)

    if len(mc) > 1 and mc[0][1] == mc[1][1]:
        return "no_majority"

    return mc[0][0]


def get_ensemble_prediction(results, classes):
    """
    if majority of classifiers choose same category, that's the winner.
    if majority does not exist, then select class with highest probability
    """

    print("Getting ensemble predictions...")
    num_clfs = len(results)

    # combine all classifier predictions and probabilities
    all_preds = np.array([v[0] for k, v in results.items()]).T
    all_probs = np.sum(np.array([v[1] for k, v in results.items()]), axis=0)
    all_probs_normalize = all_probs/num_clfs

    # the function most_common returns majority class or "no_majority"
    majority = np.array(list(map(most_common, all_preds)))
    no_majority_index = np.where(majority == 'no_majority')

    # for those where a majority doesn't exist, sum the probabilities for each
    # class
    no_majority_sum = all_probs_normalize[no_majority_index]

    # ensure no new probabilities were added that shouldn't be
    assert np.allclose(np.sum(no_majority_sum), len(no_majority_index[0])), (
           "probability sum is greater than expected for no_majority")

    # replace the "no_majority" samples with the class that resulted in the
    # largest probability
    majority[no_majority_index] = classes[np.argmax(no_majority_sum, axis=1)]

    return majority, np.max(all_probs_normalize, axis=1)

In [4]:
def load_classifier_list(clf):
    """
    Load a classifier
    Classifier is stored as list object
    Returns list of classifiers and their names
    """

    print("Loading classifier list...")
    # clean up input: remove "classifier/" and ".pkl"
    clf_id = clf[clf.find('/')+1:-4]

    clf_list, clf_names = [], []
    # this is a list of classifiers
    loaded_clf = load_classifier(CLASSIFIER_DIR + clf_id)
    for classifier in loaded_clf:
        clf_list.append(classifier)

        # get name via class structure
        clf_class = str(classifier.named_steps['clf'].__class__)

        # some basic cleaning of class name
        clf_name_indx = clf_class.find('.')
        clf_name = clf_class[clf_name_indx+1:-2]
        clf_names.append(clf_name)

    return clf_list, clf_names

In [5]:
def check_event_types(df):
    
    acceptable_event_types = np.array([
        'BasicAnnouncement', 'DatedAnnouncement', 'GreetingCard', 'LinkAway', 'RsvpEvent'])
    
    for i in df.event_type.values:
        assert (i in acceptable_event_types) == True, "Found event type that doesn't exist"

In [6]:
def check_primary_classes(y):
    
    acceptable_primary_classes = np.array([
        'birthday_celebration', 'greetings', 'organizations', 'other',
       'personal', 'wedding_related'])
    
    for i in y.primary.values:
        assert (i in acceptable_primary_classes) == True, "Found event type that doesn't exist"

In [7]:
def check_null(df):
    """
    Ensure no NULL values
    """
    
    assert np.sum(pd.isnull(df).values) == 0, "Some NULL values exist"

In [8]:
def clean_df(df):
    """
    Perform any necessary cleaning
    
    1) Remove other class types
    2) ensure no NULL values
    3) ensure event_types are corrects
    4) ensure primary classes are correct
    
    """
    
    df[~(df.s_class == 'other')].copy()
    
    
    check_null(df)
    check_event_types(df)
    check_primary_classes(y)
    
    return df
    

In [9]:
def get_X_and_y(df):
    """
    Input: dataframe based on query
    Output: X and y (type dataframe)
    """

    df = clean_df(df)
    
    X = pd.DataFrame([df.event_subject, df.event_text, df.event_type]).T
    X.columns = ['subject', 'text', 'event_type']

    y = pd.DataFrame([df.p_class, df.s_class, df.t_class]).T
    y.columns = ['primary', 'secondary', 'tertiary']

    assert X.shape[0] == y.shape[0], 'X and y must be of same dimension'

    
    return X, y

In [123]:
def save_secondary_features(feature_pipeline, level, id_num):
    """
    Save training features
    Need to create an id_num to know which classifier to load
    Loaded classifier must be the same as the feature id number
    """

    logger.info("Saving secondaryfitted feature pipeline id {}...".format(id_num))
    pickle_classifier(feature_pipeline, FEATURE_DIR + level + "_" + str(id_num))

    return id_num

In [12]:
def save_classifier(clf, level, id_num):
    """
    Save trained classifier
    """

    logger.info("Saving trained classifier id {}...".format(id_num))
    pickle_classifier(clf, CLASSIFIER_DIR + level + str(id_num))


In [13]:
# parameters for gridsearch
# using SVM currently
CLASSIFIER_PIPELINE = dict({
    'SVM': {
        'classifier': Pipeline([
            ("clf", SVC(probability=True)),
        ]),
        'params': {
            'clf__C': [1],
            'clf__kernel': ['linear']
        }
     },
})

In [14]:
def check_dimensions(X, subject_matrix, text_matrix, event_type_matrix, predicted_primary_matrix):
    
    assert (X.shape[1] == subject_matrix.shape[1] + text_matrix.shape[1] + \
            event_type_matrix.shape[1] + predicted_primary_matrix.shape[1]), \
    ("Number of x_train features doesn't match sum of component features")
    
    assert (X.shape[0] == subject_matrix.shape[0]),\
    ("Number of x_train samples doesn't match subject_matrix samples")

    assert (X.shape[0] == text_matrix.shape[0]), \
    ("Number of x_train samples doesn't match body text samples")
    
    assert (X.shape[0] == event_type_matrix.shape[0]),\
    ("Number of x_train samples doesn't match event_type_matrix samples")

    assert (X.shape[0] == predicted_primary_matrix.shape[0]),\
    ("Number of x_train samples doesn't match event_type_matrix samples")

    

In [15]:
def grid_search(X, y, gridsearch_pipeline):
    """
    Perform a Grid Search over the space of classifiers and their associated
    parameter space
    Inputs: X and y training sets
    Output: A list of the best classifiers from each classifier category
    """

    logger.info("starting Gridsearch...")

    best_classifiers = []
    names = []

    for v in gridsearch_pipeline.items():
        gs = GridSearchCV(v[1]['classifier'], v[1]['params'], verbose=2, cv=3, n_jobs=4)
        gs = gs.fit(X, y)
        names.append(v[0])
        logger.info("{} finished".format(v[0]))
        logger.info("Best scoring classifier: {}".format(gs.best_score_))
        best_classifiers.append(gs.best_estimator_)

    return best_classifiers, names

In [24]:
FEATURE_PIPELINE = dict({
    'subject_pipe':Pipeline([
                    ('cleanText', CleanTextTransformer()),
                    ('vectorizer', CountVectorizer(tokenizer=tokenize_text,ngram_range=(1, 1))),
                    ('tfidf', TfidfTransformer())
                    ]),
    'text_pipe': Pipeline([
                    ('cleanText', CleanTextTransformer()),
                    ('vectorizer', CountVectorizer(tokenizer=tokenize_text,ngram_range=(1, 1))),
                    ('tfidf', TfidfTransformer())
                    ]),
    'event_type_pipe': Pipeline([('vectorizer', CountVectorizer())]),
    'predicted_primary': Pipeline([('vectorizer', CountVectorizer())])
})

In [25]:
def get_secondary_training_features(X, y, pipes):
    """
    Combines training features
    Using fit_transform on the pipeline object for each feature
    Performs sparse matrix concatenation
    """
    
    print("Beginning pipeline fit_transform to training data...")
    
    subject_matrix = pipes['subject_pipe'].fit_transform(X.subject, y)
    text_matrix = pipes['text_pipe'].fit_transform(X.text, y)
    event_type_matrix = pipes['event_type_pipe'].fit_transform(X.event_type, y)
    predicted_primary_matrix = pipes['predicted_primary'].fit_transform(X.pred_primary_class, y)
    
    X = hstack([subject_matrix, text_matrix, event_type_matrix, predicted_primary_matrix])
    
    print("Completed fit_transform")
    
    check_dimensions(X, subject_matrix, text_matrix, event_type_matrix, predicted_primary_matrix)
    print("Training set dimension:", X.shape)
    
    return X, pipes

In [26]:
def get_secondary_testing_features(X, pipes):
    """
    Combines training features
    Key difference between this and training_features
    is that pipeline is transforming x_test not fit_transforming
    """
    
    print("Beginning transform of test set...")
    subject_matrix = pipes['subject_pipe'].transform(X.subject)
    text_matrix = pipes['text_pipe'].transform(X.text)
    event_type_matrix = pipes['event_type_pipe'].transform(X.event_type)
    predicted_primary_matrix = pipes['predicted_primary'].transform(X.pred_primary_class)
    
    X = hstack([subject_matrix, text_matrix, event_type_matrix, predicted_primary_matrix])
    
    print("Completed transform of test set.")
    
    check_dimensions(X, subject_matrix, text_matrix, event_type_matrix, predicted_primary_matrix)
    print("Testing set dimension:", X.shape)
    
    return X

In [37]:
def _check_prediction_dimensions(y_test, y_pred, y_score):
    """
    Ensure that prediction dimensions are correct
    """

    assert y_pred.shape[0] == y_test.shape[0], (
        "Ensure class prediction vector is same length as test set")
    assert y_score.shape[0] == y_test.shape[0], (
        "Ensure score prediction vector is same length as test set")


In [113]:
# load dataset
df = pd.read_pickle('pickles/primary_output/dataset_674.pkl')

In [114]:
df

Unnamed: 0,subject,text,event_type,true_primary_class,pred_primary_class,pred_primary_score,true_secondary_class,true_tertiary_class
16106,"Thank you, Noemi","dear noemi and family,\r\rit was so wonderful ...",GreetingCard,greetings,greetings,0.949226,everyday_greetings,thank_you
18656,Devin's First Birthday,devin is turning 1!\r\rplease join for a pool ...,RsvpEvent,birthday_celebration,birthday_celebration,0.961690,kids_birthday,1st_birthday_party
385,Alyse Bowser's Birthday Party,"alyse's \rbirthday \rparty\r\r\rsaturday, nove...",RsvpEvent,birthday_celebration,birthday_celebration,0.997493,general,general_birthday_party
2448,Nada's 8th Birthday,nada is turning 8!nada would like to invite yo...,RsvpEvent,birthday_celebration,birthday_celebration,0.989731,kids_birthday,tween_8_12_birthday_party
10250,To someone special,"dear husband,\ri love you still as if we’ve on...",GreetingCard,greetings,greetings,0.952382,everyday_greetings,birthday
15498,CSC Leasing Christmas Party,celebrate\rthe\rseason\rwith\rcsc leasing comp...,RsvpEvent,organizations,personal,0.592903,business_or_nonprofits,holiday_party
16988,Pie Breakfast 2016,you are invited\rto our annual pie breakfast!\...,RsvpEvent,personal,personal,0.610822,other_holiday_events,thanksgiving
18855,Bell by Alicia Bell & Autumn Hello,bell by alicia bell\rcustom shirt & shirtdress...,RsvpEvent,organizations,organizations,0.413422,business_or_nonprofits,marketing_event
26160,Wendy's 50th Birthday,"wendy's \r50th birthday\rsaturday, september 1...",RsvpEvent,birthday_celebration,birthday_celebration,0.976515,adult_birthday,adult_milestone_birthday_party
1103,Nicole's Birthday,let's celebrate \rnicole's \rbirthday in style...,RsvpEvent,birthday_celebration,birthday_celebration,0.972065,adult_birthday,other_adult_birthday_party


In [115]:
y = df['true_secondary_class']
X = df[['subject','text','event_type','pred_primary_class','pred_primary_score']]

In [116]:
import os
import time
from collections import Counter
import logging
import argparse

# helper modules
from ml_utils import pickle_classifier, load_classifier, ExtractFeature, \
                     precision_recall_matrix, get_classifier_results
from nlp_helper import CleanTextTransformer, tokenize_text
from query_events import execute_query

import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


from scipy.sparse import coo_matrix, hstack

In [117]:
id_num = 674

In [118]:
X.head()

Unnamed: 0,subject,text,event_type,pred_primary_class,pred_primary_score
16106,"Thank you, Noemi","dear noemi and family,\r\rit was so wonderful ...",GreetingCard,greetings,0.949226
18656,Devin's First Birthday,devin is turning 1!\r\rplease join for a pool ...,RsvpEvent,birthday_celebration,0.96169
385,Alyse Bowser's Birthday Party,"alyse's \rbirthday \rparty\r\r\rsaturday, nove...",RsvpEvent,birthday_celebration,0.997493
2448,Nada's 8th Birthday,nada is turning 8!nada would like to invite yo...,RsvpEvent,birthday_celebration,0.989731
10250,To someone special,"dear husband,\ri love you still as if we’ve on...",GreetingCard,greetings,0.952382


In [119]:
# split training, testing
x_train, x_test, y_train, y_test, idx1, idx2 = train_test_split(X, y, X.index, test_size=0.4)

In [120]:
x_train_matrix, feature_pipeline = get_secondary_training_features(x_train, y_train, FEATURE_PIPELINE)
x_test_matrix = get_secondary_testing_features(x_test, feature_pipeline)

Beginning pipeline fit_transform to training data...
Completed fit_transform
Training set dimension: (11104, 38529)
Beginning transform of test set...
Completed transform of test set.
Testing set dimension: (7404, 38529)


In [121]:
y_tertiary = df.true_tertiary_class[idx2]

In [124]:
save_secondary_features(feature_pipeline, 'secondary', id_num)

NameError: name 'FEATURE_DIR' is not defined

In [125]:
x_train_matrix

<11104x38529 sparse matrix of type '<class 'numpy.float64'>'
	with 269136 stored elements in COOrdinate format>

In [126]:
y_train.value_counts()

everyday_greetings                                   2464
kids_birthday                                        1674
adult_birthday                                        975
business_or_nonprofits                                972
adult_or_family_events                                963
celebrating_baby_kids_or_parents_to_be                625
parties_and_showers_in_honor_of_the_bride_&_groom     525
seasonal_holiday_cards                                437
seasonal_holiday_events                               398
other_holiday_cards                                   390
wedding_events_hosted_by_bride_groom_or_family        364
other_holiday_events                                  312
alumni_or_school_related                              249
save_the_date                                         228
foreign_language                                      190
wedding_related_other                                 147
general                                                72
announcements_

In [127]:
clf_list, clf_names = grid_search(x_train_matrix, y_train, CLASSIFIER_PIPELINE)

Fitting 3 folds for each of 1 candidates, totalling 3 fits




[CV] clf__C=1, clf__kernel=linear ....................................
[CV] clf__C=1, clf__kernel=linear ....................................
[CV] clf__C=1, clf__kernel=linear ....................................
[CV] ..................... clf__C=1, clf__kernel=linear, total=  55.8s
[CV] ..................... clf__C=1, clf__kernel=linear, total=  57.0s
[CV] ..................... clf__C=1, clf__kernel=linear, total=  57.4s


[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:  1.2min finished


In [128]:
results, classes = get_classifier_results(clf_list, clf_names, x_test_matrix, y_test)

Getting classifier results...


In [129]:
save_classifier(clf_list, 'secondary', id_num)

Saving classifier: /Users/iman/code/event_classifier/model/pickles/classifiers/secondary_classifier_674...


In [130]:
y_pred, y_score = get_ensemble_prediction(results, classes)
_check_prediction_dimensions(y_test, y_pred, y_score)

Getting ensemble predictions...


In [131]:
def _decorate_with_secondary(X, y_test, y_pred, y_score, y_tertiary):
    """
    Adds two columns to x_test
        predicted primary class
        predicted primary score
    Used as input to secondary classifier
    """

    # uses output of primary model
    X['true_secondary_class'] = y_test
    X['pred_secondary_class'] = y_pred
    X['pred_secondary_score'] = y_score
    X['true_tertiary_class'] = y_tertiary


    return X

In [133]:
y_tertiary = df.true_tertiary_class[idx2]

In [134]:
_decorate_with_secondary(x_test, y_test, y_pred, y_score, y_tertiary)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

Se

Unnamed: 0,subject,text,event_type,pred_primary_class,pred_primary_score,true_secondary_class,pred_secondary_class,pred_secondary_score,true_tertiary_class
4003,Wedding,"will you be my\rbridesmaid?\rdear lou, i could...",GreetingCard,wedding_related,0.813043,wedding_related_other,wedding_related_other,0.853052,other_wedding_related
38188,Merry Christmas!,"dear roland and susan,\ri hope you have a very...",GreetingCard,greetings,0.945836,seasonal_holiday_cards,seasonal_holiday_cards,0.968027,christmas
38913,Sebastian's 3rd Birthday,"sebastian's 3rd birthday\rsunday, january 29th...",RsvpEvent,birthday_celebration,0.967542,kids_birthday,kids_birthday,0.982516,little_kids_2_7_birthday_party
16224,Kay Lee Invitation,kay lee invites you for a glass of wine and to...,RsvpEvent,organizations,0.667299,business_or_nonprofits,business_or_nonprofits,0.821835,trunk_show_or_shopping_or_marketing_event
4693,Dawn Brown-Piatt's Wedding Shower,directions to leigh beauchamp's home:\r\rtruma...,RsvpEvent,wedding_related,0.869356,parties_and_showers_in_honor_of_the_bride_&_groom,parties_and_showers_in_honor_of_the_bride_&_groom,0.702263,wedding_shower
1958,Fredi's Birthday,rumour has it\r fredi prevost\ris turning 50...,RsvpEvent,birthday_celebration,0.996912,adult_birthday,adult_birthday,0.857333,other_adult_birthday_party
3541,Happy Birthday Little Lady!,have a slice (or two):\rit's your birthday. \r...,GreetingCard,greetings,0.990787,everyday_greetings,everyday_greetings,0.970410,birthday
9657,Halloween Fête,you are invited to a\rhalloween fete\rthe spoo...,RsvpEvent,personal,0.890758,other_holiday_events,other_holiday_events,0.907173,halloween
3991,Mischa and Amelia are turning 8,come and celebrate \ramelia and mischa turnin...,RsvpEvent,birthday_celebration,0.951438,kids_birthday,kids_birthday,0.995628,tween_8_12_birthday_party
40976,Katherine's Bridal Shower,join us for a bridal shower to celebrate\rkath...,RsvpEvent,wedding_related,0.956258,parties_and_showers_in_honor_of_the_bride_&_groom,parties_and_showers_in_honor_of_the_bride_&_groom,0.988883,wedding_shower


In [138]:
hierarchy.head()

Unnamed: 0,p_class,s_class,t_class
0,greetings,everyday_greetings,thank_you
1,personal,seasonal_holiday_events,christmas
2,wedding_related,parties_and_showers_in_honor_of_the_bride_&_groom,wedding_shower
3,greetings,everyday_greetings,birthday
4,greetings,other_holiday_cards,other_muslim_holiday


In [84]:
df = execute_query(QUERY_ALL)
hierarchy = df[['p_class', 's_class', 't_class']].drop_duplicates()

In [136]:
x_test.head()

Unnamed: 0,subject,text,event_type,pred_primary_class,pred_primary_score,true_secondary_class,pred_secondary_class,pred_secondary_score,true_tertiary_class
4003,Wedding,"will you be my\rbridesmaid?\rdear lou, i could...",GreetingCard,wedding_related,0.813043,wedding_related_other,wedding_related_other,0.853052,other_wedding_related
38188,Merry Christmas!,"dear roland and susan,\ri hope you have a very...",GreetingCard,greetings,0.945836,seasonal_holiday_cards,seasonal_holiday_cards,0.968027,christmas
38913,Sebastian's 3rd Birthday,"sebastian's 3rd birthday\rsunday, january 29th...",RsvpEvent,birthday_celebration,0.967542,kids_birthday,kids_birthday,0.982516,little_kids_2_7_birthday_party
16224,Kay Lee Invitation,kay lee invites you for a glass of wine and to...,RsvpEvent,organizations,0.667299,business_or_nonprofits,business_or_nonprofits,0.821835,trunk_show_or_shopping_or_marketing_event
4693,Dawn Brown-Piatt's Wedding Shower,directions to leigh beauchamp's home:\r\rtruma...,RsvpEvent,wedding_related,0.869356,parties_and_showers_in_honor_of_the_bride_&_groom,parties_and_showers_in_honor_of_the_bride_&_groom,0.702263,wedding_shower


In [151]:
def _enforce_hierarchy(row):
    
    primary, secondary, p_score, s_score = row['pred_primary_class'], row['pred_secondary_class'], row['pred_primary_score'], row['pred_secondary_score']
        
    expected_primary = np.unique(hierarchy[hierarchy.s_class == secondary].p_class)
    
    if expected_primary == primary:
        "Secondary hierarchy enforced"
        return primary, secondary
    else:
        if p_score > s_score:  # primary wins
            return primary, 'unknown'
        else:
            return expected_primary, secondary

In [156]:
def add_final_classes(X):
    """
    Enforce the hierarchy and add final classification
    """

    print("Adding final classifications to dataframe...")
    finals = X.apply(_enforce_hierarchy, axis=1)
    finals = np.array([np.array([i[0], i[1]]) for i in finals])

    X.loc[:, 'final_primary'] = finals[:, 0]
    X.loc[:, 'final_secondary'] = finals[:, 1]

    return X

In [157]:
final_x = add_final_classes(x_test)

Adding final classifications to dataframe...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [172]:
final_x

Unnamed: 0,subject,text,event_type,pred_primary_class,pred_primary_score,true_secondary_class,pred_secondary_class,pred_secondary_score,true_tertiary_class,final_primary,final_secondary
4003,Wedding,"will you be my\rbridesmaid?\rdear lou, i could...",GreetingCard,wedding_related,0.813043,wedding_related_other,wedding_related_other,0.853052,other_wedding_related,wedding_related,wedding_related_other
38188,Merry Christmas!,"dear roland and susan,\ri hope you have a very...",GreetingCard,greetings,0.945836,seasonal_holiday_cards,seasonal_holiday_cards,0.968027,christmas,greetings,seasonal_holiday_cards
38913,Sebastian's 3rd Birthday,"sebastian's 3rd birthday\rsunday, january 29th...",RsvpEvent,birthday_celebration,0.967542,kids_birthday,kids_birthday,0.982516,little_kids_2_7_birthday_party,birthday_celebration,kids_birthday
16224,Kay Lee Invitation,kay lee invites you for a glass of wine and to...,RsvpEvent,organizations,0.667299,business_or_nonprofits,business_or_nonprofits,0.821835,trunk_show_or_shopping_or_marketing_event,organizations,business_or_nonprofits
4693,Dawn Brown-Piatt's Wedding Shower,directions to leigh beauchamp's home:\r\rtruma...,RsvpEvent,wedding_related,0.869356,parties_and_showers_in_honor_of_the_bride_&_groom,parties_and_showers_in_honor_of_the_bride_&_groom,0.702263,wedding_shower,wedding_related,parties_and_showers_in_honor_of_the_bride_&_groom
1958,Fredi's Birthday,rumour has it\r fredi prevost\ris turning 50...,RsvpEvent,birthday_celebration,0.996912,adult_birthday,adult_birthday,0.857333,other_adult_birthday_party,birthday_celebration,adult_birthday
3541,Happy Birthday Little Lady!,have a slice (or two):\rit's your birthday. \r...,GreetingCard,greetings,0.990787,everyday_greetings,everyday_greetings,0.970410,birthday,greetings,everyday_greetings
9657,Halloween Fête,you are invited to a\rhalloween fete\rthe spoo...,RsvpEvent,personal,0.890758,other_holiday_events,other_holiday_events,0.907173,halloween,personal,other_holiday_events
3991,Mischa and Amelia are turning 8,come and celebrate \ramelia and mischa turnin...,RsvpEvent,birthday_celebration,0.951438,kids_birthday,kids_birthday,0.995628,tween_8_12_birthday_party,birthday_celebration,kids_birthday
40976,Katherine's Bridal Shower,join us for a bridal shower to celebrate\rkath...,RsvpEvent,wedding_related,0.956258,parties_and_showers_in_honor_of_the_bride_&_groom,parties_and_showers_in_honor_of_the_bride_&_groom,0.988883,wedding_shower,wedding_related,parties_and_showers_in_honor_of_the_bride_&_groom
