# Hierarchical Classification


There are essentially two approaches to hierarchical classification:

In [None]:
%matplotlib inline
from pylab import rcParams
rcParams['figure.figsize'] = 20, 9

In [None]:
from IPython.display import Image

One is to ignore all taxonomy information and do a single multi-class (flat) classification:

In [None]:
Image(filename='./assets/Classification_flat.png')

The other method is to train a classifier at each branch. In that case we will need to build and train 3 different classifiers.

In [None]:
Image(filename='./assets/Classification.png') 

We will focus on building a simple hierarchical model based on this branching.

1. We will first build three classifiers and evaluate the error independently.
2. Then we will take a look at the error propagation.

In [None]:
from dao import DataAccess, LabelGetter

In [None]:
XX = DataAccess.get_as_dataframe()
XX.head()

# GridSearch for RandomForests and Logistic Regression

In [None]:
import pylab as plt
import pandas as pd

from scipy.stats import uniform, randint

from pipelines.alcohol import AlcoholPipeline

from sklearn.grid_search import RandomizedSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [None]:
def n_grams(start, end):
    """Generate candidate ``(low, high)`` n-gram ranges.

    Yields every pair with ``1 <= low <= start``, ``1 <= high <= end`` and
    ``low < high``, ordered by ``low`` and then by ``high``. Pairs with
    ``low == high`` are never produced.
    """
    for low in range(1, start + 1):
        # Starting at low + 1 enforces the strict low < high requirement.
        for high in range(low + 1, end + 1):
            yield (low, high)

In [None]:
# Wrap the loaded frame in LabelGetter and fetch the (X, y) pair returned by
# get_alcohol() -- presumably the features and labels of the alcohol task;
# confirm against the dao module.
data = LabelGetter(XX)
X, y = data.get_alcohol()
# Hold out 33% of the samples for testing; random_state=42 fixes the shuffle
# so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

### Logistic Regression

In [None]:
clf = AlcoholPipeline(global_features=["text"]).pipeline(LogisticRegression())#.set_params(**params)

In [None]:
# Search space handed to the randomized search for the logistic-regression
# pipeline. Keys use the ``step__param`` naming for nested pipeline steps.
params = {
# scipy's uniform(loc, scale) draws from [loc, loc + scale], i.e. C is
# sampled from [0.01, 1000.01].
'clf__C': uniform(0.01, 1000),
'features__text__tfidf__analyzer':['word', 'char'],
'features__text__tfidf__lowercase': [False, True],
'features__text__tfidf__max_features': list(range(10000, 100000, 1000)),
# Every (low, high) pair with low <= 3, high <= 14 and low < high.
'features__text__tfidf__ngram_range': list(n_grams(3, 14)),
# One-element list: the norm is effectively fixed to l2.
'features__text__tfidf__norm': ['l2']
}

In [None]:
lr_clf_rgs = RandomizedSearchCV(clf, params, n_iter=60, n_jobs=4, verbose=1, scoring="f1")

In [None]:
lr_clf_rgs.fit(X_train, y_train)

In [None]:
lr_clf_rgs.best_params_

In [None]:
lr_clf_rgs.best_score_

### Random Forest

In [None]:
clf = AlcoholPipeline(global_features=["text"]).pipeline(RandomForestClassifier(
        n_jobs=4
    ))

In [None]:
# Search space for the random-forest pipeline. Every value must be a list
# (sampled uniformly) or a scipy distribution exposing ``rvs``. A bare string
# would be indexed character by character by the sampler, so settings that
# should stay fixed are wrapped in one-element lists.
params = {
    # NOTE(review): the original cell left this value empty (a syntax error).
    # None vs. "auto" mirrors the class_weight used in create_roc; confirm
    # the intended candidates.
    'clf__class_weight': [None, 'auto'],
    'clf__criterion': ['gini'],
    'clf__max_depth': list(range(10, 400, 5)),
    'clf__max_features': ['auto'],
    'clf__min_samples_leaf': list(range(1, 5)),
    # NOTE(review): newer scikit-learn releases reject min_samples_split=1;
    # kept as in the original range.
    'clf__min_samples_split': list(range(1, 5)),
    'clf__n_estimators': list(range(100, 1000, 50)),
    'features__text__tfidf__analyzer': ['word', 'char'],
    'features__text__tfidf__lowercase': [False, True],
    'features__text__tfidf__max_features': list(range(10000, 100000, 1000)),
    # Every (low, high) pair with low <= 3, high <= 14 and low < high.
    'features__text__tfidf__ngram_range': list(n_grams(3, 14)),
    'features__text__tfidf__norm': ['l2'],
}

In [None]:
rf_clf_rgs = RandomizedSearchCV(clf, params, n_iter=60, n_jobs=4, verbose=1, scoring="f1")

## Checking Feature Importances using AUC

In [None]:
features = ["text", "time", "user", "age"]

In [None]:
def create_roc(features):
    """Fit a random-forest pipeline on ``features`` and plot its ROC curve.

    Builds an ``AlcoholPipeline`` restricted to the given feature groups,
    trains it on the module-level ``X_train``/``y_train`` and scores the
    module-level ``X_test``/``y_test``. The curve is added to the current
    pylab figure with a legend label made of the feature names joined by
    ``|`` plus the area under the curve.

    Args:
        features: list of feature-group names passed to ``AlcoholPipeline``
            as ``global_features``.

    Returns:
        None. The only effect is the line added to the current plot.
    """
    builder = AlcoholPipeline(global_features=features)
    forest = RandomForestClassifier(n_estimators=400, class_weight="auto")
    model = builder.pipeline(forest)
    # NOTE(review): tuned text parameters (``text_params``) were once meant to
    # be applied here when "text" is among the features; currently disabled.
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the ranking score for the curve.
    scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, scores)
    area = metrics.auc(fpr, tpr)

    legend_label = '|'.join(features) + '(area = %0.2f)' % area
    plt.plot(fpr, tpr, label=legend_label)
    return None

In [None]:
# Overlay one ROC curve per feature group on a single figure. Successive
# plot calls draw onto the same axes by default, so no explicit hold is
# needed (plt.hold was removed in Matplotlib 3.0).
for feature_group in (
    ["time"],
    ["user"],
    ["text"],
    ["age"],
    ["text", "user", "age"],
):
    create_roc(feature_group)

# Dashed diagonal: the ROC of a random classifier, for reference.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
# Slight headroom above 1.0 so curves touching the top edge stay visible.
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic for Features')
plt.legend(loc="lower right")
plt.show()