In [142]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.corpus import stopwords
import scipy
 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer,TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder,LabelBinarizer,OneHotEncoder
from sklearn.cross_validation import train_test_split 
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,hamming_loss,f1_score
from skmultilearn.problem_transform import LabelPowerset

import commonfunction


class trainmodel:
                    
    def __init__(self):        
        self.train_df = self.test_df = self.Y = pd.DataFrame()
        self.train_df = pd.read_csv("../../data/processed/train.csv")
        self.test_df = pd.read_csv("../../data/processed/test.csv")                    
        
    def transform_target(self):
        
        le  = LabelBinarizer()
        cat_arr = le.fit_transform(self.train_df['cat'])
        cat_df = pd.DataFrame(data=cat_arr,columns=le.classes_)        
        self.train_df['subcat'] = np.where(pd.isnull(self.train_df['subcat']),'NaN',self.train_df['subcat'])
        subcat_arr = le.fit_transform(self.train_df['subcat'])
        subcat_df = pd.DataFrame(data=subcat_arr,columns=le.classes_)    
        self.Y = pd.concat([cat_df,subcat_df],axis=1)        
        self.Y.drop('NaN',inplace=True,axis=1)
        #print('Y value', self.Y.columns)
        
    def prepare_data(self):
        
        col_todrop = ['cat','subcat','Unnamed: 0','document_id']
        for col in col_todrop:
            self.train_df.drop(col,axis=1,inplace=True)          
            
    def prepare_validationset(self):
        
        X_train, X_test, Y_train, Y_test = train_test_split(self.train_df['content'], self.Y, 
                                                            test_size=0.33, random_state=42)                 
        return(X_train, X_test, Y_train, Y_test)

    def createmodel(self):
        
        X_train, X_test, Y_train, Y_test =  self.prepare_validationset()
        #X_train, X_test, Y_train, Y_test = train_test_split(self.train_df['content'], self.Y,test_size=0.33, random_state=42)                             
        X_train_transposed =X_train.T
        self.pipe = Pipeline([
                    ('cv', CountVectorizer(ngram_range=(1, 3))),
                    ('tfidf', TfidfTransformer()),
                    ('LP_GNB', LabelPowerset(GaussianNB()))])
        # train
        self.pipe.fit(X_train_transposed, Y_train)
        
        # predict
        prediction = self.pipe.predict(X_test)
         
        print('validation dataset accuracy is {}'.format(accuracy_score(Y_test, prediction)))           
        print('Hamming loss is {}'.format(hamming_loss(Y_test, prediction)))
        print('F1 Score is {}'.format(f1_score(Y_test, prediction,average='samples')))
        
        return()
    
    def termfreq(self,n_comp):
        
        tfidfvec = TfidfVectorizer(ngram_range=(1,3), strip_accents='unicode',
                       lowercase =True, analyzer='word', token_pattern=r'\w+',
                       stop_words = 'english')
        
        tfidf = tfidfvec.fit_transform(self.train_df['content'])  
        self.train_df.drop('content',inplace=True,axis=1)
        self.train_df = pd.DataFrame(tfidf.toarray(), columns=tfidfvec.get_feature_names())
        
        '''
        df1 = pd.DataFrame(tfidf.toarray(), columns=tfidfvec.get_feature_names())
        self.train_df = pd.concat([self.train_df, df1], axis=1)
        svdT = TruncatedSVD(n_components=n_comp)
        self.train_df = svdT.fit_transform(self.train_df) 
        
        
        test_tfidf = tfidfvec.transform(self.test_df['content'])
        self.test_df.drop('content',inplace=True,axis=1)
        df1 = pd.DataFrame(test_tfidf.toarray(), columns=tfidfvec.get_feature_names())
        self.test_df = pd.concat([self.test_df, df1], axis=1)
        self.test_df = svdT.transform(self.test_tfidf)                 
        '''
        
    def createmodel_2(self):
                        
        self.termfreq()                      
        clf = LabelPowerset(GaussianNB())
        X_train, X_test, Y_train, Y_test =  self.prepare_validationset()
        clf.fit(X_train,Y_train)
        prediction = clf.predict(X_test)
        
        # predict
        # prediction = self.pipe.predict(X_test)
         
        print('validation dataset accuracy is {}'.format(accuracy_score(Y_test, prediction)))           
        print('Hamming loss is {}'.format(hamming_loss(Y_test, prediction)))
        print('F1 Score is {}'.format(f1_score(Y_test, prediction,average='samples')))
        
        return()
    
    def prepare_testdata(self,fname):
        
        self.test_df = pd.read_csv(fname)         
        self.test_id = self.test_df['document_id']
        
        col_todrop = ['document_id','Unnamed: 0']
        for col in col_todrop:
            self.test_df.drop(col,axis=1,inplace=True)            
        
    
    def predict_testdataset(self):
         
        test_predicitions = self.pipe.predict(self.test_df['content'])         
        return(test_predicitions)
    
    
    def get_testdata(self):
        return(self.test_df)
    
    def get_traindata(self):
        return(self.train_df)
    
    def get_target(self):
        return(self.Y)
    

In [143]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X): 
        return X[self.key]

In [144]:
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [145]:
class EstimatorSelectionHelper:
    
    def __init__(self,pipe,models, params):
        
        '''
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        '''
        self.models = models
        self.params = params
        self.pipe = pipe
        self.keys = models.keys()
        self.grid_searches = {}
    
    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        
        for key in self.keys:
            
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]   
            self.pipe.named_steps['classifier'].set_params(classifier=model)             
            gs = GridSearchCV(self.pipe, params, cv=cv, n_jobs=n_jobs, 
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs    
    
    def score_summary(self, sort_by='mean_score'):
        
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            new = {}            
            new.update(params) 
            new.update(d)
            return pd.Series(new)
            #return pd.Series(dict(list(**params.items()) + list(**d.items()) ) )
            
                      
        rows = []
        for k in self.grid_searches:
            print(k)
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]        
                scores.append(r.reshape(len(params),1))

            all_scores = np.hstack(scores)
            for p, s in zip(params,all_scores):
                rows.append((row(k, s, p)))
        
        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
        
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]
        
        return df[columns]

In [146]:
from sklearn import pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

model = trainmodel()
model.transform_target()
model.prepare_data()

train_df  = model.get_traindata()
target = model.get_target()

X_train, X_test, Y_train, Y_test = train_test_split(train_df,target, test_size=0.33, random_state=42) 

In [147]:
from xgboost import XGBClassifier

pipeline  = Pipeline([
    ('select_column', ColumnSelector(key='content')),
    ('cvec',CountVectorizer(ngram_range=(1,3))),
    ('svd', TruncatedSVD(n_components=1000)),
    ('classifier', LabelPowerset())      
])
models1 = {     
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    #'GradientBoostingClassifier': GradientBoostingClassifier(),
    #'SVC': SVC()
}

params1 = {          
    'RandomForestClassifier': {'classifier__classifier__max_depth': [16, 32]},
    'AdaBoostClassifier' :  {'classifier__classifier__n_estimators': [16, 32]}
}

'''
models1 = {     
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    #'SVC': SVC()
}

params1 = {     
    'RandomForestClassifier': { 'n_estimators': [16, 32] },
    'AdaBoostClassifier':  { 'n_estimators': [16, 32] },
    'GradientBoostingClassifier': { 'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0] },    
}

'''

helper1 = EstimatorSelectionHelper(pipeline, models1, params1)
helper1.fit(X_train, Y_train, scoring='f1_weighted', n_jobs=2)

helper1.score_summary()

Running GridSearchCV for AdaBoostClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Running GridSearchCV for RandomForestClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:  4.3min finished


AdaBoostClassifier
RandomForestClassifier


[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:  4.2min finished
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,classifier__classifier__max_depth,classifier__classifier__n_estimators
3,RandomForestClassifier,0.588073,0.595691,0.604637,0.00682695,32.0,
2,RandomForestClassifier,0.574378,0.590844,0.613742,0.016701,16.0,
0,AdaBoostClassifier,0.191262,0.295278,0.415199,0.0921109,,16.0
1,AdaBoostClassifier,0.186697,0.293286,0.413788,0.0932303,,32.0


In [91]:
# test the classifier
prediction = helper1.predict(X_test)

print('Hamming loss is {}'.format(hamming_loss(Y_test, prediction)))
print('validation dataset accuracy is {}'.format(accuracy_score(Y_test, prediction)))

AttributeError: EstimatorSelectionHelper instance has no attribute 'predict'

In [50]:
pipeline2 = Pipeline([
    ('select_column', ColumnSelector(key='content')),
    ('cvec',CountVectorizer(ngram_range=(1,3))),
    ( 'svd', TruncatedSVD() ),
    ('classifier',LabelPowerset()),        
])

In [51]:
clf.set_params(classifier='RandomForestClassifier()')

LabelPowerset(classifier='RandomForestClassifier()',
       require_dense=[True, True])

In [55]:
pipeline2.named_steps['classifier'].set_params(classifier='LogisticRegression()')

LabelPowerset(classifier='LogisticRegression()', require_dense=[True, True])

In [53]:
pipeline2.steps.append(['classifier',clf])

In [56]:
pipeline2

Pipeline(memory=None,
     steps=[('select_column', ColumnSelector(key='content')), ('cvec', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, ...ue])), ['classifier', LabelPowerset(classifier='LogisticRegression()', require_dense=[True, True])]])

In [40]:
clf = LabelPowerset()
clf.set_params(classifier='RandomForestClassifier()')
clf.set_params(classifier='LogisticRegression()')
print(clf.classifier)

LogisticRegression()


In [None]:
model = trainmodel()
model.transform_target()
 
model.prepare_data()


#Read the data for which the label to be predicted......
model.prepare_testdata("../../data/processed/test.csv")

#test_result is the predicted label 
test_result = model.predict_testdataset()

#Convert the binary n-array to the corresponding label
#resultant array will have two columns - cat, subcat

data = np.zeros(shape=[test_result.shape[0],2])
data_str = data.astype(str)

cx = scipy.sparse.coo_matrix(test_result)
for i,j,v in zip(cx.row, cx.col, cx.data):      
    if(data_str[i][0] == '0.0'):
        #print("Inside if stmt ...... ",i,j)
        data_str[i][0] = commonfunction.getlabelname(j)
    else:
        #print("Inside if else ...... ",i,j)
        data_str[i][1] = commonfunction.getlabelname(j)         

test_df = pd.read_csv("../../data/processed/test.csv")
test_id = test_df['document_id']
label_df = pd.DataFrame({' ':test_id,'cat':data_str[:,0],'subcat':data_str[:,1]})
label_df.to_csv("../../data/processed/predicted.csv")

In [30]:
# train the classifier
model = pipe.fit(X_train.T, Y_train)

# test the classifier
prediction = model.predict(X_test)

print('Hamming loss is {}'.format(hamming_loss(Y_test, prediction)))
print('validation dataset accuracy is {}'.format(accuracy_score(Y_test, prediction)))

Hamming loss is 0.0611672278339
validation dataset accuracy is 0.265512265512
