In [1]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.corpus import stopwords
import scipy
 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer,TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder,LabelBinarizer,OneHotEncoder
from sklearn.cross_validation import train_test_split 

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin

# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.metrics import accuracy_score,hamming_loss,f1_score
from skmultilearn.problem_transform import LabelPowerset

import commonfunction


class trainmodel:
                    
    def __init__(self):        
        self.train_df = self.test_df = self.Y = pd.DataFrame()
        self.train_df = pd.read_csv("../../data/processed/train.csv")
        self.test_df = pd.read_csv("../../data/processed/test.csv")                    
        
    def transform_target(self):
        
        le  = LabelBinarizer()
        cat_arr = le.fit_transform(self.train_df['cat'])
        cat_df = pd.DataFrame(data=cat_arr,columns=le.classes_)        
        self.train_df['subcat'] = np.where(pd.isnull(self.train_df['subcat']),'NaN',self.train_df['subcat'])
        subcat_arr = le.fit_transform(self.train_df['subcat'])
        subcat_df = pd.DataFrame(data=subcat_arr,columns=le.classes_)    
        self.Y = pd.concat([cat_df,subcat_df],axis=1)        
        self.Y.drop('NaN',inplace=True,axis=1)
        #print('Y value', self.Y.columns)
        
    def prepare_data(self):
        
        col_todrop = ['cat','subcat','Unnamed: 0','document_id']
        for col in col_todrop:
            self.train_df.drop(col,axis=1,inplace=True)          
            
    def prepare_validationset(self):
        
        X_train, X_test, Y_train, Y_test = train_test_split(self.train_df['content'], self.Y, 
                                                            test_size=0.33, random_state=42)                 
        return(X_train, X_test, Y_train, Y_test)

    def createmodel(self):
        
        X_train, X_test, Y_train, Y_test =  self.prepare_validationset()
        #X_train, X_test, Y_train, Y_test = train_test_split(self.train_df['content'], self.Y,test_size=0.33, random_state=42)                             
        X_train_transposed =X_train.T
        self.pipe = Pipeline([
                    ('cv', CountVectorizer(ngram_range=(1, 3))),
                    ('tfidf', TfidfTransformer()),
                    ('LP_GNB', LabelPowerset(GaussianNB()))])
        # train
        self.pipe.fit(X_train_transposed, Y_train)
        
        # predict
        prediction = self.pipe.predict(X_test)
         
        print('validation dataset accuracy is {}'.format(accuracy_score(Y_test, prediction)))           
        print('Hamming loss is {}'.format(hamming_loss(Y_test, prediction)))
        print('F1 Score is {}'.format(f1_score(Y_test, prediction,average='samples')))
        
        return()
    
    def termfreq(self,n_comp):
        
        tfidfvec = TfidfVectorizer(ngram_range=(1,3), strip_accents='unicode',
                       lowercase =True, analyzer='word', token_pattern=r'\w+',
                       stop_words = 'english')
        
        tfidf = tfidfvec.fit_transform(self.train_df['content'])  
        self.train_df.drop('content',inplace=True,axis=1)
        self.train_df = pd.DataFrame(tfidf.toarray(), columns=tfidfvec.get_feature_names())
        
        '''
        df1 = pd.DataFrame(tfidf.toarray(), columns=tfidfvec.get_feature_names())
        self.train_df = pd.concat([self.train_df, df1], axis=1)
        svdT = TruncatedSVD(n_components=n_comp)
        self.train_df = svdT.fit_transform(self.train_df) 
        
        
        test_tfidf = tfidfvec.transform(self.test_df['content'])
        self.test_df.drop('content',inplace=True,axis=1)
        df1 = pd.DataFrame(test_tfidf.toarray(), columns=tfidfvec.get_feature_names())
        self.test_df = pd.concat([self.test_df, df1], axis=1)
        self.test_df = svdT.transform(self.test_tfidf)                 
        '''
        
    def createmodel_2(self):
                        
        self.termfreq()                      
        clf = LabelPowerset(GaussianNB())
        X_train, X_test, Y_train, Y_test =  self.prepare_validationset()
        clf.fit(X_train,Y_train)
        prediction = clf.predict(X_test)
        
        # predict
        # prediction = self.pipe.predict(X_test)
         
        print('validation dataset accuracy is {}'.format(accuracy_score(Y_test, prediction)))           
        print('Hamming loss is {}'.format(hamming_loss(Y_test, prediction)))
        print('F1 Score is {}'.format(f1_score(Y_test, prediction,average='samples')))
        
        return()
    
    def prepare_testdata(self,fname):
        
        self.test_df = pd.read_csv(fname)         
        self.test_id = self.test_df['document_id']
        
        col_todrop = ['document_id','Unnamed: 0']
        for col in col_todrop:
            self.test_df.drop(col,axis=1,inplace=True)            
        
    
    def predict_testdataset(self):
         
        test_predicitions = self.pipe.predict(self.test_df['content'])         
        return(test_predicitions)
    
    
    def get_testdata(self):
        return(self.test_df)
    
    def get_traindata(self):
        return(self.train_df)
    
    def get_target(self):
        return(self.Y)
    



In [None]:
model = trainmodel()
model.transform_target()
 
model.prepare_data()


#Read the data for which the label to be predicted......
model.prepare_testdata("../../data/processed/test.csv")

#test_result is the predicted label 
test_result = model.predict_testdataset()

#Convert the binary n-array to the corresponding label
#resultant array will have two columns - cat, subcat

data = np.zeros(shape=[test_result.shape[0],2])
data_str = data.astype(str)

cx = scipy.sparse.coo_matrix(test_result)
for i,j,v in zip(cx.row, cx.col, cx.data):      
    if(data_str[i][0] == '0.0'):
        #print("Inside if stmt ...... ",i,j)
        data_str[i][0] = commonfunction.getlabelname(j)
    else:
        #print("Inside if else ...... ",i,j)
        data_str[i][1] = commonfunction.getlabelname(j)         

test_df = pd.read_csv("../../data/processed/test.csv")
test_id = test_df['document_id']
label_df = pd.DataFrame({' ':test_id,'cat':data_str[:,0],'subcat':data_str[:,1]})
label_df.to_csv("../../data/processed/predicted.csv")

In [2]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X): 
        return X[self.key]

In [3]:
from sklearn import pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

model = trainmodel()
model.transform_target()
model.prepare_data()

train_df  = model.get_traindata()
target = model.get_target()

X_train, X_test, Y_train, Y_test = train_test_split(train_df,target, test_size=0.33, random_state=42) 

In [84]:
tfidf = TfidfVectorizer(analyzer='word',min_df=0.3,
                              norm='l2',ngram_range=(1,3),sublinear_tf=True, use_idf=True)
tfidf_vec = tfidf.fit_transform(train_df['content'])

In [4]:
from xgboost import XGBClassifier
pipeline1 = Pipeline([
    ('select_column', ColumnSelector(key='content')),
    ('tfidf', TfidfVectorizer(analyzer='word',
                              norm='l2',ngram_range=(1,3),sublinear_tf=True, use_idf=True)),
    ( 'svd', TruncatedSVD() ),
    ( 'LP_GNB', LabelPowerset(GaussianNB()) )
    
])

pipeline2 = Pipeline([
    ('select_column', ColumnSelector(key='content')),
    ('cvec',CountVectorizer(ngram_range=(1,3))),
    ( 'svd', TruncatedSVD() ),
    ('classifier',LabelPowerset(XGBClassifier())),        
])
 
# this is where you define the values for
# GridSearchCV to iterate over
param_grid = {      
    
    'svd__n_components': [500,1000],
    #so called `eta` value  
    'classifier__classifier__max_depth': [6],
    'classifier__classifier__learning_rate': [0.05,0.001],
    'classifier__classifier__max_depth': [6],
    'classifier__classifier__min_child_weight': [11],              
    'classifier__classifier__subsample': [0.8],
    'classifier__classifier__colsample_bytree': [0.7],
    'classifier__classifier__n_estimators': [100]   
}


grid = GridSearchCV(pipeline2, cv=3, param_grid=param_grid) 
grid.fit(X_train,Y_train)
# test the classifier
prediction = grid.predict(X_test)
print('Hamming loss is {}'.format(hamming_loss(Y_test, prediction)))
print('validation dataset accuracy is {}'.format(accuracy_score(Y_test, prediction)))

Hamming loss is 0.0363957030624
validation dataset accuracy is 0.75468975469


In [None]:
pipe = Pipeline([
    ('scaler', PipelineHelper([
        ('std', StandardScaler()),
        ('max', MaxAbsScaler()),
    ])),
    ('classifier', PipelineHelper([
        ('svm', LinearSVC()),
        ('rf', RandomForestClassifier()),
    ])),
])

params = {
    'scaler__selected_model': pipe.named_steps['scaler'].generate({
        'std__with_mean': [True, False],
        'std__with_std': [True, False],
        'max__copy': [True],  # just for displaying
    }),
    'classifier__selected_model': pipe.named_steps['classifier'].generate({
        'svm__C': [0.1, 1.0],
        'rf__n_estimators': [100, 20],
    })
}
grid = GridSearchCV(pipe, params, scoring='accuracy', verbose=1)
grid.fit(X_iris, y_iris)
print(grid.best_params_)

In [89]:
grid.best_estimator_.get_params

<bound method Pipeline.get_params of Pipeline(memory=None,
     steps=[('select_column', ColumnSelector(key='content')), ('cvec', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, ...ol=0.0)), ('LP_GNB', LabelPowerset(classifier=GaussianNB(priors=None), require_dense=[True, True]))])>

In [None]:
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [None]:
# Start training and prediction.......
model = trainmodel()
model.transform_target()
model.prepare_data()
print("------ in tf-idf --------")
model.termfreq(200)

train_df = model.get_traindata()
target = model.get_target()

X_train, X_test, Y_train, Y_test = train_test_split(train_df,target, test_size=0.33, random_state=42) 

# train the classifier
model = pipe.fit(X_train.T, Y_train)

# test the classifier
prediction = model.predict(X_test)

In [22]:
# test the classifier
prediction = grid.predict(X_test)
print('Hamming loss is {}'.format(hamming_loss(Y_test, prediction)))
print('validation dataset accuracy is {}'.format(accuracy_score(Y_test, prediction)))

Hamming loss is 0.0599647266314
validation dataset accuracy is 0.621933621934
