In [1]:
import pandas as pd
import numpy as np
import string,gc
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.corpus import stopwords
import scipy, os
 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer,TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder,LabelBinarizer,OneHotEncoder
from sklearn.cross_validation import train_test_split 

# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,hamming_loss,f1_score,confusion_matrix

from skmultilearn.problem_transform import LabelPowerset

import commonfunction
import logging
import logging.config



In [2]:
# Initialise logging with the defaults first, then layer the project's
# file-based configuration on top of it. The path is relative to the
# directory the notebook kernel was started in.
logging.basicConfig()
_logging_conf_path = os.path.abspath('../config/logging.conf')
logging.config.fileConfig(_logging_conf_path)

# Module-level logger shared by every cell below.
logger = logging.getLogger(__name__)

In [3]:
class trainmodel(object):
    """Train and evaluate a multi-label document classifier (category + sub-category).

    The target matrix ``Y`` is the column-wise concatenation of the one-hot
    encoded ``cat`` and ``subcat`` columns of the training CSV.  Two modelling
    routes are offered:

    * ``createmodel``   -- raw text -> CountVectorizer -> TF -> LabelPowerset(GaussianNB),
      stored in ``self.pipe``.
    * ``createmodel_2`` -- TF-IDF + TruncatedSVD features (see ``termfreq``) ->
      LabelPowerset(GaussianNB), stored in ``self.clf``.
    """

    # Binarizers fitted by transform_target(); get_binarizedactuals() reuses
    # them so held-out labels are encoded with the same column layout as Y.
    cat_binarizer = None
    subcat_binarizer = None

    def __init__(self, train_path="../../data/processed/train.csv",
                 test_path="../../data/processed/test.csv"):
        """Load the processed train and test CSV files.

        Parameters
        ----------
        train_path, test_path : str
            Locations of the processed CSVs; the defaults are the paths the
            notebook has always used.
        """
        self.Y = pd.DataFrame()
        self.train_df = pd.read_csv(train_path)
        self.test_df = pd.read_csv(test_path)
        logger.debug("The column in train set %s", self.train_df.columns)

    @staticmethod
    def _binarize(labels, binarizer=None):
        """One-hot encode ``labels``; fit a new LabelBinarizer unless one is supplied.

        Returns a ``(DataFrame, binarizer)`` pair whose DataFrame columns are
        the binarizer's classes.
        """
        if binarizer is None:
            binarizer = LabelBinarizer()
            encoded = binarizer.fit_transform(labels)
        else:
            encoded = binarizer.transform(labels)
        return pd.DataFrame(data=encoded, columns=binarizer.classes_), binarizer

    def transform_target(self):
        """Build ``self.Y`` from the ``cat`` and ``subcat`` columns of the train set.

        Missing sub-categories are replaced in ``self.train_df`` by the literal
        string 'NaN' so that they become a class of their own.
        """
        cat_df, self.cat_binarizer = self._binarize(self.train_df['cat'])
        logger.debug('cat classes %s', self.cat_binarizer.classes_)

        self.train_df['subcat'] = self.train_df['subcat'].fillna('NaN')
        subcat_df, self.subcat_binarizer = self._binarize(self.train_df['subcat'])
        logger.debug('subcat classes %s', self.subcat_binarizer.classes_)

        self.Y = pd.concat([cat_df, subcat_df], axis=1)
        logger.debug('Y value %s', self.Y.columns)

    def removeCol_fromtrain(self):
        """Drop the label and bookkeeping columns from the train set.

        Columns that are already absent are ignored, so re-running the cell
        does not raise.
        """
        col_todrop = ['cat', 'subcat', 'Unnamed: 0', 'document_id', 'no_pages',
                      u'ques_cnt', u'no_words']
        self.train_df = self.train_df.drop(col_todrop, axis=1, errors='ignore')

    def prepare_validationset(self):
        """Split features and ``self.Y`` into train / validation parts (67% / 33%).

        The features are the raw ``content`` column while it still exists;
        once ``termfreq`` has replaced it with SVD components, the whole
        remaining frame is used instead.

        Returns
        -------
        tuple
            ``(X_train, X_test, Y_train, Y_test)``.
        """
        if 'content' in self.train_df.columns:
            features = self.train_df['content']
        else:
            features = self.train_df
        X_train, X_test, Y_train, Y_test = train_test_split(
            features, self.Y, test_size=0.33, random_state=42)
        return (X_train, X_test, Y_train, Y_test)

    def _report_validation(self, Y_test, prediction):
        """Print subset accuracy, Hamming loss and sample-averaged F1 for a validation split."""
        print('validation dataset accuracy is {}'.format(accuracy_score(Y_test, prediction)))
        print('Hamming loss is {}'.format(hamming_loss(Y_test, prediction)))
        print('F1 Score is {}'.format(f1_score(Y_test, prediction, average='samples')))

    def createmodel(self):
        """Fit the text pipeline (``self.pipe``) and report validation metrics.

        Returns an empty tuple, as the original implementation did.
        """
        X_train, X_test, Y_train, Y_test = self.prepare_validationset()
        self.pipe = Pipeline([
            ('cv', CountVectorizer(ngram_range=(1, 3))),
            ('tfidf', TfidfTransformer(use_idf=False)),
            ('LP_GNB', LabelPowerset(GaussianNB()))])
        # train
        self.pipe.fit(X_train, Y_train)

        # predict
        prediction = self.pipe.predict(X_test)

        self._report_validation(Y_test, prediction)
        print('---- confusion matrix ----- ')
        self.print_confusionmatrix(Y_test, prediction)

        return ()

    def termfreq(self, n_comp=750):
        """Replace ``content`` in both frames with TF-IDF + TruncatedSVD components.

        The vectorizer and the SVD are each fitted exactly once, on the
        combined train + test text, and then applied to both sets.  Fitting
        them separately per set (as the previous code did) yields different
        vocabularies and projections, so train and test features would not be
        comparable.

        Parameters
        ----------
        n_comp : int
            Requested number of SVD components.  It is capped at
            ``min(n_documents, n_terms) - 1`` because the 'arpack' solver
            needs strictly fewer components than either dimension.
        """
        tfidfvec = TfidfVectorizer(ngram_range=(1, 3), strip_accents='unicode',
                                   lowercase=True, analyzer='word', token_pattern=r'\w+',
                                   stop_words='english')

        train_text = self.train_df['content'].values.tolist()
        test_text = self.test_df['content'].values.tolist()
        full_tfidf = tfidfvec.fit_transform(train_text + test_text)

        # Rows keep input order, so slicing recovers each set in the shared
        # vocabulary without tokenising the text a second time.
        train_tfidf = full_tfidf[:len(train_text)]
        test_tfidf = full_tfidf[len(train_text):]
        logger.debug("---- The size of train_tfidf vector is ------ %s ", train_tfidf.shape)

        self.train_df = self.train_df.drop('content', axis=1)
        self.test_df = self.test_df.drop('content', axis=1)

        n_comp = min(n_comp, min(full_tfidf.shape) - 1)
        svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
        svd_obj.fit(full_tfidf)

        svd_cols = ['svd_word_' + str(i) for i in range(n_comp)]
        # Reuse each frame's index so the column-wise concat aligns row for row.
        train_svd = pd.DataFrame(svd_obj.transform(train_tfidf),
                                 index=self.train_df.index, columns=svd_cols)
        test_svd = pd.DataFrame(svd_obj.transform(test_tfidf),
                                index=self.test_df.index, columns=svd_cols)

        self.train_df = pd.concat([self.train_df, train_svd], axis=1)
        self.test_df = pd.concat([self.test_df, test_svd], axis=1)
        del full_tfidf, train_tfidf, test_tfidf, train_svd, test_svd
        gc.collect()

    def createmodel_2(self):
        """Fit LabelPowerset(GaussianNB) on the SVD features (``self.clf``) and report metrics.

        Returns an empty tuple, as the original implementation did.
        """
        self.termfreq()

        self.clf = LabelPowerset(GaussianNB())
        X_train, X_test, Y_train, Y_test = self.prepare_validationset()
        self.clf.fit(X_train, Y_train)
        prediction = self.clf.predict(X_test)

        self._report_validation(Y_test, prediction)

        return ()

    def prepare_testdata(self, fname=None):
        """Remember the test document ids in ``self.test_id`` and drop bookkeeping columns.

        Parameters
        ----------
        fname : object, optional
            Unused; kept only so existing calls that pass it keep working.
        """
        logger.debug("--- Before ---- %s", self.test_df.shape)
        # Guarded so that running the cell a second time keeps the saved ids.
        if 'document_id' in self.test_df.columns:
            self.test_id = self.test_df['document_id']

        col_todrop = ['document_id', 'Unnamed: 0']
        self.test_df = self.test_df.drop(col_todrop, axis=1, errors='ignore')

        logger.debug("--- After ---- %s", self.test_df.shape)

    def predict_testdataset(self):
        """Predict labels for ``self.test_df`` with ``self.clf`` (requires ``createmodel_2``)."""
        print("The testdataset ", self.test_df.columns)
        test_predicitions = self.clf.predict(self.test_df)
        logger.debug('test predictions shape %s', test_predicitions.shape)

        return (test_predicitions)

    def predict_testdataset_1(self):
        """Predict labels for the test ``content`` column with ``self.pipe`` (requires ``createmodel``)."""
        print("The testdataset ", self.test_df.columns, "the testdataset shape ", self.test_df.shape)
        test_predicitions = self.pipe.predict(self.test_df['content'])
        print("Test Prediction size ==== ", test_predicitions.shape)
        logger.debug('test predictions shape %s', test_predicitions.shape)

        return (test_predicitions)

    def get_testdata(self):
        """Return the current test DataFrame."""
        return (self.test_df)

    def get_traindata(self):
        """Return the current train DataFrame."""
        return (self.train_df)

    def get_binarizedactuals(self):
        """Load the true test labels and one-hot encode them like ``self.Y``.

        When ``transform_target`` has been run, its fitted binarizers are
        reused so the columns line up with the model's predictions even if
        some class never occurs in the test labels.  Otherwise new binarizers
        are fitted on the test labels, as before.
        """
        act_lab = pd.read_csv("../../data/processed/test_label.csv")
        # Copy so the fill below writes to an independent frame instead of a
        # slice of act_lab (which triggers SettingWithCopyWarning).
        act_labels = act_lab[['cat', 'subcat']].copy()
        act_labels['subcat'] = act_labels['subcat'].fillna('NaN')

        cat_df, _ = self._binarize(act_labels['cat'], self.cat_binarizer)
        subcat_df, _ = self._binarize(act_labels['subcat'], self.subcat_binarizer)
        actual_y = pd.concat([cat_df, subcat_df], axis=1)

        return (actual_y)

    def print_confusionmatrix(self, actual, predicted):
        """Print one binary confusion matrix per label column.

        Parameters
        ----------
        actual : pandas.DataFrame
            True indicator matrix; column names are used as titles.
        predicted : scipy sparse matrix or array-like
            Predicted indicator matrix with the same column order.
        """
        if hasattr(predicted, 'toarray'):
            predicted = predicted.toarray()
        predicted = np.asarray(predicted)
        for i in range(actual.shape[1]):
            print("Confusion matrix of {}".format(actual.columns[i]))
            # labels=[0, 1] keeps the matrix 2x2 even when a label never occurs.
            print(confusion_matrix(actual.iloc[:, i], predicted[:, i], labels=[0, 1]))
            print("")
        

In [4]:
# Start training and prediction.
# Runs the raw-text pipeline end to end: build the targets, strip the
# non-feature columns, fit/validate the pipeline, then score the test set.
# Leaves `model` and `test_result` in the kernel for the cells below.

logger.info("Training starts ...... ")

model = trainmodel()
# Must run before removeCol_fromtrain(): it reads the 'cat'/'subcat' columns
# that removeCol_fromtrain() drops.
model.transform_target()
model.removeCol_fromtrain()
# Fits model.pipe and prints validation metrics plus per-label confusion matrices.
model.createmodel() 

# test_result holds the predicted label indicators for the test 'content' column.
test_result = model.predict_testdataset_1()

# Print the per-label confusion matrices for the test dataset, using the
# true labels loaded by get_binarizedactuals().
res_y = model.get_binarizedactuals()
model.print_confusionmatrix(res_y,test_result)

2018-07-23 15:13:46,146 - __main__ - INFO - Training starts ...... 
2018-07-23 15:13:46,181 - __main__ - DEBUG - The column in train set Index([u'Unnamed: 0', u'content', u'document_id', u'no_pages', u'ques_cnt',
       u'no_words', u'cat', u'subcat'],
      dtype='object')
2018-07-23 15:13:46,185 - __main__ - DEBUG - cat classes ['Billing' 'Clinic/Doctors Office' 'Employment' 'Hospitalization'
 'InsuranceDisability' 'Military' 'NRLs']
2018-07-23 15:13:46,190 - __main__ - DEBUG - subcat classes ['CardiacData' 'Consents' 'Correspondence' 'EmergencyRoomRecord'
 'Insurance/DisabilityCorrespondence' 'LaboratoryReports' 'NaN'
 'Nurses/Notes/TelephoneLogs' 'Physicians/Orders'
 'Prescriptions/MedicationLists' 'ProgressNotes'
 'X-Rays/MRIs/CTscansRadiologyQuestionnaries']
2018-07-23 15:13:46,191 - __main__ - DEBUG - Y value Index([u'Billing', u'Clinic/Doctors Office', u'Employment', u'Hospitalization',
       u'InsuranceDisability', u'Military', u'NRLs', u'CardiacData',
       u'Consents', u'C

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
# Convert the binary indicator matrix in `test_result` into label names.
# The result has shape [n_documents, 2] with columns (cat, subcat).

# Cells start as the placeholder '0.0', meaning "no label assigned yet".
# An object array is used on purpose: np.zeros(...).astype(str) produces a
# fixed-width 32-character string array, which silently truncates longer
# label names (e.g. 'X-Rays/MRIs/CTscansRadiologyQuestionnaries').
data_str = np.full((test_result.shape[0], 2), '0.0', dtype=object)

cx = scipy.sparse.coo_matrix(test_result)
for i, j, v in zip(cx.row, cx.col, cx.data):
    if not v:
        # Explicitly stored zero: not a predicted label.
        continue
    # The first predicted label found for a row fills the 'cat' slot; any
    # later one goes to 'subcat'.
    # NOTE(review): this relies on the row's entries being visited in
    # ascending column order with category columns first - confirm.
    if data_str[i][0] == '0.0':
        data_str[i][0] = commonfunction.getlabelname(j)
    else:
        data_str[i][1] = commonfunction.getlabelname(j)


# Write the prediction into the file - ../../data/processed/predicted.csv
test_df = pd.read_csv("../../data/processed/test.csv")
test_id = test_df['document_id']
label_df = pd.DataFrame({' ':test_id,'cat':data_str[:,0],'subcat':data_str[:,1]})
label_df.to_csv("../../data/processed/predicted.csv")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): `cm` is not defined in any cell shown above; it must hold a
# 2-D confusion matrix before this cell runs, otherwise a NameError is raised
# on a fresh kernel.
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, annot=True, linewidths=0.5, linecolor="red", fmt='.1f', ax=ax)

# Save through the figure handle *before* plt.show(): once the figure has
# been shown, plt.savefig() would write a new, empty figure instead.
f.savefig('graph.png')
plt.show()