In [1]:
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 27 09:43:55 2018

@author: Anshuman_Mahapatra
"""

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.stem.snowball import SnowballStemmer
import numpy as np
import pandas as pd
import re
#import csv
import string
from scipy.sparse import lil_matrix, find
import itertools
#from pyjarowinkler import distance
from sklearn.feature_selection import SelectPercentile
import os.path
import hashlib
import pickle
import filelock
from io import StringIO



stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

class ItemSelector(BaseEstimator, TransformerMixin):

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_df):
        return data_df[self.key]
    
    def get_feature_names():
       return []

class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        return [{'length': len(text),
                 'num_sentences': (text.count('.') + \
                                   text.count('?') + \
                                   text.count('!'))}
                for text in posts]
        
    def get_feature_names(self):
       return ['length','num_sentences']

class TargetSimilarity(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""
    def __init__(self, target,stop_words = None, ngram_range = (1,3),use_idf = False):
        self.target = target
        self.stop_words = stop_words
        self.ngram_range = ngram_range
        self.use_idf = use_idf
        
    def fit(self, x, y=None):
        return self

    def transform(self, text):
        text_target = np.append(text,self.target)
        count_vect = StemmedCountVectorizer(stop_words = self.stop_words, 
                                     ngram_range = self.ngram_range)
        counts = count_vect.fit_transform(text_target)
        
        # TF-IDF
        tfidf_transformer = TfidfTransformer(use_idf = self.use_idf)
        tfidf = tfidf_transformer.fit_transform(counts)
        #tfidf = TfidfVectorizer().fit_transform(text_target)
        #cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
        cosine_similarities = (tfidf * tfidf.T).A
        #squareform(pdist(tfidf.toarray(), 'cosine'))
        return cosine_similarities[:-len(self.target),len(text):]
    
class MyModelTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model

    def fit(self, X, y):
        return self

    def transform(self, X):
        return self.model.predict_proba(X)

class NumberTransformer(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, text_col):
        num_text_col = text_col.replace(to_replace=re.compile('(?:(?<=\s)|(?<=^)|(?<=[^0-9a-zA-Z]))[0-9][0-9.,\-\/]*(?:(?=\s)|(?=$)|(?=[^0-9a-zA-Z]))',flags = re.IGNORECASE),
                    value='NUMBERSPECIALTOKEN',inplace=False,regex=True)
        return num_text_col

# =============================================================================
#         text_col.replace(to_replace=re.compile('(?:(?<=\s)|(?<=^)|(?<=[^0-9a-zA-Z]))[0-9][0-9.,\-\/]*(?:(?=\s)|(?=$)|(?=[^0-9a-zA-Z]))',flags = re.IGNORECASE),
#                     value='NUMBERSPECIALTOKEN',inplace=True,regex=True)
#         return text_col
# =============================================================================
        
    def get_feature_names(self):
       return None

class DateTransformer(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, text_col):
# =============================================================================
#         text_col.replace(to_replace=re.compile('(?:(?<=\s)|(?<=^)|(?<=[^0-9a-zA-Z]))(\d+[/-]\d+[/-]\d+)(?:(?=\s)|(?=$)|(?=[^0-9a-zA-Z]))',flags = re.IGNORECASE),
#                 value='DATESPECIALTOKEN',inplace=True,regex=True)
# =============================================================================
        pattern=self.getDatePattern()
        date_text_col = text_col.replace(to_replace=re.compile(pattern,flags = re.IGNORECASE),
                 value='DATESPECIALTOKEN',inplace=False,regex=True)
        return date_text_col
    
    def getDatePattern(self):
        short_month_names = (
            'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
            'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'
        )
        
        long_month_names = (
            'January', 'February', 'March', 'April', 'May', 'June', 'July',
            'August', 'September', 'October', 'November', 'December'
        )
        
        short_month_cap = '(?:' + '|'.join(short_month_names) + ')'
        long_month_cap = '(?:' + '|'.join(long_month_names) + ')'
        short_num_month_cap = '(?:[1-9]|1[12])'
        long_num_month_cap = '(?:0[1-9]|1[12])'
        
        long_day_cap = '(?:0[1-9]|[12][0-9]|3[01])'
        short_day_cap = '(?:[1-9]|[12][0-9]|3[01])'
        
        long_year_cap = '(?:[0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]|[0-9][1-9][0-9]{2}|[1-9][0-9]{3})'
        short_year_cap = '(?:[0-9][0-9])'
        
        ordinal_day = '(?:2?1st|2?2nd|2?3rd|[12]?[4-9]th|1[123]th|[123]0th|31st)'
        spl_char='\W{1}'
        
        formats = (
            r'(?P<month_0>{lnm}|{snm}){sp_c}(?P<day_0>{ld}|{sd}){sp_c}(?P<year_0>{ly}|{sy})',
            r'(?P<month_1>{sm})\-(?P<day_1>{ld}|{sd})\-(?P<year_1>{ly})',
            r'(?P<month_2>{sm}|{lm})(?:\.\s+|\s*)(?P<day_2>{ld}|{sd})(?:,\s+|\s*)(?P<year_2>{ly})',
            r'(?P<day_3>{ld}|{sd})(?:[\.,]\s+|\s*)(?P<month_3>{lm}|{sm})(?:[\.,]\s+|\s*)(?P<year_3>{ly})',
            r'(?P<month_4>{lm}|{sm})\s+(?P<year_4>{ly})',
            r'(?P<month_5>{lnm}|{snm})/(?P<year_5>{ly})',
            r'(?P<year_6>{ly})',
            r'(?P<month_6>{sm})\s+(?P<day_4>(?={od})[0-9][0-9]?)..,\s*(?P<year_7>{ly})'
        )
        
        _pattern = '|'.join(
            i.format(
                sm=short_month_cap, lm=long_month_cap, snm=short_num_month_cap,sp_c=spl_char,
                lnm=long_num_month_cap, ld=long_day_cap, sd=short_day_cap,
                ly=long_year_cap, sy=short_year_cap, od=ordinal_day
            ) for i in formats
        )
        return _pattern
    
    def get_feature_names(self):
       return None
   
class SynonymTransformer(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def __init__(self, synonym_dict):
        self.syn_dict = synonym_dict
        #print(self.syn_dict)
        
    def fit(self, x, y = None):
        #print(self.syn_dict)
        return self

    def transform(self, text_col):
        #print(self.syn_dict)
        date_text_col = text_col.replace(self.syn_dict,regex=True)
        return date_text_col
        
    def get_feature_names(self):
       return None

class PunctTransformer(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, text_col):
        regexp = '['+string.punctuation+']{2,}'
        punct_text_col = text_col.replace(to_replace=re.compile(regexp,
                                                                flags = re.IGNORECASE),
                                             value='',inplace=False,regex=True)
        return punct_text_col
        
    def get_feature_names(self):
       return None

    
class FeaturizeDomainKeyWords(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def __init__(self, domain_keyword_list = []):
        self.keywords_list = domain_keyword_list
        #print(self.keywords_list)
    
    def fit(self, x, y = None):
        #print(self.keywords_list)
        return self

    def transform(self, text_col):
        #print(self.keywords_list)
        keyword_textcol_list = []
        if self.keywords_list != None:
            for text in text_col:
                keyword_textcol_dict = {}
                for keywords in self.keywords_list:
                    if(len(keywords) > 1):
                        keyword_reg = "|".join(keywords)
                    else:
                        keyword_reg = keywords[0]
                    keyword = 'has_keyword_' + keywords[0]
                    keyword_textcol_dict[keyword] = bool(re.search(keyword_reg,text,re.IGNORECASE))
                keyword_textcol_list.append(keyword_textcol_dict)
        return keyword_textcol_list
        
    def get_feature_names(self):
        keyword_col_list = ['has_keyword_' + keywords[0] for keywords in self.keywords_list]
        return keyword_col_list
    
def ClassDiscriminatingMeasure(X,y):
    CDM_tk =np.zeros(shape=(X.shape[1],))
#full_term_sum = tr_dsc_vect.tocsr().sum(0)
    for category in np.unique(y):
        #print(category)
        pos_loc = np.where(y == category)[0]
        cat_num_doc = len(pos_loc)
        #print(cat_num_doc)
        neg_loc = np.where(y != category)[0]
        neg_cat_num_doc = len(neg_loc)
        #print(neg_cat_num_doc)
        cat_term = X.tocsr()[pos_loc,:]
        #(nonzero_rows,nonzero_cols,_)=sparse.find(cat_term)
        tk_ci = np.diff(cat_term.tocsc().indptr)
        P_tk_ci = tk_ci / cat_num_doc
        #cat_term_sum = cat_term.sum(0)
        cat_term_neg = X.tocsr()[neg_loc,:]
        #cat_term_neg_sum = cat_term_neg.sum(0)
        #(nonzero_rows,nonzero_cols)=cat_term_neg.nonzero()
        tk_neg_ci = np.diff(cat_term_neg.tocsc().indptr)
        P_tk_neg_ci = (1 + tk_neg_ci)/ neg_cat_num_doc
        CDM_tk_ci = np.log1p(P_tk_ci/P_tk_neg_ci)
        CDM_tk = CDM_tk + CDM_tk_ci
    #print(CDM_tk.shape)
    return CDM_tk

def get_context_d_tk_w(d, tk, w = 3,token_regex = r"(?u)\b\w\w+\b"):
    #sentence = sentence.split()
    #d = re.split('[\s\-\:]+',d)
    r_splt = re.compile(token_regex)
    d = r_splt.findall(d)
    len_d = len(d)
    tk = tk.split()
    num_words = len(tk)
    r_st = re.compile(r"\b%s\b" % tk[0], re.IGNORECASE|re.MULTILINE)
    r_cmp = re.compile(r"\b%s\b" % ' '.join(tk), re.IGNORECASE|re.MULTILINE)
    for i,word in enumerate(d):
        if bool(r_st.match(word)) and \
        bool(r_cmp.match(' '.join(d[i:i+num_words]))):
            #print(i)
            #print(word)
            begin_pad = []
            end_pad = []
            if (i-w < 0):
                for b in reversed(range(0,w-i)):
                    begin_pad.append('__START_'+ str(b) +'__')
            #print(begin_pad)
            if (i+num_words+w > len_d):
                for e in range(0,i+num_words+w - len_d):
                    end_pad.append('__END_'+ str(e) +'__')
            #print(end_pad)
            start = max(0, i-w)
            #print(d[start:i+num_words+w])
            begin_pad.extend(d[start:i+num_words+w])
            #print(begin_pad)
            begin_pad.extend(end_pad)
            yield ' '.join(begin_pad)

def pairs(*lists):
    for t in itertools.combinations(lists, 2):
        for pair in itertools.product(*t):
            yield pair
            
def get_sim_context_d_tk_w(docs, tk, m_w = 3):
    sim_context_all_w = []
    for w in reversed(range(0,m_w + 1)):
        doc_contexts = []
        doc_contexts_itr = docs.apply(get_context_d_tk_w,args = (tk,w))
        doc_context_num = []
        for context in doc_contexts_itr:
            list_context = list(context)
            doc_context_num.append(len(list_context)) 
            doc_contexts.append(list_context)
        sim_context_w = []
        for x in pairs(*doc_contexts):
            sim_context_w.append(distance.get_jaro_distance(x[0],x[1]))
        sim_context_all_w.append(sim_context_w)
    sim_context_all_w = np.asarray(sim_context_all_w)
    sim_context_all_w = sim_context_all_w.sum(0)/ (m_w + 1)
    #range_list = []
    sim_context_d_tk_w = []
    for i in range(len(doc_context_num)):
        cr = 0
        range_list = []
        for pair in itertools.combinations(list(range(len(doc_context_num))),2): 
            #print(pair)
            last_pos = cr + (doc_context_num[pair[0]] * doc_context_num[pair[1]])
            all_pos = list(range(cr,last_pos))
            #print(all_pos)
            cr = last_pos      
            if i in pair:
                range_list.extend(all_pos)
        #range_list.append(tmp_range_list)
        sim_context_d_tk_w.append(sum(sim_context_all_w[range_list]))
    return(sim_context_d_tk_w)
       
def ClassDiscriminatingMeasureCS(X,y):
    CDM_tk =np.zeros(shape=(X.shape[1],))
#full_term_sum = tr_dsc_vect.tocsr().sum(0)
    for category in np.unique(y):
        #print(category)
        pos_loc = np.where(y == category)[0]
        cat_num_doc = len(pos_loc)
        #print(cat_num_doc)
        neg_loc = np.where(y != category)[0]
        neg_cat_num_doc = len(neg_loc)
        #print(neg_cat_num_doc)
        cat_term = X.tocsr()[pos_loc,:]
        #(nonzero_rows,nonzero_cols,_)=sparse.find(cat_term)
        tk_ci = cat_term.sum(0)
        P_tk_ci = tk_ci / cat_num_doc
        #cat_term_sum = cat_term.sum(0)
        cat_term_neg = X.tocsr()[neg_loc,:]
        #cat_term_neg_sum = cat_term_neg.sum(0)
        #(nonzero_rows,nonzero_cols)=cat_term_neg.nonzero()
        tk_neg_ci = cat_term_neg.sum(0)
        P_tk_neg_ci = (1 + tk_neg_ci)/ neg_cat_num_doc
        CDM_tk_ci = np.log1p(P_tk_ci/P_tk_neg_ci)
        CDM_tk = CDM_tk + CDM_tk_ci
    #print((CDM_tk.A1).shape)
    return  CDM_tk.A1

def get_sim_context_tk_w(terms,
                       count_vect_obj,
                       raw_document,
                       max_window = 3,
                       token_regex = r"(?u)\b\w\w+\b",
                       stop_words = None,
                       cache_dir = None):
                                           #'(?u)\\b\\w\\w+\\b'):
    cache_dict = {}
    is_cache = False
    cache_update = False
    if (cache_dir != None and os.path.isdir(cache_dir)):
        is_cache = True
        file_sign_str = (raw_document.str.cat(sep = ' ') + str(max_window)).encode(encoding = 'utf-8')
        hash_object = hashlib.md5(file_sign_str)
        cache_file_path = cache_dir + '/' + hash_object.hexdigest() + '.pkl'
        if os.path.isfile(cache_file_path):
            with open (cache_file_path, 'rb') as fp:
                cache_dict = pickle.load(fp)
    term_list = count_vect_obj.get_feature_names()
    raw_document.index = range(len(raw_document))
    #r_splt = re.compile("%s" % token_regex)
    data_lower = raw_document.str.lower().str.findall(token_regex)
    if stop_words != None:
        data_lower_stop = data_lower.apply(lambda x: ' '.join([item for item in x if item not in stop_words]))
    else:
        data_lower_stop = data_lower
    nz_rows, nz_cols, nz_val = find(terms) #.nonzero()
    num_terms = terms.shape[1]
    ret_mat = lil_matrix(terms.shape)
    for term_idx in range(0,(num_terms-1)):
        #print(term_idx)
        term_doc_indx = nz_rows[np.where(nz_cols == term_idx)[0]]
        #nz_val[np.where(nz_cols == term_idx)[0]]
        if (len(term_doc_indx) == 1):
            ret_mat[term_doc_indx,term_idx] = 0 # should this be 1 instead as unique term
        else:
           tk =  term_list[term_idx]
           docs = data_lower_stop[term_doc_indx]
           if len(cache_dict) > 0 and tk in cache_dict:
               sim_context_d_tk_w = cache_dict[tk]
           else:    
               sim_context_d_tk_w = get_sim_context_d_tk_w(docs,tk,max_window)
               if is_cache:
                   cache_dict[tk] = sim_context_d_tk_w
                   cache_update = True
           for i,row_idx in enumerate(term_doc_indx):
               ret_mat[row_idx,term_idx] = sim_context_d_tk_w[i]
    
    if is_cache and cache_update:
        lock = filelock.FileLock("{}.lock".format(cache_file_path))
        try:
            with lock.acquire(timeout = 10):
                with open(cache_file_path, 'wb') as fp:
                    pickle.dump(cache_dict, fp)
        except lock.Timeout:
            print('update_cache timeout' + cache_file_path)
    #CDM_tk = ClassDiscriminatingMeasure(ret_mat,y,'sum')
    return ret_mat


class ContextSimilarityBasedFeatureSelection(CountVectorizer):
    def __init__(self,max_window = 3,
                 input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None,
                 lowercase=True, preprocessor=None, tokenizer=None,
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), analyzer='word',
                 max_df=1.0, min_df=1, max_features=None,
                 vocabulary=None, binary=False, dtype=np.int64,
                 percentile = 10,cache_dir = None):
        super(ContextSimilarityBasedFeatureSelection, self).__init__(input,
             encoding, decode_error, strip_accents, lowercase , preprocessor,
             tokenizer, stop_words, token_pattern, ngram_range ,
             analyzer, max_df, min_df, max_features, vocabulary, binary, 
             dtype)
        self.max_window = max_window
        self.token_pattern = token_pattern
        self.stop_words = stop_words
        #self.percentile = percentile
        self._red_dim = SelectPercentile(score_func=ClassDiscriminatingMeasureCS,
                                         percentile = percentile)
        self.cache_dir = cache_dir
        
    @property
    def percentile(self):
        return self._red_dim.percentile

    @percentile.setter
    def percentile(self, value):
        self._red_dim.percentile = value
    
    @property
    def score_func(self):
        return self._red_dim.score_func

    @score_func.setter
    def score_func(self, value):
        self._red_dim.score_func = value
# =============================================================================
#     def fit(self, raw_documents, y=None):
#         return self
# =============================================================================

    def fit_transform(self, raw_documents, y=None):
        dtm = super(ContextSimilarityBasedFeatureSelection, self).fit_transform(raw_documents)
        sim_context_tk_w = get_sim_context_tk_w(terms = dtm,
                       count_vect_obj = super(ContextSimilarityBasedFeatureSelection, self),
                       raw_document = raw_documents,
                       max_window = self.max_window,
                       token_regex = self.token_pattern,
                       stop_words = self.stop_words,
                       cache_dir = self.cache_dir)
        self._red_dim.fit_transform(sim_context_tk_w,y)
        self.selected_cols = self._red_dim.get_support(indices=True)
        return dtm[:,self.selected_cols]

    def transform(self, raw_documents, copy=True):
        #check_is_fitted(self, '_tfidf', 'The tfidf vector is not fitted')
        dtm = super(ContextSimilarityBasedFeatureSelection, self).transform(raw_documents)
        return dtm[:,self.selected_cols]
    
    def get_feature_names(self):
        all_features = super(ContextSimilarityBasedFeatureSelection, self).get_feature_names()
        return np.asarray(all_features)[self.selected_cols]

def classifaction_report_df(report):
    report = re.sub(r" +", " ", report).replace("avg / total", "avg/total").replace("\n ", "\n")
    report_df = pd.read_csv(StringIO("Classes" + report), sep=' ', index_col=0)        
    return(report_df)
# =============================================================================
#     report_data = []
#     lines = report.split('\n')
#     for line in lines[2:-3]:
#         row = {}
#         row_data = line.split('      ')
#         row['class'] = row_data[0]
#         row['precision'] = float(row_data[1])
#         row['recall'] = float(row_data[2])
#         row['f1_score'] = float(row_data[3])
#         row['support'] = float(row_data[4])
#         report_data.append(row)
#     dataframe = pd.DataFrame.from_dict(report_data)
# #    dataframe.to_csv('classification_report.csv', index = False)
#     return dataframe
# =============================================================================

def get_used_features(mod,explicit_feature_selection = True):
    if explicit_feature_selection:
        mod_support = mod.named_steps['feature_selection'].get_support(indices=True)
    features = []
    for trnf_list in mod.named_steps['union'].transformer_list:
        features.extend(trnf_list[1].named_steps['vect'].get_feature_names())
    if explicit_feature_selection:
        return(np.asarray(features)[mod_support])
    else:
        return(np.asarray(features))

def get_grid_values(gs_obj):
    means = gs_obj.cv_results_['mean_test_score']
    stds = gs_obj.cv_results_['std_test_score']
    col_name = ['means', 'stds']
    col_name.extend(list(gs_obj.cv_results_['params'][0].keys()))
    perf_df = pd.DataFrame(columns=col_name )    
    i = 0   
    for mean, std, params in zip(means, stds, gs_obj.cv_results_['params']):
# =============================================================================
#         print("%0.3f (+/-%0.03f) for %r"
#                   % (mean, std * 2, params))
# =============================================================================
        row_list = [mean,std * 2]   
        row_list.extend(params.values())
        perf_df.loc[i] = row_list
        i += 1
    return perf_df

def confusion_matrix_df(y_actu,y_pred):
    y_actu = pd.Series(y_actu, name='Actual')
    y_pred = pd.Series(y_pred, name='Predicted')
    #return np.array2string(confusion_matrix, separator=', ')
    df_confusion = pd.crosstab(y_actu, y_pred, 
                               rownames=['Actual'], colnames=['Predicted'], 
                               margins=True)
    return df_confusion


class BodyExtraction(BaseEstimator, TransformerMixin):
    
    def __init__(self, body_extract='Pick_Full_Mail',set_num_words=100):
        #pick_First_Mail
        #Pick_Words
        #Pick_Full_Mail
        self.body_extract = body_extract
        self.set_num_words=set_num_words
        #print(self.keywords_list)
    
    def fit(self, x, y = None):
        #print(self.keywords_list)
        return self

    def transform(self, text_col):
        #print(self.keywords_list)
        
        
        text_col = text_col.map(lambda x:self.clean(x))
        
        
        #sentence extraction or replacing to,cc 
        text_col = text_col.replace(to_replace=re.compile(r'To:.*?(?=SEPARATOR)|Cc:.*?(?=SEPARATOR)|Subject:.*?(?=SEPARATOR)|Sent:.*?(?=SEPARATOR)',flags = re.IGNORECASE),
                    value='',inplace=False,regex=True)
        
        
        # removing disclaimers
        text_col = text_col.replace(to_replace=re.compile(r'this email and the document[\w\W]+this message is intended solely for the use[\w\W]+|//na01.safelinks.protection.outlook.com[\w\W]+|note:? this is a system generated email[\w\W]+|this email transmission, and any documents, files or previous email messages[\w\W]+|this email  including attachments  is confidential[\w\W]+|this email  including attachments  is confidential[\w\W]+|notice: this email and any attachments are for the exclusive and confidential[\w\W]+|the information in this email is confidential[\w\W]+|this message and its attachments are intended for the exclusive[\w\W]+|this email and any files transmitted with it are confidential and intended[\w\W]+|kind este mensaje[\w\W]+|the contents of this email message and any attachments are intended solely[\w\W]+|this message and its attachments are the property of aerovías de méxico[\w\W]+|this is an automated response.for any assistance [\w\W]+|confidentiality notice this communication may contain privileged or confidential information[\w\W]+|the contents of this email and any attachments may contain confidential information[\w\W]+|this email message and any attachments are for the use of the intended recipients[\w\W]+|as informações contidas nesta mensagem são confidenciais[\w\W ]+',flags = re.IGNORECASE),
                    value='',inplace=False,regex=True)
        
        text_col=text_col.map(lambda x:self.remove_Non_Word(x))
        
        text_col=text_col.map(lambda x:self.remove_Repeated_Words(x))
    
        ##############remove salutations#################
        sal_regex=self.salutations_regex()
        text_col = text_col.replace(to_replace=re.compile(sal_regex,flags = re.IGNORECASE),
                 value='',inplace=False,regex=True)
        
        if(self.body_extract=='pick_First_Mail'):
            
            text_col=text_col.map(lambda x: ' '.join(x.split('THREADCHAINBREAK')[0:1]))
            
            transformed_col=text_col.replace(to_replace=re.compile(r'\s+',flags = re.IGNORECASE),
                    value=' ',inplace=False,regex=True)
            
            return transformed_col
            
        elif(self.body_extract=='Pick_Words'):
            
            transformed_col = text_col.replace(to_replace='THREADCHAINBREAK',
                 value=' ',inplace=False,regex=True)
            transformed_col=transformed_col.replace(to_replace=re.compile(r'\s+',flags = re.IGNORECASE),
                    value=' ',inplace=False,regex=True)
            
            return transformed_col.map(lambda x:' '.join(x.split(' ')[0:self.set_num_words]))
        
        elif(self.body_extract=='Pick_Full_Mail'):
            
            transformed_col=text_col.replace(to_replace='THREADCHAINBREAK',value=' ',inplace=False,regex=True)
            return transformed_col.replace(to_replace=re.compile(r'\s+',flags = re.IGNORECASE),
                    value=' ',inplace=False,regex=True)
        
    def clean(self,text):
        text = text.replace('\r' , '')
        text = text.replace('\t' , '')
        text = text.replace('?','').replace('-','')
        text = text.replace('\xa0',' ')
        text = text.replace('\n','SEPARATOR')
        #text = re.sub('\W' ,' ', text)
        text =  re.sub('\w+@\w+.domain','email_set', text)
        text = re.sub('<http://[\w//.]+>' ,' url_set', text)
        text = re.sub('http://[\w.]+.com' ,'', text)
        text = re.sub('<html[\w/\"<>=., ]+' ,'', text)
        text = re.sub('Disclaimer:[a-zA-Z0-9_,(). ]+' ,'', text)
        text = re.sub('www.[\w]+.com' ,'', text)
    #        text = re.sub('Date:' ,'Sent:', text)
        text = re.sub('CC:' ,'Cc:', text)
        text = re.sub('Expéditeur:' ,'From:', text)
        text = re.sub('De:' ,'From:', text)
        text = re.sub('Enviado el:' ,'Sent:', text)
        text = re.sub('Asunto:' ,'Subject:', text)
        text = re.sub('Enviada em:' ,'Sent:', text)
        text = re.sub('Assunto:' ,'Subject:', text)
        
        text = re.sub(' url_set' ,'', text)
        text = text + ' From:'
        text=re.sub(r'From:.*?(?=SEPARATOR)','THREADCHAINBREAK',text)
        
        return text
    
    def remove_Non_Word(self,docs):
        docs = re.sub(r'cid:email_set','',docs)
        docs = re.sub(r'mailto:email_set','',docs)
        docs = re.sub(r'email_set','',docs)
        docs = re.sub(r'description:','',docs)
        docs = re.sub(r'original message','',docs)
        docs = re.sub(r'[\*\[\]_<>/#|!=\-\&;\?^""]','',docs)
        docs = re.sub(r'\s+', ' ', docs)
        if docs.startswith('clientname internal'):
            docs = docs.replace('clientname internal' , '')
        docs = re.sub(r"\'", '', docs)
        a = re.search(r'good (morning|afternoon)' , docs)
        if a is not None: 
            docs = docs.replace(a.group(),'')
        if docs.startswith('dear'):
            docs = re.sub(r'dear','', docs)
        docs = re.sub(r'(hi,|hello|hi )','',docs)
        #docs = re.sub(r',',' ',docs)
        docs = docs.strip()
        return docs
    
    def remove_Repeated_Words(self,docs):
        docs = re.sub('importance:|cc:|re:|:','',docs)
        docs = re.sub('\.+','.',docs)
        docs = re.sub('SEPARATOR' , ' ',docs)
        docs = re.sub('\s+',' ',docs)
        docs = docs.strip()
        docs=docs.strip(' From')
        return docs
    
    def salutations_regex(self):
        
        salutations=['Mit freundlichen Gruben','Thank you for your assistance','Thanks and Regards','Many thanks in advance','Thanks a lot',
         'Thanks for the support','Thanks for your help','Thanks for all','Thank you for your support','Thanks in advance','Thanks so much','Thanks again',
         'Thank you','Thanks you','Thanks & Regards','Many thanks','Best','Thank you in advance for your help',
         'Thx','TKS','Thanks','Kind regards','Best regards','Cordialement','Saludos',
         'Disclaimer','Sincerely','Brgds','With Regards','Krgds','Regards','Have a Wonderful Day',
         'Please do not respond to this message','Visit the exciting new options','Rgds',
         'Caution','This e-mail message and any attachment(s)','Please hurry']
        
        return '|'.join(salutations)


In [2]:
##Predefined packages needed
import argparse
import configparser
import re
import csv
import pandas as pd
import numpy as np
import datetime
import pickle
from sklearn.feature_extraction import text
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support,classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
##from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
#from sklearn.decomposition import NMF
from sklearn.metrics import make_scorer,f1_score
from sklearn.feature_selection import SelectPercentile, chi2, SelectFromModel, f_classif
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier

  from numpy.core.umath_tests import inner1d


In [22]:
portfolio_name = 'MYORG'
file_encoding = 'utf8'
test_size_param =0.2
val_size_param = 0.2
body_extract_param = 'Pick_Full_Mail'
##choices will be (‘pick_First_Mail’,’Pick_Words’,’ Pick_Full_Mail’)
   
##Predefined Model Hyper Parameters to be part of config file
#N_FEATURES_OPTIONS = [2, 4, 8]
N_FEATURES_chi = [30, 20, 10]
##C_OPTIONS = [1, 10, 100, 1000]
NUMBER_OF_ESTIMATORS_RF = [ 80]
stknb_alpha_param = [0.01]
ngram_range_param_subj = [(1, 2)]
ngram_range_param_desc = [(1, 2)]
alpha_param = [1e-2]
true_false_param = [True, False]
##log_C = (1,10)
log_C = [1]
##max_df_param = (0.25, 0.5, 0.75)
max_df_param = [0.75]
min_df_param = [0.01]

In [23]:
output_dir = 'D:/Data Science/POC/Email Classification Product/debug'
synonym_file_path = 'D:/Data Science/POC/Email Classification Product/csv/synonyms.csv'
domain_keyword_file_path = 'D:/Data Science/POC/Email Classification Product/csv/domain_key_words.csv'
stop_words_file = 'D:/Data Science/POC/Email Classification Product/csv/stop_words_not_masked.csv'
from sklearn.feature_extraction import text
if (stop_words_file != ''):
        my_additional_stop_words = re.split('\n|\t', open(stop_words_file).read())
        my_stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)
else:
        my_stop_words = text.ENGLISH_STOP_WORDS

In [24]:
data_file_path = "D:/Data Science/POC/Email Classification Product/Email_Final_Input_Desc_Dedup.csv"
data_df = pd.read_csv(data_file_path,encoding=file_encoding) 
print("DATA INGESTION COMPLETED")
data_df.dropna(inplace=True)
target_col = 'Text.Category'
##training_subject_flag = 'Y'
if 'Text.Subj' in data_df.columns.values:
            training_col = ['Text.Body','Text.Subj']
else: 
            training_col = ['Text.Body']

print(data_df[target_col].value_counts())
        ###Split the  Data into Training /Test and Validation for usage
training_val, test = train_test_split(data_df,
                                                    test_size = 0.2,
                                                    stratify=data_df[target_col])
training, validation = train_test_split(training_val,
                                                    test_size = 0.2,
                                                    stratify=training_val[target_col])

now = datetime.datetime.now()
        ###SAVE THE FILES TO O/P DIRECTORY
training.to_csv(output_dir + '/' + portfolio_name + '_training_' + now.strftime("%Y-%m-%d") + ".csv",
                                   index  = False, )
validation.to_csv(output_dir + '/' + portfolio_name + '_validation_' + now.strftime("%Y-%m-%d") + ".csv",
                                   index  = False, )
test.to_csv(output_dir + '/' + portfolio_name + '_test_' + now.strftime("%Y-%m-%d") + ".csv",
                                   index  = False, )
X_cat = training[training_col]
Y_cat = training[target_col]

DATA INGESTION COMPLETED
Shipment Status                               425
For your information cases                    279
Expedite request                               68
Internal Team Request to order status team     40
Quote status                                   28
Name: Text.Category, dtype: int64


In [25]:
pipeline_featureunion_list_desc = []
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer 
    ##Part of config file
    
    
trim_words_param = 100
    
if synonym_file_path!= '':    
        with open(synonym_file_path, mode='r') as infile:
            reader = csv.reader(infile)
            syn_dict = {rows[0]:rows[1] for rows in reader}
        pipeline_featureunion_list_desc.append(('content_desc', Pipeline([
                                        ('selector', ItemSelector(key='Text.Body')),
                                        ('preprocessing', BodyExtraction(body_extract=body_extract_param,set_num_words=trim_words_param)),
                                        ('datetrns', DateTransformer()),
                                        ('numtrns', NumberTransformer()),
                                        ('puntrns', PunctTransformer()),
                                        ('synonyms',SynonymTransformer(syn_dict)),
#                                        ('vect', CountVectorizer(stop_words = my_stop_words)),
                                        ('vect', TfidfVectorizer(stop_words = my_stop_words)),
#                                        ('tfidf', TfidfTransformer()),
                                ])))
else:
        pipeline_featureunion_list_desc.append(('content_desc', Pipeline([
                                            ('selector', ItemSelector(key='Text.Body')),
                                            ('preprocessing', BodyExtraction(body_extract=body_extract_param,set_num_words=trim_words_param)),
                                            ('datetrns', DateTransformer()),
                                            ('numtrns', NumberTransformer()),
                                            ('puntrns', PunctTransformer()),
#                                            ('vect', CountVectorizer(stop_words = my_stop_words)),
                                            ('vect', TfidfVectorizer(stop_words = my_stop_words)),
#                                            ('tfidf', TfidfTransformer()),
                                    ]))) 
pipeline_featureunion_list_desc.append(('content_stats', Pipeline([
        ('selector', ItemSelector(key='Text.Body')),                            
        ('stats', TextStats()),  # returns a list of dicts
                                    ('vect', DictVectorizer()),  # list of dicts -> feature matrix
                                    ('scale',StandardScaler(with_mean=False)),
                                ])))
    
     ###CREATE SUBJECT PIPELINE
if 'Text.Subj' in data_df.columns.values:
        if synonym_file_path!= '':    
            with open(synonym_file_path, mode='r') as infile:
                reader = csv.reader(infile)
                syn_dict = {rows[0]:rows[1] for rows in reader}
            pipeline_featureunion_list_desc.append(('content_subj', Pipeline([
                                            ('selector', ItemSelector(key='Text.Subj')),
                                            ('preprocessing', BodyExtraction(body_extract=body_extract_param,set_num_words=trim_words_param)),
                                            ('datetrns', DateTransformer()),
                                            ('numtrns', NumberTransformer()),
                                            ('puntrns', PunctTransformer()),
                                            ('synonyms',SynonymTransformer(syn_dict)),
                                            ('vect', CountVectorizer(stop_words = my_stop_words)),
#                                            ('tfidf', TfidfTransformer()),
                                    ])))
        else:
            pipeline_featureunion_list_desc.append(('content_subj', Pipeline([
                                                ('selector', ItemSelector(key='Text.Subj')),
                                                ('preprocessing', BodyExtraction(body_extract=body_extract_param,set_num_words=trim_words_param)),
                                                ('datetrns', DateTransformer()),
                                                ('numtrns', NumberTransformer()),
                                                ('puntrns', PunctTransformer()),
                                                ('vect', CountVectorizer(stop_words = my_stop_words)),
#                                                ('tfidf', TfidfTransformer()),
                                        ]))) 
        pipeline_featureunion_list_desc.append(('content_sub_stats', Pipeline([
            ('selector', ItemSelector(key='Text.Subj')),                            
            ('stats', TextStats()),  # returns a list of dicts
                                        ('vect', DictVectorizer()),  # list of dicts -> feature matrix
                                        ('scale',StandardScaler(with_mean=False)),
                                    ])))
        
        
pipeline_features_mnb = Pipeline([('union', FeatureUnion(
                            transformer_list=pipeline_featureunion_list_desc,
                            )),
                             #('scale',StandardScaler(with_mean=False)),
                             ('feature_selection', SelectFromModel(LinearSVC(penalty="l1",dual=False))),
                             ('clf', MultinomialNB())])
    
text_clf_mnb = pipeline_features_mnb.fit(X_cat, Y_cat)
    
print("PIPELINE FIT COMPLETED")
print(pipeline_features_mnb.get_params())

PIPELINE FIT COMPLETED
{'memory': None, 'steps': [('union', FeatureUnion(n_jobs=1,
       transformer_list=[('content_desc', Pipeline(memory=None,
     steps=[('selector', ItemSelector(key='Text.Body')), ('preprocessing', BodyExtraction(body_extract='Pick_Full_Mail', set_num_words=100)), ('datetrns', DateTransformer()), ('numtrns', NumberTransformer()), ('puntrns', PunctTransformer()), (...rue,
        sparse=True)), ('scale', StandardScaler(copy=True, with_mean=False, with_std=True))]))],
       transformer_weights=None)), ('feature_selection', SelectFromModel(estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
     verbose=0),
        norm_order=1, prefit=False, threshold=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))], 'union': FeatureUnion(n_jobs=1,
       transformer_list=[('content_desc', Pipelin

In [26]:
print(X_cat.shape)
print(Y_cat.shape)
print(X_cat.head())
print(data_df.columns.values)

(537, 1)
(537,)
                                             Text.Body
415  ______________________________________________...
317  Pham - we had agreed to honor the $2761 price ...
255  Hi Sir/Madam, \r\r\n\r\r\nWe placed a new PO L...
777  Sure Allen, let me purchase and will get back ...
659                                   IDOC#XXXXXXXXXX 
['Case.Number' 'Text.Category' 'Text.Body']


In [27]:
if 'Text.Subj' in data_df.columns.values:
    parameters_nb = [{
                                'union__content_desc__vect':  [CountVectorizer(stop_words = my_stop_words),
                                             StemmedCountVectorizer(stop_words = my_stop_words)],
                                'union__content_desc__vect__ngram_range': ngram_range_param_desc,
                                'union__content_desc__vect__max_df': max_df_param,
                                'union__content_desc__vect__min_df': min_df_param,
#                                'union__content_desc__tfidf__use_idf': true_false_param,
                                'union__content_subj__vect':  [CountVectorizer(stop_words = my_stop_words),
                                             StemmedCountVectorizer(stop_words = my_stop_words)],
#                                'union__content_subj__vect__ngram_range': ngram_range_param_subj,
#                                'union__content_subj__vect__max_df': max_df_param,
#                                'union__content_subj__vect__min_df': min_df_param,
#                                'union__content_subj__tfidf__use_idf': true_false_param,
                                'clf__alpha': alpha_param
                            },
                            {
                                'feature_selection': [SelectPercentile(chi2)],
                                'feature_selection__percentile': N_FEATURES_chi
                            },
                            {
                                'feature_selection': [SelectFromModel(ExtraTreesClassifier())],
                                'feature_selection__estimator__n_estimators' : NUMBER_OF_ESTIMATORS_RF
    
                            },
                            {   'feature_selection': [SelectPercentile(f_classif)],
                                'feature_selection__percentile': N_FEATURES_chi
    
                            },
                            {
                                'feature_selection': [SelectFromModel(LinearSVC(penalty="l1",dual=False, C =0.01))],
                            }]
    
else:
    parameters_nb = [{
                                'union__content_desc__vect':  [CountVectorizer(stop_words = my_stop_words),
                                             StemmedCountVectorizer(stop_words = my_stop_words)],
                                'union__content_desc__vect__ngram_range': ngram_range_param_desc,
                                'union__content_desc__vect__max_df': max_df_param,
                                'union__content_desc__vect__min_df': min_df_param,
#                                'union__content_desc__tfidf__use_idf': true_false_param,
                                'clf__alpha': alpha_param
                            },
                            {
                                'feature_selection': [SelectPercentile(chi2)],
                                'feature_selection__percentile': N_FEATURES_chi
                            },
                            {
                                'feature_selection': [SelectFromModel(ExtraTreesClassifier())],
                                'feature_selection__estimator__n_estimators' : NUMBER_OF_ESTIMATORS_RF
    
                            },
                            {   'feature_selection': [SelectPercentile(f_classif)],
                                'feature_selection__percentile': N_FEATURES_chi
    
                            },
                            {
                                'feature_selection': [SelectFromModel(LinearSVC(penalty="l1",dual=False, C =0.01))],
                            }]
    
    
        
    ###instance of the grid search by passing the classifier, parameters 
    # and n_jobs= -1 which tells to use multiple cores from user machine.


STEP2


In [28]:
print(parameters_nb)

[{'union__content_desc__vect': [CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=frozenset({'', 'even', 'must', 'detail', 'urlhttp', 'yours', 'if', 'in', 'and ', 'between', 'email', 'into', 'across', 'forty', 'three', 'httpsaerospaceclientnamecom', 'never', 'july', 'co', 'becomes', 'moreover', 'under', 'would', 'aerohttp', 'namely', 'thin', 'whereby', 'seem', 'call', ...clientnam', 'although', 'afterwards', 'six', 'became', 'when', 'why', 'formerly', 'very', 'ponopo'}),
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None), StemmedCountVectorizer(analyzer='word', binary=False, decode_error='strict',
            dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
            lowercase=True, max_df=1.0, max

In [29]:
f_scorer = make_scorer(f1_score, average = 'weighted')
gs_clf_mnb = GridSearchCV(text_clf_mnb, parameters_nb, scoring = f_scorer,
                              cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=32))
    
    
gs_clf_mnb = gs_clf_mnb.fit(X_cat, Y_cat)
    
print("GRID SEARCH FIT completed for Naive Bayes")
    ##Multinomial Naive Bayes estimators
mod1 = gs_clf_mnb.best_estimator_

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, wa

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / s

  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.r

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', '

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / s

  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  'precision', 'predicted', average, warn_for)
  return np.add.r

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


GRID SEARCH FIT completed for Naive Bayes


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [34]:
Metrics = pd.DataFrame()
Metrics['Models'] = ['NaiveBayes', 'XGB', 'StackedModel']
Metirics['Models']

import pandas as pd
data = [{'Model_type': NaiveBayes, 'Model_Validation_Accuracy': 2, 'Model_Validation_Precision':3, 'Model_Validation_Recall':4},
        {'Model_type': XGB, 'Model_Validation_Accuracy': 2, 'Model_Validation_Precision':3, 'Model_Validation_Recall':4}
        {'Model_type': StackedNBXGBoost, 'Model_Validation_Accuracy': 2, 'Model_Validation_Precision':3, 'Model_Validation_Recall':4}]
= pd.DataFrame(data)
print df

In [None]:
def pandas_classification_report(y_true, y_pred):
    metrics_summary = precision_recall_fscore_support(
            y_true=y_true, 
            y_pred=y_pred)

    avg = list(precision_recall_fscore_support(
            y_true=y_true, 
            y_pred=y_pred,
            average='weighted'))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary),
        index=metrics_sum_index)

    support = class_report_df.loc['support']
    total = support.sum() 
    avg[-1] = total

    class_report_df['avg / total'] = avg

    return class_report_df.T
