### Preprocessing

In [1]:
import os, os.path
import numpy as np
import gensim
from gensim.parsing.preprocessing import preprocess_string, remove_stopwords, strip_punctuation
import pickle as pkl
import itertools
from sklearn.model_selection import train_test_split
import io
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import f1_score
from keras.utils import to_categorical
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from textblob import TextBlob
import numpy as np
from nltk.corpus import stopwords
from textblob import Word
#nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt')

Using TensorFlow backend.


In [2]:
# strip punctuation and remove stopwords
def preprocess(sentence_list):
    """Lower-case a document, strip punctuation and stopwords, and tokenise it.

    Args:
        sentence_list: Iterable of strings (a single string also works); the
            pieces are concatenated without a separator before filtering.

    Returns:
        The token list produced by gensim's ``preprocess_string`` after the
        three filters have been applied in order.
    """
    pipeline = [lambda text: text.lower(), strip_punctuation, remove_stopwords]
    joined_text = ''.join(sentence_list)
    return preprocess_string(joined_text, pipeline)


In [3]:
#Build topic model 
def topic_model(total_corpus, NUM_TOPICS, random_state=None):
    """Fit an LDA model on the corpus and return per-document topic weights.

    Args:
        total_corpus: List of tokenised documents (each a list of tokens).
        NUM_TOPICS: Number of latent topics to fit.
        random_state: Optional seed forwarded to ``LdaModel`` so a fit can be
            reproduced. ``None`` (the default) keeps gensim's own behaviour.

    Returns:
        ``numpy.ndarray`` of shape ``(len(total_corpus), NUM_TOPICS)``. Entry
        ``[d, t]`` is the weight of topic ``t`` in document ``d``; topics that
        ``get_document_topics`` does not report for a document stay at 0.
    """
    dictionary = gensim.corpora.Dictionary(total_corpus)
    doc2bow_corpus = [dictionary.doc2bow(words) for words in total_corpus]
    ldamodel = gensim.models.ldamodel.LdaModel(
        doc2bow_corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=15,
        random_state=random_state)

    matrix = np.zeros((len(doc2bow_corpus), NUM_TOPICS), dtype=float)
    for row, bow in enumerate(doc2bow_corpus):
        # Index by topic id directly. The previous `i in item` test looked at
        # the whole (topic_id, weight) tuple, so a weight equal to 1.0 was
        # mistaken for topic id 1.
        for topic_id, weight in ldamodel.get_document_topics(bow):
            matrix[row, topic_id] = float(weight)
    return matrix
 

In [6]:
#Build matrix of topic models for texts and subjects
def build_topic_matrix(model, total_corpus, NUM_TOPICS,
                       save_path='/Users/Hannah/ML_project/Enron/'):
    """Return the topic-weight matrix for texts or subjects, cached on disk.

    If the cache file for ``model`` exists in ``save_path`` it is loaded;
    otherwise the matrix is trained with ``topic_model`` and saved there.

    Args:
        model: ``'texts_topic'`` or ``'subject_topic'``.
        total_corpus: List of tokenised documents passed to ``topic_model``.
        NUM_TOPICS: Number of topics passed to ``topic_model``.
        save_path: Directory holding the ``.npy`` cache files.

    Returns:
        The matrix returned by ``topic_model`` or loaded from the cache.

    Raises:
        ValueError: If ``model`` is not one of the two supported names.

    NOTE: the cache file names do not encode ``NUM_TOPICS`` or the corpus, so
    a stale file must be deleted by hand when either changes.
    """
    # model -> (cache file name, word used in the progress message)
    cache_files = {
        'texts_topic': ('topic_matrix_50.npy', 'text'),
        'subject_topic': ('subject_topic_10_matrix.npy', 'subject'),
    }
    if model not in cache_files:
        raise ValueError(
            "model must be 'texts_topic' or 'subject_topic', got %r" % (model,))

    file_name, label = cache_files[model]
    cache_path = os.path.join(save_path, file_name)

    if os.path.isfile(cache_path):
        print('Load %s topic model...' % label)
        matrix = np.load(cache_path)
    else:
        print('Train %s topic model...' % label)
        matrix = topic_model(total_corpus, NUM_TOPICS)
        # Save under the same name that is checked above. The subject branch
        # used to write 'subject_topic_matrix.npy', so its cache never hit.
        np.save(cache_path, matrix)

    return matrix

In [7]:
#Calculate how many sentences are in a text
def sequence_num(docs):
    """Count the sentences found by ``sent_tokenize`` in each document.

    Returns a list with one single-element list ``[n_sentences]`` per
    entry of ``docs``.
    """
    counts = []
    for document in docs:
        sentences = sent_tokenize(document)
        counts.append([len(sentences)])
    return counts
        

In [8]:
#Calculate how many tokens (stopwords included) are in a text
def token_num(docs):
    """Count the tokens found by ``word_tokenize`` in each document.

    Returns a list with one single-element list ``[n_tokens]`` per entry
    of ``docs``.
    """
    counts = []
    for document in docs:
        tokens = word_tokenize(document)
        counts.append([len(tokens)])
    return counts

In [9]:
#Calculate how many tokens (stopwords removed) are in a text
def token_num_wo_stopword(docs):
    """Count the tokens that remain in each document after ``preprocess``.

    Returns a list with one single-element list ``[n_tokens]`` per entry
    of ``docs``.
    """
    return [[len(preprocess(document))] for document in docs]

In [10]:
#Calculate how many special characters are in a text
def special_char_num(dataset):
    """Count special characters in every document.

    Args:
        dataset: Iterable of documents; each document is an iterable of
            strings that are joined with single spaces before counting.

    Returns:
        Tuple ``(ave_s_char_num, s_char_num)`` of parallel lists with one
        single-element list per document: ``[count / len(text)]`` and
        ``[count]``. An empty document yields a ratio of ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    # Built once (not per document) and as a set for O(1) membership tests.
    special_chars = frozenset("#$%&()*+-/<=>@[\\]^_`{|}~\t\n")

    ave_s_char_num = []
    s_char_num = []
    for seq in dataset:
        text = ' '.join(seq)
        count = sum(1 for ch in text if ch in special_chars)
        ratio = count / len(text) if text else 0.0
        ave_s_char_num.append([ratio])
        s_char_num.append([count])

    return ave_s_char_num, s_char_num


In [11]:
#Calculate how many punctuation characters are in a text
def puncuation_char_num(dataset):
    """Count punctuation characters in every document.

    Args:
        dataset: Iterable of documents; each document is an iterable of
            strings that are joined with single spaces before counting.

    Returns:
        Tuple ``(ave_p_char_num, p_char_num)`` of parallel lists with one
        single-element list per document: ``[count / len(text)]`` and
        ``[count]``. An empty document yields a ratio of ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    # Built once (not per document); the duplicated ';' entry is gone.
    punctuation_chars = frozenset(",.'!\";?:")

    ave_p_char_num = []
    p_char_num = []
    for seq in dataset:
        text = ' '.join(seq)
        count = sum(1 for ch in text if ch in punctuation_chars)
        ratio = count / len(text) if text else 0.0
        ave_p_char_num.append([ratio])
        p_char_num.append([count])

    return ave_p_char_num, p_char_num

In [12]:
#Calculate how many functional words are in a text
def functional_words(docs):
    """Count functional (closed-class) words in every document.

    Each document has its punctuation stripped and is tokenised with
    ``word_tokenize``; a token counts when its lower-cased form is in the
    word list below.

    Args:
        docs: Iterable of document strings.

    Returns:
        Tuple ``(ave_functional_words, sum_functional_words)`` of parallel
        lists with one single-element list per document:
        ``[count / (n_tokens + 1)]`` and ``[count]``. The ``+ 1`` keeps the
        ratio defined for documents without tokens.
    """
    word_list = """a between in nor some upon
    about both including nothing somebody us
    above but inside of someone used
    after by into off something via
    all can is on such we
    although cos it once than what
    am do its one that whatever
    among down latter onto the when
    an each less opposite their where
    and either like or them whether
    another enough little our these which
    any every lots outside they while
    anybody everybody many over this who
    anyone everyone me own those whoever
    anything everything more past though whom
    are few most per through whose
    around following much plenty till will
    as for must plus to with
    at from my regarding toward within
    be have near same towards without
    because he need several under worth
    before her neither she unless would
    behind him no should unlike yes
    below i nobody since until you
    beside if none so up your
    """
    # A set gives O(1) lookups; a separate name avoids shadowing the function.
    vocabulary = frozenset(word_list.split())

    sum_functional_words = []
    ave_functional_words = []

    for text in docs:
        tokens = word_tokenize(strip_punctuation(text))
        # The list is lower-case (it even contains "i"), so compare the
        # lower-cased token; otherwise "The" or "I" are never counted.
        count = sum(1 for token in tokens if token.lower() in vocabulary)
        # Normalise by the number of tokens. The old code divided by the
        # character length of the stripped string, so the value was not a
        # per-word ratio.
        ave_functional_words.append([count / (len(tokens) + 1)])
        sum_functional_words.append([count])
    return ave_functional_words, sum_functional_words

In [13]:
#Calculate how many distinct words are in a text
def set_word(docs):
    """Count the distinct tokens of each document.

    Punctuation is stripped before tokenising with ``word_tokenize``;
    tokens are compared exactly as produced (case-sensitive).

    Returns a list with one single-element list ``[n_distinct]`` per entry
    of ``docs``.
    """
    return [
        [len(set(word_tokenize(strip_punctuation(document))))]
        for document in docs
    ]

In [14]:
#Load subjects and texts
path='/Users/Hannah/ML_project/Enron/data/'

dataset=[]          # one list of lines per e-mail body
class_name=[]       # class folder name; its position is the label id
count_labels=[]     # per class: [label_id] * number_of_texts
subject_dataset=[]  # one list of lines per e-mail subject, aligned with dataset

# os.listdir() returns entries in arbitrary order, so sort to make the label
# ids and the row order reproducible. Entries that are not directories
# (e.g. '.DS_Store') are skipped instead of crashing the loop.
# NOTE: topic matrices cached from a different file order are row-misaligned
# with this dataset and must be regenerated.
class_dirs = sorted(
    entry for entry in os.listdir(path)
    if os.path.isdir(os.path.join(path, entry)))

for i, names in enumerate(class_dirs):
    text_dir = os.path.join(path, names, 'text')
    subject_dir = os.path.join(path, names, 'subject')
    # List the folder once; hidden files are ignored.
    text_files = sorted(
        entry for entry in os.listdir(text_dir) if not entry.startswith('.'))

    class_name.append(names)
    count_labels.append([i] * len(text_files))
    for texts in text_files:
        # The subject file name is built from the first two dot-separated
        # parts of the text file name.
        title = '.'.join(texts.split('.')[:2])
        with open(os.path.join(text_dir, texts)) as file:
            dataset.append(file.readlines())
        subject_name = title + '.JavaMail.evans@thyme.subject'
        with open(os.path.join(subject_dir, subject_name)) as subject_file:
            subject_dataset.append(subject_file.readlines())

labels = list(itertools.chain.from_iterable(count_labels))
num_labels = [len(x) for x in count_labels]
accum_labels = np.cumsum(num_labels)

In [15]:
#Build corpus
# Raw text of every e-mail body: its lines joined with single spaces.
docs = [' '.join(lines) for lines in dataset]

# Token lists (lower-cased, punctuation and stopwords removed) for the
# bodies and for the subject lines.
total_corpus = [preprocess(lines) for lines in dataset]
subject_total_corpus = [preprocess(lines) for lines in subject_dataset]

In [16]:
#Build Topic models for texts and subjects
NUM_TEXT_TOPICS = 50
NUM_SUBJECT_TOPICS = 10

# Document-topic weights for the e-mail bodies.
topic_50_matrix = build_topic_matrix(
    model='texts_topic',
    total_corpus=total_corpus,
    NUM_TOPICS=NUM_TEXT_TOPICS,
)
# Document-topic weights for the subject lines (NUM_SUBJECT_TOPICS topics,
# despite the "50" in the variable name).
subject_topic_50_matrix = build_topic_matrix(
    model='subject_topic',
    total_corpus=subject_total_corpus,
    NUM_TOPICS=NUM_SUBJECT_TOPICS,
)

Loading text topic model...
Training subject topic model...


In [17]:
#Generate statistics about the texts
# Length features, computed on the joined document strings.
seq_num = sequence_num(docs)
tok_num = token_num(docs)
tok_num_wo_stopw = token_num_wo_stopword(docs)

# Character-level features, computed on the raw line lists.
ave_s_char_num, s_char_num = special_char_num(dataset)
ave_p_char_num, p_char_num = puncuation_char_num(dataset)

# Vocabulary features: functional-word usage and distinct-token count.
ave_functional_words, sum_functional_words = functional_words(docs)
set_words = set_word(docs)

In [18]:
#Combine statistics
# Column-wise stack: ten single-column statistics followed by the text and
# subject topic matrices. Every block has one row per document.
feature_blocks = (
    seq_num,
    tok_num,
    tok_num_wo_stopw,
    ave_s_char_num,
    s_char_num,
    ave_p_char_num,
    p_char_num,
    ave_functional_words,
    sum_functional_words,
    set_words,
    topic_50_matrix,
    subject_topic_50_matrix,
)
X = np.concatenate(feature_blocks, axis=1)

In [19]:
#Split data to training set and validation
# One call splits the feature matrix, the raw texts and the labels with the
# same shuffled, stratified row indices.
(X_train_feature, X_valid_feature,
 X_train, X_valid,
 y_train, y_valid) = train_test_split(
    X, docs, labels, test_size=0.3, stratify=labels, random_state=66)

#Calculate TFIDF
# The vocabulary is learned on the training texts only; the validation texts
# are transformed with it.
tfidf_vectorizer = TfidfVectorizer(
    max_features=3000, min_df=3, max_df=0.7, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)

#Combine the statistic/topic features with the dense TF-IDF features
X_train_all = np.concatenate(
    (X_train_feature, X_train_tfidf.toarray()), axis=1)
X_valid_all = np.concatenate(
    (X_valid_feature, X_valid_tfidf.toarray()), axis=1)

### Model

In [None]:
#Wrap the data in dmatrix
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# Use the combined numeric matrices (statistics + topic weights + TF-IDF).
# X_train / X_valid hold the raw text strings and cannot be fed to a DMatrix.
dtrain = xgb.DMatrix(data=X_train_all, label=y_train)
dtest = xgb.DMatrix(data=X_valid_all)

In [None]:
#Set params
params = {
    "objective": 'multi:softmax',
    # One class per folder loaded above, instead of a hard-coded count.
    'num_class': len(class_name),
    "booster": "dart",
    # 'auc' needs probability outputs (multi:softprob); with multi:softmax
    # the multiclass error rate is the matching metric.
    "eval_metric": "merror",
    "eta": 0.04,
    "tree_method": 'exact',
    "max_depth": 7,
    # NOTE(review): 0.07 samples only 7% of the rows per tree; confirm this
    # is intended and not a typo for 0.7.
    "subsample": 0.07,
    "colsample_bytree": 0.8,
    "silent": 0,
    "alpha": 0.01,
}

In [None]:
#Build XGBoost classifier
# Train the booster for 30 rounds, then predict a class id for every
# validation row.
xgb_clf = xgb.train(params=params, dtrain=dtrain, num_boost_round=30)
y_pred = xgb_clf.predict(data=dtest)

In [None]:
#Use f1 score as the evaluation metric
# Micro-averaged F1 over all validation samples.
f1_score(y_true=y_valid, y_pred=y_pred, average='micro')

In [None]:
#Print the classification report
# Pass the full label set so there is exactly one row per entry of
# class_name, even if a class is missing from both y_valid and y_pred.
# Without it the number of labels found can differ from len(class_name).
print(classification_report(
    y_valid, y_pred,
    labels=list(range(len(class_name))),
    target_names=class_name))