# text classifier #

In [1]:
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
binary classification with classes in separate directories in root directory
- logistic regresssion, feature engineering, vector space optimization, visualization

"""
from __future__ import division
import io, os, random, re
import numpy as np
from pandas import DataFrame
from unidecode import unidecode

def read_files(path, SPLITCHAR = '\n\n', normalization = False):
    paragraphs_ls, filenames_ls = [], []
    for (root, dirnames, filenames) in os.walk(path):
        for filename in filenames:
            filepath = os.path.join(root,filename)
            with io.open(filepath, 'r', encoding = 'utf-8') as f:
                text = f.read()
                paragraphs = text.split(SPLITCHAR)
                del paragraphs[0]
                i = 0
                for paragraph in paragraphs:
                    paragraph = paragraph.rstrip()
                    if paragraph:
                        if normalization:
                            paragraph = re.sub(r'\W+',' ', paragraph)
                            paragraph = re.sub(r'\d','',paragraph)
                            paragraph = re.sub(r'  +',' ',paragraph)
                            paragraph = unidecode(paragraph.lower())
                        paragraphs_ls.append(paragraph)
                        filenames_ls.append(filename+'_'+str(i))
                        i += 1
    return filenames_ls, paragraphs_ls

def make_df(path, classification):
    filenames, paragraphs = read_files(path, normalization = True)
    rows = []
    idx = []
    i = 0
    for paragraph in paragraphs:
        rows.append({'text': paragraph, 'class': classification})
        idx.append(filenames[i])
        i += 1
    df = DataFrame(rows, index = idx)
    return df

# change to workgin directory
os.chdir(os.path.expanduser('~/Documents/edu/bootcamps/tm_bootcamp'))

### generate data
OT = 'old_testament'
NT = 'new_testament'

SRCS = [('DATA/kjv_books/ot',OT),('DATA/kjv_books/nt',NT)]

data = DataFrame({'text': [], 'class': []})

for path, classification in SRCS:
    data = data.append(make_df(path, classification))

print data.head()
print data.tail()
print data.shape

# unbias
def printdist(df):
    """
    Data-specific function for printing distributions of binary classification data
    """
    print 'class distribution: ', OT, sum(df['class'] == OT), NT, sum(df['class'] == NT)

printdist(data)

def unbias_data(df, n):
    random.seed(1234)
    res = DataFrame({'text': [], 'class': []})
    C = list(set(df['class']))
    for c in C:
        idx = df[df['class'] == c].index.tolist()
        df_c = df.loc[random.sample(idx, n)]
        res = res.append(df_c)
    return res.reindex(np.random.permutation(res.index))


data_800 = unbias_data(data, 800)

printdist(data_800)

# split data
ratio = 0.8
mask = np.random.rand(len(data_800)) <= ratio
data_train = data_800[mask]
data_test  = data_800[~mask]

data_train.shape
data_test.shape

## features (text) og repsponse (class)
# training
train_X = data_train['text'].values
train_y = data_train['class'].values
# test
test_X = data_test['text'].values
test_y = data_test['class'].values

### model training
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
from sklearn.linear_model import LogisticRegressionCV

# vectorizer
vec = CountVectorizer()
train_feat = vec.fit_transform(train_X)

feat_names = vec.get_feature_names()

# classifier
clf = LogisticRegressionCV()
clf.fit(train_feat, train_y)

### model validation
test_feat = vec.transform(test_X)
pred = clf.predict(test_feat)

# performance metrics
confmat = metrics.confusion_matrix(test_y, pred)
perf_acc = metrics.accuracy_score(test_y, pred)

# model summary
print metrics.classification_report(test_y,pred)

# most infomrative features
coef_feat_names = sorted(zip(clf.coef_[0], feat_names))



                      class                                               text
Esther.txt_0  old_testament   now it came to pass in the days of ahasuerus ...
Esther.txt_1  old_testament   on the seventh day when the heart of the king...
Esther.txt_2  old_testament   then the king said to the wise men which knew...
Esther.txt_3  old_testament   after these things when the wrath of king aha...
Esther.txt_4  old_testament   now in shushan the palace there was a certain...
                        class  \
Hebrews.txt_15  new_testament   
Hebrews.txt_16  new_testament   
Hebrews.txt_17  new_testament   
Hebrews.txt_18  new_testament   
Hebrews.txt_19  new_testament   

                                                             text  
Hebrews.txt_15   let brotherly love continue be not forgetful ...  
Hebrews.txt_16   remember them which have the rule over you wh...  
Hebrews.txt_17   pray for us for we trust we have a good consc...  
Hebrews.txt_18   salute all them that have the rule ove

## application of ELI-5



In [2]:
import eli5

eli5.show_weights(clf, vec = vec, top = 25)

Weight?,Feature
+4.583,<BIAS>
+0.954,land
+0.939,king
+0.842,israel
… 3601 more positive …,… 3601 more positive …
… 3492 more negative …,… 3492 more negative …
-0.850,love
-0.867,believed
-0.898,john
-0.936,but


In [3]:
i = 2
text = data_800['text'][i]
print data_800['class'][i]

eli5.show_prediction(clf, text, vec = vec)

old_testament


Contribution?,Feature
4.583,<BIAS>
1.361,Highlighted in text (sum)
