# Document Classification #

Code sample for basic document classification using the scikit-learn machine learning library for Python 2.

In [1]:
# core python
import io
import os

# string management
import re
from unidecode import unidecode

# data management
from pandas import DataFrame
from pandas import concat

### function 1: walk directory

In [5]:
def read_dir(path, SPLITCHAR = '\n', NORM = False):
    """ collect text segments from every utf-8 file found below path (os.walk, so all nested subdirectories)
    - SPLITCHAR: separator each file's text is split on (default newline, i.e. one segment per line/paragraph)
    - NORM: when true, each segment is normalized: runs of non-word characters become a single space,
      digits are removed, repeated spaces are collapsed, then the text is lowercased and passed through unidecode
    returns (labels, segments): two parallel lists, where each label is '<filename>_<n>' and n counts the
    non-blank segments kept from that file, starting at 0
    """
    labels, segments = [], []
    # (pattern, replacement) pairs applied in this order when NORM is set
    cleanup_steps = ((r'\W+', ' '), (r'\d', ''), (r'  +', ' '))
    for folder, _subdirs, basenames in os.walk(path):
        for basename in basenames:
            with io.open(os.path.join(folder, basename), 'r', encoding = 'utf-8') as handle:
                pieces = handle.read().split(SPLITCHAR)
            # the first segment of every file is always discarded
            # (presumably a title/header line -- TODO confirm against the corpus)
            del pieces[0]
            # only trailing whitespace is trimmed; segments that end up empty are skipped
            # and do not consume a label number
            trimmed = (piece.rstrip() for piece in pieces)
            for n, segment in enumerate(s for s in trimmed if s):
                if NORM:
                    for pattern, filler in cleanup_steps:
                        segment = re.sub(pattern, filler, segment)
                    segment = unidecode(segment.lower())
                segments.append(segment)
                labels.append(basename+'_'+ str(n))
    return labels, segments

### function 2: add labels to directory walk & write to dataframe

In [6]:
def make_df(path, classification):
    """ turn the directory walk of path into a dataframe whose rows all carry one class label
    - path: root directory handed to read_dir (called with NORM = True, so text is normalized)
    - classification: value stored in the 'class' column of every row
    returns a DataFrame with a 'text' and a 'class' column, indexed by the labels read_dir returns
    """
    labels, texts = read_dir(path, NORM = True)
    # read_dir returns two parallel lists, so the labels line up one-to-one with the rows
    records = [{'text': text, 'class': classification} for text in texts]
    return DataFrame(records, index = labels)

### execute and write to .csv

In [8]:
## CLASS LABELS
NT = 'new_testament'
OT = 'old_testament'
### map CLASS to PATH
SRCS = [("DATA/KJV/OT", OT),("DATA/KJV/NT", NT)]

## Build dataframe
# Collect one labelled frame per source directory and stack them with a single
# concat (rows stay in SRCS order, filename labels stay as the index).
# DataFrame.append re-copied the accumulated frame on every iteration and was
# removed in pandas 2.0.
frames = []
for path, classification in SRCS:
    frames.append(make_df(path, classification))
DATA = concat(frames)

### inspect
# print(x) with a single argument behaves the same on Python 2 and Python 3
print(DATA.shape)
# NOTE: the two bare expressions below have their results discarded here;
# wrap them in print() to actually see the first/last rows
DATA.head()
DATA.tail()
print(DATA.text.iloc[0])

## export
DATA.to_csv("DATA/CLASS_DATA.csv")

(81651, 2)
the book of esther
