In [1]:
#! /usr/bin/env python
# encoding: utf-8
%matplotlib inline
import pandas as pd
import numpy as np
import math
import random
import itertools
from collections import Counter, deque
import matplotlib.pyplot as plt
import seaborn as sns
import os
import unidecode
import nltk

import gensim

import cPickle as pickle
import re

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC



from sklearn.ensemble import VotingClassifier
from sklearn import cross_validation

from sklearn.utils import shuffle

from collections import defaultdict

from gensim import models

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cross_validation import train_test_split
from sklearn.learning_curve import learning_curve
from sklearn.metrics import accuracy_score
from sklearn import cross_validation


from nltk.stem.snowball import RussianStemmer
from nltk.tokenize import sent_tokenize


# Load the word2vec embeddings stored in binary format; `model` is a
# module-level global that get_word_vectors() indexes by word.
model = models.Word2Vec.load_word2vec_format('models/ruscorpora_russe.model.bin', binary=True)



#### define functions to load data

In [2]:
##for single files
def get_lines(file_path):
    '''
    Reads a single text file and returns its lines as a
    list of strings with the newline characters stripped
    '''
    with open(file_path, 'r') as handle:
        return [raw.strip("\n") for raw in handle]

    
def get_author(dirname):
    '''
    Reads every file in a directory and returns all of
    their lines, lower-cased and with newlines stripped,
    as one flat list of strings

    Files are visited in sorted name order so the result
    does not depend on the order os.listdir happens to
    report, and each file is closed as soon as it is read
    '''
    lines = list()
    for fname in sorted(os.listdir(dirname)):
        with open(os.path.join(dirname, fname)) as f:
            lines.extend(line.lower().strip("\n") for line in f)
    return lines


stemmer = RussianStemmer()


def stem_words(word):
    '''
    Returns the stem of a single word as produced by
    the module-level RussianStemmer instance
    '''
    return stemmer.stem(word)

def strip_decode(text):
    '''
    Returns text as a unicode string with leading and
    trailing whitespace removed

    Byte strings are decoded as UTF-8. Text that is already
    unicode is only stripped, so applying this function to
    previously decoded text (for example when a cell is run
    a second time) does not raise
    '''
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    return text.strip()

In [3]:
def get_chapters(path, min_lines=50):
    '''
    Breaks a text corpus into chunks separated by empty
    lines and returns the output as a list of string objects

    Consecutive non-empty lines form one chunk and every empty
    line closes the current chunk. Chunks with fewer than
    min_lines lines are discarded; the lines of each remaining
    chunk are joined with single spaces. Chunks are returned in
    the order they appear in the file.

    path      -- path of the text file to read
    min_lines -- minimum number of lines a chunk needs in order
                 to be kept (default 50)
    '''
    chapters = list()
    current = list()
    with open(path) as book:
        for raw in book:
            # drop the line terminator first so that "\r\n" blank lines and a
            # one-character final line without a newline are classified correctly
            line = raw.rstrip("\r\n")
            if line:
                current.append(line)
                continue
            if current and len(current) >= min_lines:
                chapters.append(" ".join(current))
            current = list()

    # the file may end without a closing empty line
    if current and len(current) >= min_lines:
        chapters.append(" ".join(current))

    return chapters
        

#### structure text in dataframe

In [4]:
def make_df(author, text):
    '''
    Builds a two-column dataframe from one author's text chunks

    author -- label repeated on every row of the 'author' column
    text   -- iterable of text chunks, one per row of the 'text' column
    '''
    # materialize once so any iterable works and the length is known
    chunks = list(text)
    s = {'text': pd.Series(chunks),
         'author': pd.Series([author] * len(chunks))}
    return pd.DataFrame(s)

def cat_dfs(df_list):
    '''
    Stacks a list of dataframes row-wise into one dataframe
    with a fresh 0..n-1 index

    A list holding a single dataframe returns that dataframe
    unchanged; an empty list raises IndexError
    '''
    frames = list(df_list)
    if not frames:
        raise IndexError("cat_dfs needs at least one dataframe")
    if len(frames) == 1:
        return frames[0]
    # one concat instead of repeated DataFrame.append, which copied the
    # accumulated frame on every iteration
    return pd.concat(frames, ignore_index=True)


def cleaning_pipeline(df):
    '''
    Moves the current index into a column via reset_index and
    keeps only the rows whose text has a length greater than 5
    '''
    indexed = df.reset_index()
    long_enough = indexed['text'].map(len) > 5
    return indexed[long_enough]


#### feature engineering

In [5]:
def token_words(text):
    '''
    Tokenizes the lower-cased, stripped input text
    and stems every token

    Returns a (words, tokens) pair: tokens holds the stemmed
    tokens and words holds the same entries with the characters
    , . ; : deleted. Both lists have the same length, so a token
    consisting only of those characters stays in words as an
    empty string.
    '''
    stemmed = [stem_words(tok)
               for tok in nltk.word_tokenize(text.lower().strip())]

    cleaned = []
    for tok in stemmed:
        for mark in (",", ".", ";", ":"):
            tok = tok.replace(mark, "")
        cleaned.append(tok)

    return cleaned, stemmed
    


def words_per_sent(text):
    '''
    Returns the average token count per sentence
    for a given input text, or 0.0 when the text
    contains no sentences
    '''
    sentences = sent_tokenize(text)
    if not sentences:
        # mean() of an empty array would be nan (with a RuntimeWarning)
        return 0.0
    words_per_sentence = np.array([len(nltk.word_tokenize(s))
                                       for s in sentences])
    return words_per_sentence.mean()


def lexical_div(text):
    '''
    Returns the ratio of unique words
    to total words in a given text to
    measure diversity in diction

    Returns 0.0 when the text yields no words,
    instead of dividing by zero
    '''
    words, tokens = token_words(text)
    if not words:
        return 0.0

    return len(set(words)) / float(len(words))


def len_var(text):
    '''
    Returns the standard deviation of the sentence
    lengths (in tokens) of the paragraph, or 0.0
    when the text contains no sentences
    '''
    sentences = sent_tokenize(text)
    if not sentences:
        # std() of an empty array would be nan (with a RuntimeWarning)
        return 0.0
    words_per_sentence = np.array([len(nltk.word_tokenize(s))
                                       for s in sentences])
    return words_per_sentence.std()


def punct_per_sent(text, punct):
    '''
    Returns the average count of one punctuation
    token per sentence in the paragraph, or 0.0
    when the text contains no sentences

    punct is converted with str() and compared against
    the tokens produced by nltk.word_tokenize
    '''
    sentences = sent_tokenize(text)
    if not sentences:
        return 0.0
    # count over word-level tokens; the sentence list must not be unpacked
    # into two names, and token_words() would delete , . ; : before counting
    tokens = nltk.word_tokenize(text)
    return tokens.count(str(punct)) / float(len(sentences))


def syntax_vector(text):
    '''
    Tags the tokens of the paragraph with nltk.pos_tag, counts
    how often each of six selected tags occurs and returns the
    standard deviation of those six counts as a single number
    '''
    # NOTE(review): the selected tags look like English Penn Treebank tags
    # while the rest of the notebook uses Russian resources -- confirm that
    # nltk.pos_tag is the intended tagger for this corpus
    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    tag_counts = Counter(tag for _, tag in tagged)
    selected = ('NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS')
    return np.array([tag_counts[pos] for pos in selected]).std()


def get_word_vectors(text, dim=300):
    '''
    Looks up every word of the paragraph in the module-level
    word2vec model and returns the vectors that were found,
    stacked into one float array with one row per word

    Words missing from the model are skipped. When none of the
    words is found an empty array of shape (0, dim) is returned
    instead of letting np.concatenate raise on an empty list.

    text -- paragraph to vectorize
    dim  -- length of the model's word vectors (default 300)
    '''
    words, tokens = token_words(text)
    vecs = []
    for word in words:
        try:
            vecs.append(model[word].reshape((1, dim)))
        except KeyError:
            # word is not in the model vocabulary: skip it on purpose
            continue
    if not vecs:
        return np.empty((0, dim), dtype='float')
    return np.array(np.concatenate(vecs), dtype='float')


def vector_avg(auth):
    '''
    Collapses the word vectors of a target text into one number:
    the vectors are averaged coordinate-wise, nan and +/-inf
    entries of that average are replaced with finite values, and
    the square of the sum of its coordinates is returned

    Returns 0.0 when no word of the text has a vector
    '''
    vectors = get_word_vectors(auth)
    if len(vectors) == 0:
        # nothing to average; avoids dividing by a zero count
        return 0.0

    # calculate the average vector and replace nan, +inf and -inf with numeric values
    avg_vector = np.nan_to_num(vectors.mean(axis=0))
    return sum(avg_vector)**2



def features_pipe(df):
    '''
    Returns a copy of df with the text decoded and stripped
    and one feature column added per feature function:
    'wps', 'lex_fv', 'len_var', 'syntax_fv' and 'word_vec'

    The input dataframe is left untouched. Working on a copy
    also avoids assigning columns onto a filtered slice of
    another dataframe.
    '''
    out = df.copy()
    # every feature below is computed from the decoded text
    out['text'] = out['text'].map(strip_decode)
    out['wps'] = out['text'].map(words_per_sent)
    out['lex_fv'] = out['text'].map(lexical_div)
    out['len_var'] = out['text'].map(len_var)
    out['syntax_fv'] = out['text'].map(syntax_vector)
    out['word_vec'] = out['text'].map(vector_avg)

    return out

##### append syntactic features

In [14]:
def auth_gen(**kwargs):
    '''
    Builds the feature dataframe for a set of authors

    Each keyword names an author directory under 'authors' and
    its value is either the name of one work or a list/tuple of
    work names. Every work is read from
    authors/<author>/<work>.txt with get_chapters, and all chunks
    of an author are labelled with the keyword. The per-author
    dataframes are concatenated, cleaned and passed through
    features_pipe.
    '''
    authors_d = dict()
    for author, works in kwargs.items():
        # keyword names are unique, so each author is handled exactly once;
        # several works per author are passed as a list instead
        if not isinstance(works, (list, tuple)):
            works = [works]
        chapters = list()
        for work in works:
            chapters.extend(
                get_chapters(os.path.join('authors', author, work + ".txt")))
        authors_d[author] = chapters

    dfs = [make_df(k, v) for k, v in authors_d.items()]
    df = cleaning_pipeline(cat_dfs(dfs))
    return features_pipe(df)

In [18]:
# Map each author directory name to the work to load for that author,
# then build the dataframe of texts and features with auth_gen.
auth_works = {'zamyatin':'we', 'turgenev':'fathers_sons'}
df = auth_gen(**auth_works)

In [19]:
def get_xy(df):
    '''
    Splits the feature dataframe into a feature matrix X and
    the author labels y, shuffled together with a fixed seed

    The 'author', 'text' and 'index' columns are left out of X.
    The input dataframe is not modified, so the function can be
    called again on the same dataframe.
    '''
    y = df['author']
    # drop returns a new frame; the caller's df keeps all of its columns
    X = df.drop(['author', 'text', 'index'], axis=1)
    X, y = shuffle(X, y, random_state=0)
    return X, y

In [20]:
# Split the feature dataframe into the shuffled feature matrix and labels.
X,y = get_xy(df)

##### build model

In [1]:
def train_predict(X, y):
    '''
    Scores three classifiers and a soft-voting ensemble of them
    with 5-fold cross-validated accuracy on the features X and
    labels y, printing the mean and standard deviation of the
    fold scores for each model

    Nothing is returned; the results are only printed
    '''
    log_reg = LogisticRegression()
    forest = RandomForestClassifier()
    bayes = GaussianNB()

    ensemble = VotingClassifier(
        estimators=[('lr', log_reg), ('rf', forest), ('mnb', bayes)],
        voting='soft',
        weights=[1,1,1])

    # seed numpy's global generator before any cross-validation run
    np.random.seed(123)

    candidates = [('Logistic Regression', log_reg),
                  ('Random Forest', forest),
                  ('naive Bayes', bayes),
                  ('Ensemble', ensemble)]
    for label, clf in candidates:
        scores = cross_validation.cross_val_score(clf, X, y, cv=5, scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

    

In [21]:
# Print the cross-validated accuracy of each classifier and the ensemble.
train_predict(X,y)

Accuracy: 0.84 (+/- 0.14) [Logistic Regression]
Accuracy: 0.85 (+/- 0.15) [Random Forest]
Accuracy: 0.84 (+/- 0.14) [naive Bayes]
Accuracy: 0.84 (+/- 0.20) [Ensemble]
