In [None]:
from __future__ import division  # no need to worry about integer division

from pymongo import MongoClient
import psycopg2 as pg2
import json

from pandas.io.sql import read_sql
import pandas as pd
import numpy as np

from itertools import chain, izip

import matplotlib.pylab as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Global plot styling: white background, Avenir typeface.
sns.set(style='white', font='Avenir')

In [None]:
# Connection keyword arguments come from a credentials file kept outside the
# project tree; the database name is stored separately in '../.dbname'.
with open('../../auth/postgres/postgres.json', 'r') as cred_file:
    PGCONN = json.loads(cred_file.read())

with open('../.dbname', 'r') as dbname_file:
    PGCONN['dbname'] = json.loads(dbname_file.read())['dbname']

In [None]:
# Fetch every basic_stats2 row written by 'satoshi' into a DataFrame.
q = '''
    SELECT * FROM basic_stats2
    WHERE author = 'satoshi';
    '''

conn = pg2.connect(**PGCONN)
try:
    df = read_sql(q, conn)
finally:
    # Always release the connection, even when the query fails; otherwise a
    # failed cell leaves an open session behind for the life of the kernel.
    conn.close()

In [None]:
# (rows, columns) of the rows fetched for author 'satoshi'.
df.shape

In [None]:
# Summary statistics of the columns, computed separately for each `type`.
df.groupby('type').describe()

In [None]:
# Column-wise totals per `type`; the ratio features in the later cells are
# added to this frame as new columns.
df_sum = df.groupby('type').sum()

# Sentences:

### Observations:
* hard to extract paragraph from e-mail and forum posts
    * sentence/paragraph not comparable between paper/email vs. forum (mostly answering people's questions)
* word count: 
    * paper ~ 3000; 
    * email ~ 5000; 
    * forum ~ 50000
* space after period: 
    * paper: ~one space
    * email: ~two spaces
    * forum: ~one space
    * periods_twospaces/periods_onespace -- scale goes from 0 to inf -- not a good feature
* sentences:
    * upper + lower ~= 88% of punkt -- use punkt

    
### Preliminary set of features for author attribution (satoshi vs. satoshi):
* periods_onespace/sentences_punkt
* periods_twospaces/sentences_punkt
* words/sentences_punkt

In [None]:
# Word totals next to the three period-spacing counts, per type.
df_sum.loc[:, ['words', 'periods_nospace', 'periods_onespace', 'periods_twospaces']]

In [None]:
# Sentence totals from the three sentence counters, per type.
df_sum.loc[:, ['sentences_punkt', 'sentences_upper', 'sentences_lower']]

In [None]:
# Share of the punkt sentence count covered by the upper + lower counts.
df_sum['sentences_upper'].add(df_sum['sentences_lower']).div(df_sum['sentences_punkt'])

In [None]:
# Average words per sentence under each of the two sentence definitions.
df_sum['words/sentences_punkt'] = df_sum['words'].div(df_sum['sentences_punkt'])
df_sum['words/sentences_upper_lower'] = df_sum['words'].div(
    df_sum['sentences_upper'].add(df_sum['sentences_lower'])
)

In [None]:
# Compare the two words-per-sentence estimates side by side.
df_sum.loc[:, ['words/sentences_punkt', 'words/sentences_upper_lower']]

# Words:

### Observations:
* all types of writing: words_US >= words_GB -- how much of the difference is because of the dictionaries used?

### Preliminary set of features for author attribution (satoshi vs. satoshi):
* words_nostop/words
* words_primaryvb/words
* words_to/words
* words_verb/words
* words_noun/words
* words_desc/words
* words_det/words
* words_conj/words
* words_gb/words_us

In [None]:
# Normalise each raw count by the total word count of its type.
for count_col in ('words_nostop', 'words_primaryvb', 'words_to'):
    df_sum['%s/words' % count_col] = df_sum[count_col] / df_sum['words']

In [None]:
# Non-stopword, primary-verb and "to" shares of all words, per type.
df_sum.loc[:, ['words_nostop/words', 'words_primaryvb/words', 'words_to/words']]

In [None]:
# Coarse part-of-speech groups. Each key names a group; each value lists the
# tag suffixes whose per-tag `words_<tag>` count columns are added together.
# NOTE(review): 'pp$' may be a typo for 'prp$' (possessive pronoun) -- confirm
# against the column names of basic_stats2 before changing it.
word_types = {'verb': ['vb', 'vbd', 'vbn', 'vbp', 'vbz'],
              'noun': ['nn', 'nns', 'nnp', 'nnps', 'prp', 'pp$'],                          # noun, pronoun
              'desc': ['jj', 'jjr', 'jjs', 'rb', 'rbr', 'rbs', 'wdt', 'wp', 'wp$', 'wrb'], # adj, adv, "which what how, etc"
              'det' : ['dt', 'pdt'],                                                       # determiner
              'conj': ['cc', 'in']}                                                        # conjunction-like

cnames  = set(df.columns)

# col_map: group column name -> set of per-tag column names it aggregates.
col_map = dict()
for key, lst in word_types.items():  # .items() runs on both Python 2 and 3
    cname = 'words_%s' % key
    col_map[cname] = set('words_' + c for c in lst)
    # Only tags that actually exist as columns are summed; a tag with no
    # matching column is silently ignored. sorted() keeps the selection order
    # deterministic across runs.
    df_sum[cname]  = df_sum[sorted(cnames & col_map[cname])].sum(axis=1)
    df_sum['%s/words' % cname] = df_sum[cname] / df_sum['words']

In [None]:
# Show the '<group>/words' ratio column of every part-of-speech group.
df_sum[list(group_col + '/words' for group_col in col_map.keys())]


In [None]:
# Raw counts of US-spelled and GB-spelled words, per type.
df_sum.loc[:, ['words_us', 'words_gb']]

In [None]:
# US and GB spelling counts as a share of all words, plus GB relative to US.
df_sum['words_us/words'] = df_sum['words_us'].div(df_sum['words'])
df_sum['words_gb/words'] = df_sum['words_gb'].div(df_sum['words'])
df_sum['words_gb/words_us'] = df_sum['words_gb'].div(df_sum['words_us'])

In [None]:
# Spelling ratios computed in the previous cell, per type.
df_sum.loc[:, ['words_us/words', 'words_gb/words', 'words_gb/words_us']]

# Spellings:


In [None]:
# MongoClient() with no arguments uses the driver's default host and port.
client = MongoClient()
db = client.satoshi
# The collection name contains a hyphen, so it needs item access.
tbl = db['word-lists2']

In [None]:
# dic maps '<type>_misspell', '<type>_US' and '<type>_GB' to a tuple holding
# one entry per document returned for that writing type.
dic = dict()

for wt in ['paper', 'email', 'forum']:

    query_results = tbl.find( { 'author' : 'satoshi',
                                'type'   : wt         } )

    # The cursor can only be walked once, so pull all three fields per pass.
    words = [(result['misspellings'], result['us_spellings'], result['gb_spellings']) for result in query_results]

    # zip(*[]) yields nothing, so unpacking it into three names would raise
    # ValueError when a type has no documents; fall back to empty tuples.
    dic['%s_misspell' % wt], dic['%s_US' % wt], dic['%s_GB' % wt] = zip(*words) if words else ((), (), ())


In [None]:
# total words
# Prints '<type>: <US count> <GB count>'. A pre-formatted string is used so the
# output is the same whether print is a statement (Python 2) or a function
# (Python 3).
for wt in ['paper', 'email', 'forum']:
    print('%s: %d %d' % (wt,
                         len(list(chain.from_iterable(dic['%s_US' % wt]))),
                         len(list(chain.from_iterable(dic['%s_GB' % wt])))))

In [None]:
# unique words
# Prints '<type>: <distinct US count> <distinct GB count>'. A pre-formatted
# string is used so the output is the same whether print is a statement
# (Python 2) or a function (Python 3).
for wt in ['paper', 'email', 'forum']:
    print('%s: %d %d' % (wt,
                         len(set(chain.from_iterable(dic['%s_US' % wt]))),
                         len(set(chain.from_iterable(dic['%s_GB' % wt])))))

# Word list... should have lemmatized!!
* verbs in different tenses...
* singular/plural nouns
* lemmatizing **DOES NOT** mean changing a word to another word with the same meaning, e.g. Honda -> car
* add a lemmatizer and start all over again

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Quick check of the lemmatizer on a sample phrase.
words = []
for word in 'if you red thinking of'.split():
    # Try the verb reading first; if that leaves the word unchanged,
    # fall back to the noun reading.
    v = lemmatizer.lemmatize(word, pos='v')
    v = lemmatizer.lemmatize(word, pos='n') if v == word else v
    words.append(v.encode('ascii'))
words