In [14]:
%matplotlib inline

import library
import graphs

import analysis.basics as basics

import numpy as np
import os

In [None]:
# Access the document directories.
# The data root used to be a hard-coded absolute path repeated for every
# source. It can now be overridden with the MAPMYWRITING_DATA_DIR environment
# variable; when that variable is unset the original location is used, so
# existing runs behave exactly as before.
data_root = os.environ.get('MAPMYWRITING_DATA_DIR',
                           '/Users/hunterheidenreich/git/MapMyWriting/data')

# One sub-directory per writing source, listed in the order they are scanned.
source_names = ['facebook_chat',
                'facebook_comments',
                'facebook_posts',
                'journals',
                'school',
                'songs']
dirs = [os.path.join(data_root, source_name) for source_name in source_names]

# Concatenate the file lists returned for each directory, in directory order.
input_files = []
for text_dir in dirs:
    input_files += library.get_file_list(text_dir)

In [None]:
# Build the collection of texts, keyed by file basename.
# Files from different directories that share a basename are appended to the
# same FileObject rather than replacing each other.
# NOTE(review): files are opened with the platform default encoding - confirm
# that this matches how the data was written.
text_collection = {}

for path in input_files:
    name = os.path.basename(path)

    entry = text_collection.get(name)
    if entry is None:
        # First time this basename is seen: register a fresh container.
        entry = library.FileObject()
        text_collection[name] = entry

    with open(path, 'r') as handle:
        entry._lines += handle.readlines()

In [None]:
# Number of distinct documents (unique basenames) that were loaded.
len(text_collection)

In [None]:
# Do pre-processing for the texts.
# preprocess_text yields four results per document; they are stored on the
# document object as raw / stop / stem / lemma, in that order.
for document in text_collection.values():
    processed = library.preprocess_text(document.lines)
    document.raw, document.stop, document.stem, document.lemma = processed

In [None]:
# One lemma list per document, in the collection's iteration order.
lemmas = [document.lemma for document in text_collection.values()]

In [None]:
# Parallel lists: names[i] is the basename whose raw tokens are raws[i].
names = list(text_collection.keys())
raws = [text_collection[name].raw for name in names]

In [None]:
# Looking at word usage graphically
counter = basics.VocabUtils.aggregate_words_counts(lemmas)
counts_in_order = list(counter.values())
counts_ascending = sorted(counts_in_order)

# Draw at most 100 bars per chart (fewer if the vocabulary is smaller).
n_bars = min(100, len(counts_in_order))
bar_positions = range(n_bars)

# First chart: the first n_bars counts in the counter's own order.
graphs.plot_bar_graph(
    bar_positions,
    counts_in_order[:n_bars],
    x_label='Words',
    y_label='Counts',
    title='Word Frequencies (Raw)',
    export=True,
    export_name='visualizations/word_freq_bar_raw.png',
)
# Second chart: the n_bars largest counts, shown in ascending order.
graphs.plot_bar_graph(
    bar_positions,
    counts_ascending[-n_bars:],
    x_label='Words',
    y_label='Counts',
    title='Word Frequencies (Sorted)',
    export=True,
    export_name='visualizations/word_freq_bar_sorted.png',
)

In [None]:
# Looking at some basic statistics for vocab usage
counter = basics.VocabUtils.aggregate_words_counts(lemmas)
vocab_size = len(counter)

# Each ranking query is evaluated once and reused; previously every query was
# issued twice (once for the word, once for its count), including the full
# vocabulary ranking.
# NOTE(review): assumes global_top_k_words returns the same ranking when called
# repeatedly with the same arguments - confirm.
top_entry = basics.VocabUtils.global_top_k_words(lemmas, k=1)[0]
rarest_entry = basics.VocabUtils.global_top_k_words(lemmas, k=vocab_size)[-1]

# Totals over the raw (unprocessed) token lists.
total_words = sum(len(word_list) for word_list in raws)
total_characters = sum(len(w) for word_list in raws for w in word_list)

# One single-cell row per statistic, in the same order as row_labels below.
data = [[len(input_files)],
        [top_entry[0] + ' (' + str(top_entry[1]) + ')'],
        [rarest_entry[0] + ' (' + str(rarest_entry[1]) + ')'],
        [vocab_size],
        [np.mean(list(counter.values()))],
        [total_words],
        [total_characters]]
row_labels = ['Collection size: ', 'Top word: ', 'Least common: ', 'Vocab size: ', 'Average word usage count: ',
              'Total words: ', 'Total characters: ']

graphs.plot_table(cell_data=data, row_labels=row_labels,
                  export=True, export_name='visualizations/basic_stats_table.png')

In [None]:
# Let's look at word count over time
# names[i] and raws[i] describe the same document.
# NOTE(review): assumes get_datetimes returns one entry per name, in the same
# order as `names` (the original pairing relied on this too) - confirm.
dates = list(library.get_datetimes(names))

# Compute ONE chronological ordering and apply it to every series. Previously
# only the total word counts were re-ordered together with the dates, while the
# unique word counts stayed in collection order and were then plotted against
# the sorted dates, pairing each count with the wrong date.
chronological = sorted(range(len(dates)), key=lambda idx: dates[idx])

labels = tuple(dates[idx] for idx in chronological)
raw_wc = tuple(len(raws[idx]) for idx in chronological)

graphs.plot(labels, raw_wc, 
            x_label='Date', y_label='Word Count', title='Word Count Over Time',
            export=True, export_name='visualizations/word_count_by_time.png')

# Unique-vocabulary size per document, in the same chronological order.
raw_wc_u = [len(list(basics.VocabUtils.unique_vocab(raws[idx]).items()))
            for idx in chronological]
graphs.plot(labels, raw_wc_u, 
            x_label='Date', y_label='Word Count', title='Unique Word Count Over Time',
            export=True, export_name='visualizations/unique_word_count_by_time.png')

In [1]:
from sentiment import SentimentAnalyzer

In [2]:
# Instantiate the analyzer imported from the local `sentiment` module.
s = SentimentAnalyzer()

In [5]:
# Embed a short sample sentence; the result is indexed as a[0], a[1], a[2]
# when it is reduced below.
a = s.embed_sentence('This is a test')

In [20]:
# Average each of the first three entries of `a` along axis 0, then combine
# the three resulting vectors with an unweighted element-wise mean.
# NOTE(review): presumably a[0..2] are embedding layers with one row per
# token - confirm against SentimentAnalyzer.embed_sentence.
a0, a1, a2 = (np.mean(a[layer], axis=0) for layer in range(3))
red = (a0 + a1 + a2) / 3

In [23]:
# Length of the reduced embedding along its first dimension.
len(red)

1024