In [None]:
import fix_notebook_imports

from src import util

import collections

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_context("poster")


In [None]:
from src import util

In [None]:
# Number of entries in util.PARAGRAPHS (shown as the cell output).
len(util.PARAGRAPHS)


In [None]:
# Add up len() of every paragraph, then report the mean rounded to two decimals.
# NOTE(review): the message says "words", which presumes each paragraph is a
# sequence of words -- confirm against src.util.
sum_ = sum(map(len, util.PARAGRAPHS))

print('Average number of words per paragraph: {}'.format(
    round(sum_ / len(util.PARAGRAPHS), 2)
))

In [None]:
# Tally how often each entry of util.WORDS occurs.
counts = collections.Counter(util.WORDS)

# (word, count) pairs ordered by count, highest first; words with equal counts
# are ordered by the word itself, also descending.
ordered_counts = sorted(
    counts.items(),
    key=lambda pair: (pair[1], pair[0]),
    reverse=True,
)

print('Number of distinct words: {}'.format(len(ordered_counts)))
ordered_counts


In [None]:
def getEndings(series, counts=None):
    """Return the endings of every counted word that starts with ``series``.

    Args:
        series: Prefix string to look for.
        counts: Optional iterable of ``(word, count)`` pairs to search. When
            omitted, the notebook-level ``ordered_counts`` list is used, which
            keeps existing one-argument calls working unchanged.

    Returns:
        A list of ``(ending, count)`` tuples, in the order of ``counts``, where
        ``ending`` is the word with the leading ``series`` removed. An empty
        ``series`` matches every word and returns it whole.
    """
    if counts is None:
        # Resolved at call time so the function follows re-runs of the cell
        # that builds ordered_counts.
        counts = ordered_counts
    prefix_length = len(series)
    return [(key[prefix_length:], val) for key, val in counts if key.startswith(series)]

In [None]:
def getBeginnings(series, counts=None):
    """Return the beginnings of every counted word that ends with ``series``.

    Args:
        series: Suffix string to look for.
        counts: Optional iterable of ``(word, count)`` pairs to search. When
            omitted, the notebook-level ``ordered_counts`` list is used, which
            keeps existing one-argument calls working unchanged.

    Returns:
        A list of ``(beginning, count)`` tuples, in the order of ``counts``,
        where ``beginning`` is the word with the trailing ``series`` removed.
        An empty ``series`` matches every word and returns it whole, mirroring
        ``getEndings('')``.
    """
    if counts is None:
        # Resolved at call time so the function follows re-runs of the cell
        # that builds ordered_counts.
        counts = ordered_counts
    suffix_length = len(series)
    # Slice with an explicit end position: key[:-0] would be '' and key[-0:]
    # the whole key, which broke the empty-suffix case.
    return [
        (key[:len(key) - suffix_length], val)
        for key, val in counts
        if key.endswith(series)
    ]

In [None]:
def get_ratios(series):
    """Express each element of ``series`` as a fraction of the series total.

    Args:
        series: Object providing ``.sum()`` and iteration over its values
            (e.g. a pandas Series).

    Returns:
        A plain list with one ``value / total`` entry per element, in order.
    """
    total = series.sum()
    ratios = []
    for value in series:
        ratios.append(value / total)
    return ratios

In [None]:
def get_means(series):
    """Return the mean of ``series`` by delegating to its ``mean()`` method."""
    average = series.mean()
    return average

In [None]:
def test_letter_class(letter, class_endings):
    """Tabulate how the words of one suffix class split into base + ending.

    Every counted word that is also in ``util.VOCAB`` is assigned to the
    longest ending in ``class_endings`` that it ends with; the remaining
    prefix is its base. A summary line with the share of corpus tokens that
    belong to the class is printed.

    Args:
        letter: Label of the class; only used in the printed summary line.
        class_endings: Iterable of suffix strings that make up the class.

    Returns:
        A 4-tuple ``(class_vocab, base_to_ending_counts_dict,
        df_base_ending_counts, df_base_ending_ratios)``:

        * ``class_vocab``: list of the words assigned to the class.
        * ``base_to_ending_counts_dict``: ``defaultdict(list)`` mapping each
          base to its ``(ending, count)`` pairs.
        * ``df_base_ending_counts``: DataFrame indexed by ``'Suffix'`` (the
          endings, longest first) with one column per base, columns ordered
          by their total count, highest first.
        * ``df_base_ending_ratios``: same layout, produced by applying
          ``get_ratios`` to every column of the counts frame.
    """
    # Longest endings first, so a word is claimed by its longest matching
    # ending before any shorter ending of the same class can match it.
    class_endings = sorted(class_endings, key=len, reverse=True)

    # Count every token once instead of rescanning util.WORDS for each word.
    # NOTE(review): assumes util.WORDS is an iterable of hashable tokens and
    # util.VOCAB supports membership tests -- confirm in src.util.
    word_counts = collections.Counter(util.WORDS)

    class_vocab = []
    claimed_words = set()  # constant-time membership twin of class_vocab
    base_to_ending_counts_dict = collections.defaultdict(list)

    for ending in class_endings:
        for base, _ in getBeginnings(ending):
            word = base + ending
            if word not in claimed_words and word in util.VOCAB:
                base_to_ending_counts_dict[base].append((ending, word_counts[word]))
                class_vocab.append(word)
                claimed_words.add(word)

    # class_vocab holds distinct words, so summing their counts equals the
    # number of tokens in util.WORDS that belong to the class.
    words_count = sum(word_counts[word] for word in class_vocab)
    print('Proportion of {}-class words to total words: {}'.format(letter, round(words_count/len(util.WORDS), 3)))

    # One column per base, one row per ending (0 where the pair never occurs);
    # built in a single constructor call rather than column by column.
    columns_by_base = {}
    for base, ending_counts in base_to_ending_counts_dict.items():
        ending_to_value = dict(ending_counts)
        columns_by_base[base] = [ending_to_value.get(ending, 0) for ending in class_endings]

    df_base_ending_counts = pd.DataFrame(
        columns_by_base,
        index=pd.Index(class_endings, name='Suffix'),
    )

    # Temporarily append a "sum" row so the columns can be ordered by total.
    df_base_ending_counts.loc["sum"] = df_base_ending_counts.sum(axis=0, numeric_only=True)
    df_base_ending_counts = df_base_ending_counts.sort_values(axis=1, by="sum", ascending=False).drop("sum")

    # get_ratios returns plain lists, so the suffix index has to be re-attached.
    df_base_ending_ratios = df_base_ending_counts.apply(get_ratios)
    df_base_ending_ratios['Suffix'] = list(class_endings)
    df_base_ending_ratios = df_base_ending_ratios.set_index('Suffix')

    return (class_vocab, base_to_ending_counts_dict, df_base_ending_counts, df_base_ending_ratios)

In [None]:
# Suffix lists that define the three word classes passed to test_letter_class.
a_endings = [
    'am', 'ar', 'al', 'an',
    'ain', 'aiin', 'aiiin',
]
o_endings = [
    'ol', 'or', 'o',
]
y_endings = [
    'y', 'dy',
    'ey', 'edy',
    'eey', 'eedy',
    'eeey', 'eeedy',
]


In [None]:
# Run the base/ending tabulation once per class; each call also prints the
# share of corpus tokens that fall into that class.
(
    vocab_a,
    base_to_ending_counts_dict_a,
    df_base_ending_counts_a,
    df_base_ending_ratios_a,
) = test_letter_class('a', a_endings)

(
    vocab_o,
    base_to_ending_counts_dict_o,
    df_base_ending_counts_o,
    df_base_ending_ratios_o,
) = test_letter_class('o', o_endings)

(
    vocab_y,
    base_to_ending_counts_dict_y,
    df_base_ending_counts_y,
    df_base_ending_ratios_y,
) = test_letter_class('y', y_endings)


In [None]:
def plot_ratios(df_base_ending_ratios, NUM_POINTS):
    """Draw a violin plot from the first ``NUM_POINTS`` columns of a ratio frame.

    The selected columns are transposed before plotting, so the frame's row
    labels become the plotted variables.

    Args:
        df_base_ending_ratios: DataFrame of ratios; only its first
            ``NUM_POINTS`` columns are used.
        NUM_POINTS: Number of leading columns to include.

    Side effects:
        Sets the global matplotlib ``figure.figsize`` to ``(8, 6)``.
    """
    plt.rcParams['figure.figsize'] = (8, 6)
    leading_columns = df_base_ending_ratios.iloc[:, :NUM_POINTS]
    violin_axes = sns.violinplot(
        data=leading_columns.transpose(),
        orient='v',
        scale='count',
    )

In [None]:
def plot_endings(df_base_ending_counts, class_endings, NUM_POINTS):
    """Draw a suffix-correlation heatmap and a scatter matrix of suffix counts.

    Both plots use the first ``NUM_POINTS`` columns of the counts frame,
    transposed so that the frame's row labels (the suffixes) become the
    variables being compared.

    Args:
        df_base_ending_counts: DataFrame of counts with suffixes as the row
            index; only its first ``NUM_POINTS`` columns are used.
        class_endings: Suffixes of the class. Retained for backward
            compatibility only: tick labels are read from the data itself so
            they always follow the row order of the frame.
        NUM_POINTS: Number of leading columns to include.

    Side effects:
        Sets the global matplotlib ``figure.figsize`` to ``(16, 12)`` for the
        heatmap and then to ``(10, 8)`` for the scatter matrix.
    """
    # Compute the transposed slice once; both plots share it.
    counts_by_suffix = df_base_ending_counts.iloc[:, :NUM_POINTS].transpose()
    suffix_corr = counts_by_suffix.corr()

    # Label from the correlation matrix rather than from `class_endings`:
    # test_letter_class re-sorts the endings by length, so the caller's list
    # order generally differs from the frame's row order and would attach the
    # wrong suffix to each row/column.
    tick_labels = ['-' + str(suffix) for suffix in suffix_corr.columns]

    plt.rcParams['figure.figsize']=(16,12)
    sns.heatmap(suffix_corr, vmin=0.0, vmax=1.0, annot=True, xticklabels=tick_labels, yticklabels=tick_labels)

    plt.rcParams['figure.figsize']=(10,8)
    pd.plotting.scatter_matrix(counts_by_suffix, diagonal='kde')

In [None]:
# Violin plot of the a-class ratios, limited to the first 40 base columns
# (columns are ordered by total count, highest first).
plot_ratios(df_base_ending_ratios_a, NUM_POINTS=40)


In [None]:
# Correlation heatmap and scatter matrix of the a-class counts, limited to the
# first 40 base columns.
plot_endings(df_base_ending_counts_a, a_endings, NUM_POINTS=40)


In [None]:
# Violin plot of the o-class ratios, limited to the first 40 base columns
# (columns are ordered by total count, highest first).
plot_ratios(df_base_ending_ratios_o, NUM_POINTS=40)


In [None]:
# Correlation heatmap and scatter matrix of the o-class counts, limited to the
# first 40 base columns.
plot_endings(df_base_ending_counts_o, o_endings, NUM_POINTS=40)


In [None]:
# Violin plot of the y-class ratios, limited to the first 40 base columns
# (columns are ordered by total count, highest first).
plot_ratios(df_base_ending_ratios_y, NUM_POINTS=40)


In [None]:
# Correlation heatmap and scatter matrix of the y-class counts, limited to the
# first 40 base columns.
plot_endings(df_base_ending_counts_y, y_endings, NUM_POINTS=40)


In [None]:
# Lift pandas' row/column display limits for this block only, so the complete
# a-class count table is rendered.
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    display(df_base_ending_counts_a)

In [None]:
# Lift pandas' row/column display limits for this block only, so the complete
# o-class count table is rendered.
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    display(df_base_ending_counts_o)

In [None]:
# Lift pandas' row/column display limits for this block only, so the complete
# y-class count table is rendered.
with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    display(df_base_ending_counts_y)

In [None]:
# Render the complete a-class ratio table with the display limits lifted.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # further option name/value pairs may be appended
    display(df_base_ending_ratios_a)

In [None]:
# Render the complete o-class ratio table with the display limits lifted.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # further option name/value pairs may be appended
    display(df_base_ending_ratios_o)

In [None]:
# Render the complete y-class ratio table with the display limits lifted.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # further option name/value pairs may be appended
    display(df_base_ending_ratios_y)