Provides visualisations of index document lengths, both before and after stopword removal.

Also provides visualisations of word distributions in collections and of stopword-list overlaps.

In [None]:
# import everything as needed
%matplotlib inline

from typing import List 
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import os

# Set general plot properties. Every call below mutates global plotting
# state, so the order of these statements matters.
sns.set()
sns.set_color_codes("pastel")

# Default figure size applied through seaborn's context settings.
sns.set_context({"figure.figsize": (16, 10)})
# NOTE(review): matplotlib >= 3.6 renamed the bundled seaborn style sheets to
# 'seaborn-v0_8-*'; this name may fail on newer installs - confirm the
# matplotlib version this notebook is pinned to.
plt.style.use('seaborn-white')

In [None]:
def read_doc_len_file(path: str) -> List[int]:
    """Read a file holding one integer document length per line.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of aborting the whole read.

    Args:
        path: Location of the document-length file.

    Returns:
        The document lengths in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line is not a valid integer.
    """
    with open(path) as f:
        # Strip first so that whitespace-only lines become falsy and drop out.
        return [int(value) for value in map(str.strip, f) if value]

In [None]:
# Document lengths below are read from the "prestop-" feature files, i.e.
# they were generated from pre-stopped indices.
dirs = ['aus', 'aus', 'sigir']

index_display_names = ('AUS', 'FILTERED', 'SIGIR')
index_names = ['flattened', 'filtered', 'sigir']
doc_lens = []
stopped_lens = []
stop_prefixes = ['prestop-']

# Prefixes at even positions fill stopped_lens; odd positions fill doc_lens.
for i, prefix in enumerate(stop_prefixes):
    target = stopped_lens if i % 2 == 0 else doc_lens
    for in_name in index_names:
        target.append(read_doc_len_file("../features/{0}{1}-doc_lens.txt".format(prefix, in_name)))

        
def plot_lens(lens, names):
    """Draw one document-length histogram per collection, side by side.

    Args:
        lens: Sequence of document-length collections, one per subplot.
        names: Subplot labels; ``names[i]`` is used as the x-axis label of
            the plot for ``lens[i]``.
    """
    # squeeze=False keeps the axes in a 2-D array even when there is only one
    # collection; otherwise subplots() returns a bare Axes and indexing fails.
    fig, axs = plt.subplots(1, len(lens), squeeze=False)
    fig.set_size_inches(16, 8)
    for i, (ax, doc_lengths) in enumerate(zip(axs[0], lens)):
        sns.distplot(doc_lengths, kde=False, ax=ax)
        ax.set_xticks([])
        ax.set_xlabel(names[i])

In [None]:
# Draw the document-length histograms, one subplot per entry of stopped_lens,
# labelled with index_display_names.
plot_lens(stopped_lens, index_display_names)

In [None]:
def plot_bar_whisker_lens(lens, names):
    """Draw one box-and-whisker plot of document lengths per collection.

    Args:
        lens: Sequence of document-length collections, one per subplot.
        names: Subplot labels; ``names[i]`` is used as the x-axis label of
            the plot for ``lens[i]``.

    Returns:
        The matplotlib figure holding all subplots, so the caller can save it.
    """
    # squeeze=False keeps the axes in a 2-D array even when there is only one
    # collection; otherwise subplots() returns a bare Axes and indexing fails.
    fig, axs = plt.subplots(1, len(lens), squeeze=False)
    axes = axs[0]
    fig.set_size_inches(16, 8)
    # Only the left-most subplot carries the shared y-axis label.
    axes[0].set_ylabel('Doc Len', fontsize=20)
    for i, (ax, doc_lengths) in enumerate(zip(axes, lens)):
        ax.boxplot(doc_lengths)
        ax.set_xticks([])
        ax.set_xlabel(names[i], fontsize=20)

    return fig
        
# plot_bar_whisker_lens(stopped_lens, index_display_names).savefig('doclens.png', bbox_inches = 'tight', pad_inches = 0.02)

In [None]:
# compute averages
def get_mean_len(lens):
    """Return the mean document length of each index as a LaTeX table.

    The collections are labelled with the module-level ``index_names`` list:
    ``lens[i]`` is reported under ``index_names[i]``.

    Args:
        lens: Sequence of document-length collections with at least as many
            entries as ``index_names``.

    Returns:
        The LaTeX source produced by ``to_latex()`` on the per-index means.

    Raises:
        IndexError: If ``lens`` has fewer entries than ``index_names``.
    """
    per_index = {name: lens[i] for i, name in enumerate(index_names)}
    # Building row-wise and transposing lets collections of different sizes
    # share one frame: shorter columns are padded with NaN, which mean() skips.
    df = pd.DataFrame.from_dict(per_index, orient='index').transpose()
    return df.mean().to_latex()

In [None]:
# Print the per-index mean document lengths as LaTeX source.
print(get_mean_len(stopped_lens))

In [None]:
def read_countfile(path: str) -> pd.DataFrame:
    """Read a whitespace-separated token/count file into a DataFrame.

    Each non-blank line must start with a token followed by its integer
    count; any further fields on the line are ignored. Blank lines are
    skipped instead of aborting the read.

    Args:
        path: Location of the token-count file.

    Returns:
        A DataFrame with the columns ``words`` and ``counts``, in file order.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank line has no count field.
        ValueError: If a count field is not a valid integer.
    """
    words = []
    counts = []
    with open(path) as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line, e.g. trailing newline at the end of the file.
                continue
            words.append(parts[0])
            counts.append(int(parts[1]))

    return pd.DataFrame({'words': words, 'counts': counts})
    
# One token-count table per index, in the same order as index_names.
top_tokens = [
    read_countfile("../features/{0}-top-tokens.txt".format(index_name))
    for index_name in index_names
]

In [None]:
def plot_top_tokens(tokens: List[pd.DataFrame], names, n: int):
    """Plot the first ``n`` token counts of each table, one subplot per table.

    Args:
        tokens: Token tables, each with a ``counts`` column.
        names: Subplot labels; ``names[i]`` is used as the x-axis label of
            the plot for ``tokens[i]``.
        n: Number of leading rows of each table to plot.

    Returns:
        The matplotlib figure holding all subplots, so the caller can save it.
    """
    # Plot the tables that were passed in rather than a module-level global.
    # squeeze=False keeps the axes in a 2-D array even for a single table;
    # otherwise subplots() returns a bare Axes and indexing fails.
    fig, axs = plt.subplots(1, len(tokens), squeeze=False)
    axes = axs[0]
    fig.set_size_inches(16, 8)
    # Only the left-most subplot carries the shared y-axis label.
    axes[0].set_ylabel('Frequency', fontsize=20)
    for i, (ax, frame) in enumerate(zip(axes, tokens)):
        ax.plot(frame['counts'][:n])
        ax.set_xticks([])
        ax.tick_params(labelsize=15)
        ax.set_xlabel(names[i], fontsize=20)

    return fig

In [None]:
# Plot the first 100 counts of each token table and save the figure as
# top-tokens.png with a tight bounding box.
plot_top_tokens(top_tokens, index_display_names, 100).savefig('top-tokens.png', bbox_inches = 'tight', pad_inches = 0.02)

In [None]:
# Preview the first 20 rows of the third token table (the 'sigir' entry of
# index_names).
top_tokens[2][:20]