# Exploration of text lengths
We want to look at the lengths (in words) of the posts and replies and at how those lengths are distributed, both with and without filtering out infrequent words.

In [None]:
from __future__ import unicode_literals

import os

# Move the working directory two levels up so that the relative 'data/...'
# paths used by the following cells resolve from there (presumably the
# project root -- confirm against the repository layout).
path = os.path.realpath(os.path.join(os.pardir, os.pardir))
os.chdir(path)

from src.tools.helpers import load_from_disk, save_to_disk
from src.preprocessing.preprocessing import Preprocessing
from src.preprocessing.datahandler import DataHandler
from pathlib import Path

import src.tools.helpers as helpers
import matplotlib.pyplot as plt
import numpy as np


In [None]:
%%time
# --- Preprocessing switches (read by the token-filtering cell) ---
filter_punctuation = False
filter_stop_words = False

# --- Stop-word definition files (one is applied to posts, one to replies) ---
sw_full_file = 'data/stop_words_full_ultra.txt'
sw_cut_file = 'data/stop_words_cut_ultra.txt'

# --- Training data ---
# The frame is requested with deep_copy=False, so it is presumably a view on
# the handler's own data -- avoid mutating `df` in place.
dh = DataHandler()
dh.load_train_test('data/')
df = dh.get_train_df(deep_copy=False)

In [None]:
%%time
# Create the project's preprocessing wrapper for the 'en' model and keep a
# handle on its NLP object (presumably a spaCy pipeline -- see
# Preprocessing.get_nlp to confirm). Later cells use both `pp` and `nlp`.
pp = Preprocessing(model_type='en')
nlp = pp.get_nlp()

In [None]:
%%time
# The parsed documents are only needed to build the lowered token caches, so
# they are loaded/created only when at least one of those caches is missing.
post_dump = Path('data/post_lower.pkl')
reply_dump = Path('data/reply_lower.pkl')
if not post_dump.is_file() or not reply_dump.is_file():
    # From here on the two names point at the caches of the parsed documents.
    post_dump = Path('data/posts.pkl')
    reply_dump = Path('data/replies.pkl')

    if post_dump.is_file():
        posts = load_from_disk('data/posts.pkl')
    else:
        # NOTE(review): only every second row of the 'post' column is
        # processed, while all replies are used below -- confirm this is
        # intended (e.g. each post occurring twice in the training frame).
        posts = pp.run_spacy_pipeline(df['post'][0::2])
        save_to_disk(posts, 'data/posts.pkl')

    if reply_dump.is_file():
        replies = load_from_disk('data/replies.pkl')
    else:
        replies = pp.run_spacy_pipeline(df['reply'])
        save_to_disk(replies, 'data/replies.pkl')

In [None]:
%%time
# Build (or reload) the token-level representation of posts and replies.
# NOTE(review): the cache file names do not encode `filter_stop_words` /
# `filter_punctuation`, so changing those flags has no effect while the
# .pkl files exist -- delete them to force a rebuild.
post_dump = Path('data/post_lower.pkl')
reply_dump = Path('data/reply_lower.pkl')

# Posts use the full stop-word definition file.
if post_dump.is_file():
    post_lower = load_from_disk('data/post_lower.pkl')
else:
    nlp.add_stop_word_def(sw_full_file)
    post_docs = pp.filter_spacy_tokens(
        posts,
        no_stop_words=filter_stop_words,
        no_punctuation=filter_punctuation
    )
    post_lower = pp.convert_token_docs_text(post_docs, transform_specials=True)
    save_to_disk(post_lower, 'data/post_lower.pkl')

# Replies use the cut stop-word definition file.
if reply_dump.is_file():
    reply_lower = load_from_disk('data/reply_lower.pkl')
else:
    nlp.add_stop_word_def(sw_cut_file)
    reply_docs = pp.filter_spacy_tokens(
        replies,
        no_stop_words=filter_stop_words,
        no_punctuation=filter_punctuation
    )
    reply_lower = pp.convert_token_docs_text(reply_docs, transform_specials=True)
    save_to_disk(reply_lower, 'data/reply_lower.pkl')

## This is without word filtering based on frequency:

In [None]:
# Number of entries in each document, before any frequency filtering.
post_lengths = list(map(len, post_lower))
reply_lengths = list(map(len, reply_lower))

In [None]:
# Flatten the per-document lists into one list of all word occurrences.
reply_tokens = []
for doc in reply_lower:
    reply_tokens.extend(doc)

post_tokens = []
for doc in post_lower:
    post_tokens.extend(doc)

In [None]:
# Histogram of the post lengths. Only lengths inside [0, 30] are binned;
# with 100 bins over that range each bin is 0.3 wide, so the bars for integer
# lengths are separated by empty bins. The result is bound to `_`, presumably
# to keep the notebook from echoing the returned arrays.
_ = plt.hist(post_lengths, bins=100, range=[0, 30])

In [None]:
# Summary statistics for the posts: total number of tokens, vocabulary size,
# and spread/centre of the per-post length distribution.
post_lengths_ar = np.asarray(post_lengths)
word_types = len(set(post_tokens))
print("Posts:")
for template, value in (
    ("word tokens: {:,.0f}", post_lengths_ar.sum()),
    ("word types:  {:,.0f}", word_types),
    ("std:         {:.2f}", post_lengths_ar.std()),
    ("mean:        {:.2f}", post_lengths_ar.mean()),
):
    print(template.format(value))

In [None]:
# Histogram of the reply lengths. Only lengths inside [0, 30] are binned;
# with 100 bins over that range each bin is 0.3 wide, so the bars for integer
# lengths are separated by empty bins. The result is bound to `_`, presumably
# to keep the notebook from echoing the returned arrays.
_ = plt.hist(reply_lengths, bins=100, range=[0, 30])

In [None]:
# Summary statistics for the replies: total number of tokens, vocabulary size,
# and spread/centre of the per-reply length distribution.
reply_lengths_ar = np.asarray(reply_lengths)
word_types = len(set(reply_tokens))
print("Replies:")
for template, value in (
    ("word tokens: {:,.0f}", reply_lengths_ar.sum()),
    ("word types:  {:,.0f}", word_types),
    ("std:         {:.2f}", reply_lengths_ar.std()),
    ("mean:        {:.2f}", reply_lengths_ar.mean()),
):
    print(template.format(value))

In [None]:
# Vocabulary size over replies and posts taken together.
comments = reply_lower + post_lower
combined_vocabulary = set(helpers.flatten(comments))
word_types = len(combined_vocabulary)
print("word types combined: {:,.0f}".format(word_types))

In [None]:
# Share of posts / replies that are at most 20 entries long.
# Note: the printed value is a fraction in [0, 1], not scaled to 100.
post_cut = [length for length in post_lengths if length <= 20]
post_share = len(post_cut) / len(post_lengths)
print("Percentage of posts for length <=20:", post_share)

reply_cut = [length for length in reply_lengths if length <= 20]
reply_share = len(reply_cut) / len(reply_lengths)
print("Percentage of replies for length <=20:", reply_share)

## This is with word filtering based on frequency

In [None]:
%%time
# Apply the frequency filter to posts and replies separately, each with
# min_freq=3 (presumably words seen fewer than 3 times in the respective
# corpus are dropped -- confirm in Preprocessing.filter_by_frequency).
# Only the first element of each returned pair is kept.
post_feats, _ = pp.filter_by_frequency(post_lower, min_freq=3)
reply_feats, _ = pp.filter_by_frequency(reply_lower, min_freq=3)

In [None]:
# Flatten the frequency-filtered documents into one list of word occurrences.
reply_tokens = []
for doc in reply_feats:
    reply_tokens.extend(doc)

post_tokens = []
for doc in post_feats:
    post_tokens.extend(doc)

In [None]:
# Number of entries in each document after frequency filtering.
post_lengths = list(map(len, post_feats))
reply_lengths = list(map(len, reply_feats))

In [None]:
# Histogram of the post lengths after frequency filtering. Only lengths
# inside [0, 30] are binned; with 100 bins over that range each bin is 0.3
# wide, so the bars for integer lengths are separated by empty bins.
_ = plt.hist(post_lengths, bins=100, range=[0, 30])

In [None]:
# Summary statistics for the frequency-filtered posts: total number of tokens,
# vocabulary size, and spread/centre of the per-post length distribution.
post_lengths_ar = np.asarray(post_lengths)
word_types = len(set(post_tokens))
print("Posts:")
for template, value in (
    ("word tokens:     {:,.0f}", post_lengths_ar.sum()),
    ("word types: {:,.0f}", word_types),
    ("std:            {:.2f}", post_lengths_ar.std()),
    ("mean:           {:.2f}", post_lengths_ar.mean()),
):
    print(template.format(value))

In [None]:
# Histogram of the reply lengths after frequency filtering. Only lengths
# inside [0, 30] are binned; with 100 bins over that range each bin is 0.3
# wide, so the bars for integer lengths are separated by empty bins.
_ = plt.hist(reply_lengths, bins=100, range=[0, 30])

In [None]:
# Summary statistics for the frequency-filtered replies. The token total is
# taken from the flattened token list rather than from the length array.
reply_lengths_ar = np.asarray(reply_lengths)
word_types = len(set(reply_tokens))
print("Replies:")
for template, value in (
    ("word tokens: {:,.0f}", len(reply_tokens)),
    ("word types:  {:,.0f}", word_types),
    ("std:         {:.2f}", reply_lengths_ar.std()),
    ("mean:        {:.2f}", reply_lengths_ar.mean()),
):
    print(template.format(value))

In [None]:
# Vocabulary size when the frequency filter (min_freq=3) is applied to
# replies and posts as a single combined collection.
all_comments = reply_lower + post_lower
filtered_comments, _ = pp.filter_by_frequency(all_comments, min_freq=3)
filtered_vocabulary = set(helpers.flatten(filtered_comments))
word_types = len(filtered_vocabulary)
print("word types combined: {:,.0f}".format(word_types))

In [None]:
# Share of frequency-filtered posts / replies whose length does not exceed the
# cutoff. The cutoff is defined once and also used to build the printed label,
# so label and filter can no longer disagree (before, the filter used 30 while
# the label claimed "<=20").
# NOTE(review): the unfiltered section of this notebook uses a cutoff of 20;
# if 20 was intended here too, change `length_cutoff` -- the label follows.
# The printed value is a fraction in [0, 1], not scaled to 100.
length_cutoff = 30
post_cut = [num for num in post_lengths if num <= length_cutoff]
reply_cut = [num for num in reply_lengths if num <= length_cutoff]
print("Percentage of posts for length <={}:".format(length_cutoff),
      (len(post_cut) / len(post_lengths)))
print("Percentage of replies for length <={}:".format(length_cutoff),
      (len(reply_cut) / len(reply_lengths)))