# Survey Creation
This notebook creates a survey whose replies have a length distribution (measured in words) similar to that of the replies in the whole test set. The data for the survey is taken from the test data set.

In [None]:
from __future__ import unicode_literals

import os

path = os.path.realpath(os.path.join('..', '..'))
os.chdir(path)

from src.preprocessing.preprocessing import Preprocessing
from src.preprocessing.datahandler import DataHandler
from pathlib import Path

import src.tools.helpers as helpers
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


In [None]:
%%time
# --- Survey configuration -------------------------------------------------
random_seed = 1337659
sample_number = 40  # The number of post-reply pairs for the survey

# Output files: the survey sheet and the matching solution sheet.
survey_file = "data/survey.csv"
survey_solution = "data/survey_solution.csv"

# Stop-word definition files handed to `nlp.add_stop_word_def` further down.
sw_cut_file = 'data/stop_words_cut_ultra.txt'
sw_full_file = 'data/stop_words_full_ultra.txt'

# Filter switches. NOTE(review): the helper functions in this notebook pass
# literal `False` values instead of reading these two flags.
filter_stop_words = False
filter_punctuation = False

# Seed NumPy's global RNG before any data is loaded or shuffled.
np.random.seed(random_seed)

# --- Test data ------------------------------------------------------------
dh = DataHandler()
dh.load_train_test('data/')

# We don't want to have a post twice in the survey data: shuffle the
# post pairs, then keep only every second row.
test = dh.shuffle_post_pairs(dh.get_test_df(deep_copy=False))[0::2]

In [None]:
%%time
# Project preprocessing wrapper, created with model_type='en'.
# `pp` and `nlp` are module-level names: the helper functions defined in this
# notebook read them as globals instead of taking them as parameters.
pp = Preprocessing(model_type='en')
nlp = pp.get_nlp()

In [None]:
def apply_spacy_pipeline(post_path, reply_path, df):
    """Run the spaCy pipeline on posts and replies, caching both results.

    For each of the two cache files: if the file already exists its content
    is loaded and returned as-is; otherwise the pipeline is run on the
    matching column of ``df`` and the result is written to that path.

    Args:
        post_path: Cache file path for the processed posts.
        reply_path: Cache file path for the processed replies.
        df: Object with a ``'post'`` and a ``'reply'`` column.

    Returns:
        Tuple ``(posts, replies)`` of pipeline results.

    Note:
        Uses the module-level ``pp`` and ``helpers``. An existing cache file
        is trusted without checking that it was built from the same ``df``.
    """
    post_cache, reply_cache = Path(post_path), Path(reply_path)

    if post_cache.is_file():
        posts = helpers.load_from_disk(post_path)
    else:
        # NOTE(review): only every second post is processed here, whereas all
        # replies are processed below -- confirm this asymmetry is intended.
        posts = pp.run_spacy_pipeline(df['post'][0::2])
        helpers.save_to_disk(posts, post_path)

    if reply_cache.is_file():
        replies = helpers.load_from_disk(reply_path)
    else:
        replies = pp.run_spacy_pipeline(df['reply'])
        helpers.save_to_disk(replies, reply_path)

    return posts, replies


def apply_token_to_x(post_path, reply_path, posts, replies, type_):
    """Convert processed posts and replies to token text, caching both results.

    For each of the two cache files: if the file already exists its content
    is loaded; otherwise the stop-word definition is registered on ``nlp``,
    the documents are passed through ``pp.filter_spacy_tokens`` and
    ``pp.convert_token_docs_text``, and the result is written to that path.

    Args:
        post_path: Cache file path for the converted posts.
        reply_path: Cache file path for the converted replies.
        posts: Pipeline output for the posts.
        replies: Pipeline output for the replies.
        type_: Forwarded as ``token_kind`` to ``pp.convert_token_docs_text``.

    Returns:
        Tuple ``(post_pcd, reply_pcd)`` of converted documents.

    Note:
        Uses the module-level ``nlp``, ``pp``, ``helpers``, ``sw_full_file``
        and ``sw_cut_file``. Stop-word and punctuation filtering are both
        switched off via literal ``False`` arguments.
    """
    post_cache, reply_cache = Path(post_path), Path(reply_path)

    if post_cache.is_file():
        post_pcd = helpers.load_from_disk(post_path)
    else:
        # Posts use the full stop-word definition file.
        nlp.add_stop_word_def(sw_full_file)
        filtered_posts = pp.filter_spacy_tokens(posts, no_stop_words=False, no_punctuation=False)
        post_pcd = pp.convert_token_docs_text(filtered_posts, token_kind=type_, transform_specials=True)
        helpers.save_to_disk(post_pcd, post_path)

    if reply_cache.is_file():
        reply_pcd = helpers.load_from_disk(reply_path)
    else:
        # Replies use the cut stop-word definition file.
        nlp.add_stop_word_def(sw_cut_file)
        filtered_replies = pp.filter_spacy_tokens(replies, no_stop_words=False, no_punctuation=False)
        reply_pcd = pp.convert_token_docs_text(filtered_replies, token_kind=type_, transform_specials=True)
        helpers.save_to_disk(reply_pcd, reply_path)

    return post_pcd, reply_pcd


def create_length_probability_series(plot):
    """Turn histogram counts into a series of relative frequencies.

    Args:
        plot: Indexable whose first element is an array of per-bin counts,
            e.g. the tuple returned by ``plt.hist``.

    Returns:
        ``pd.Series`` of ``count / total`` per bin. The index is the 1-based
        bin position as floats (1.0, 2.0, ...), so bin ``i`` is labelled
        ``i + 1``.
    """
    counts = plot[0]
    total = counts.sum()
    # 1-based float labels, one per bin.
    positions = np.arange(1, len(counts) + 1, dtype=float)
    shares = np.asarray(counts / total, dtype=float)
    return pd.Series(data=shares, index=positions)

In [None]:
%%time
# Run the spaCy pipeline on the survey candidates, or load the cached result.
# The two pickle files are reused whenever they exist, so they have to be
# removed by hand if `test` changes (e.g. after changing the random seed).
posts_test, reply_test = apply_spacy_pipeline('data/posts_test_survey.pkl', 'data/replies_test_survey.pkl', test)

In [None]:
%%time
# Convert the processed documents to their 'lower_' token text; results are
# cached in the two pickle files named here.
post_lower, reply_lower = apply_token_to_x(
    'data/post_lower_test_survey.pkl',
    'data/reply_lower_test_survey.pkl',
    posts_test,
    reply_test,
    type_='lower_',
)

In [None]:
# Length of every converted document (presumably its number of tokens).
post_lengths = list(map(len, post_lower))
reply_lengths = list(map(len, reply_lower))
# Show one converted reply as the cell output for a quick visual check.
reply_lower[9]

In [None]:
print("Reply sentence lengths distribution")
# 30 bins over the range [1, 30]. The return value is kept in `plot` because
# its first element (the per-bin counts) is later converted into sampling
# weights by `create_length_probability_series`.
plot = plt.hist(reply_lengths, bins=30, range=[1, 30])

In [None]:
# Switch from plain lists to NumPy arrays; `reply_lengths` is also used as
# the DataFrame index further down.
post_lengths, reply_lengths = np.asarray(post_lengths), np.asarray(reply_lengths)

# Summary statistics of the reply lengths.
print("Standard deviation of reply sentence lengths: {:.1f}".format(np.std(reply_lengths)))
print("Mean of reply sentence lengths:               {:.1f}".format(np.mean(reply_lengths)))

In [None]:
# Index the candidate rows by their reply length so that the length-based
# weights can be matched to rows through the index.
# NOTE: `base_data` is the same object as `test` (no copy is made), so the
# index of `test` is replaced as well.
# NOTE(review): this relies on `reply_lengths` having one entry per row of
# `test`, in the same order -- a stale cache file would break that silently.
base_data = test
base_data.index = reply_lengths
# Relative frequency of each histogram bin, labelled 1.0, 2.0, ...
sample_weights = create_length_probability_series(plot)
# Cell output as a sanity check: the weights should add up to 1.
sample_weights.values.sum()

In [None]:
# Draw the survey pairs. `sample_weights` is a Series, so pandas aligns it
# with the rows through the index (the reply lengths); rows whose length has
# no entry in the weights get a weight of zero and cannot be drawn.
# NOTE(review): every row is weighted by the frequency of its length, and
# frequent lengths also have more rows, so frequent lengths appear to be
# over-represented compared with the test set -- confirm this is intended.
sample = base_data.sample(sample_number, random_state=random_seed, weights=sample_weights)
# Histogram of the sampled reply lengths (the index holds the lengths).
_ = plt.hist(sample.index, bins=40, range=[1, 30])
# Cell output: sum of the "sarcasm" column over the sampled rows
# (presumably the number of sarcastic replies in the survey).
sample["sarcasm"].sum()

In [None]:
# Survey sheet: only the post and reply text, written without the index.
survey_data = sample[["post", "reply"]]
survey_data.to_csv(survey_file, index=False)
# Solution sheet: all columns of the sampled rows. The index is written too
# (default `index=True`), so the first CSV column holds the reply lengths.
sample.to_csv(survey_solution)