In [None]:
from __future__ import unicode_literals

import os
from collections import Counter

import numpy as np
import torch

# Switch the working directory to two levels above the current one, presumably so
# that the `src` package imports and the relative 'data/...' paths below resolve.
# NOTE: not idempotent -- the path is relative to the current working directory,
# so re-running this cell climbs two more levels.
path = os.path.realpath(os.path.join('..', '..'))
os.chdir(path)

from src.preprocessing.preprocessing import Preprocessing
from src.data_science.networkhelper import NetworkHelper
from src.preprocessing.datahandler import DataHandler

from pathlib import Path
import src.tools.helpers as helpers
import matplotlib.pyplot as plt
import pandas as pd

### NOTE: after changing `test_file`, delete every cached file whose name contains `_emb_` or `counter`.

In [None]:
# Base name of the evaluation file; "test" selects the data handler's own test split,
# any other value is read from data/<test_file>.csv.
test_file = "survey_test"  # No file extension here -- ".csv" is appended by load_train_test
# Added to the expected sample count in the size sanity check (only evaluated
# when test_file == "test").
test_file_data_num = 40

In [None]:
def load_train_test(data_dir, dh, file_to_test):
    """Load the training frame and the requested evaluation frame.

    When ``train.csv`` does not exist in ``data_dir`` the raw comment and
    annotation files are loaded through ``dh``, split and written back as CSV;
    otherwise the existing split is loaded through ``dh``.

    Args:
        data_dir: ``pathlib.Path`` of the data directory.
        dh: data handler providing ``load_data``, ``split_in_train_test``,
            ``save_train_test_to_csv``, ``load_train_test``, ``get_train_df``
            and ``get_test_df``.
        file_to_test: base name (without extension) of the evaluation file.
            ``"test"`` selects the test split held by ``dh``; any other value
            is read from ``<data_dir>/<file_to_test>.csv`` (tab separated).

    Returns:
        Tuple ``(test, train)`` -- note that the test frame comes first.
    """
    train_file = data_dir / "train.csv"
    test_file = data_dir / (file_to_test + ".csv")
    if not train_file.is_file():
        # No split on disk yet: build it from the raw files and persist it.
        comments = str(data_dir / "comments_cleaned.txt")
        annotation = str(data_dir / "annotation.txt")
        dh.load_data(comments, annotation)
        dh.split_in_train_test()
        dh.save_train_test_to_csv(str(data_dir))
    else:
        dh.load_train_test(str(data_dir))
    train = dh.get_train_df(deep_copy=False)
    if file_to_test == "test":
        test = dh.get_test_df(deep_copy=False)
    else:
        # Builtin ``str`` replaces the ``np.str`` alias, which was deprecated in
        # NumPy 1.20 and removed in 1.24 (it was always just an alias of ``str``).
        dtype = {'post_id': str, 'post': str, 'reply': str, 'sarcasm': np.int8}
        # Only the empty string counts as missing; literal "NA"/"null" text is kept.
        test = pd.read_csv(test_file, sep='\t', keep_default_na=False,
                           na_values="", dtype=dtype)
    return test, train

In [None]:
%%time
# Data directory, relative to the working directory set in the import cell.
data_dir = Path('data')
# Stop-word lists: apply_token_to_x registers the "cut" list before filtering
# replies and the "full" list before filtering posts.
sw_cut_file = str(data_dir / 'stop_words_cut_ultra.txt')
sw_full_file = str(data_dir / 'stop_words_full_ultra.txt')
nh = NetworkHelper()
dh = DataHandler()
# load_train_test returns (test, train) in that order.
test, train = load_train_test(data_dir, dh, test_file)

In [None]:
# Passed as max_len when converting token lists to embedding indices (for posts
# and replies alike) and used as the column count of the padded index tensors.
max_post_len = 50

In [None]:
%%time
# Project preprocessing object and the nlp object it exposes; `nlp` is used
# later to register stop-word definitions.
pp = Preprocessing()
nlp = pp.get_nlp()

In [None]:
def apply_spacy_pipeline(post_path, reply_path, df):
    """Return the spaCy-processed posts and replies, using dumps on disk as a cache.

    Posts and replies are cached independently: each one is loaded from its
    dump file when that file exists, otherwise it is computed with
    ``pp.run_spacy_pipeline`` and written to the dump path.

    Args:
        post_path: path of the post dump file.
        reply_path: path of the reply dump file.
        df: mapping/frame with ``'post'`` and ``'reply'`` columns. Only every
            second entry of ``'post'`` (slice ``0::2``) is processed --
            presumably because each post is listed once per reply; TODO confirm.

    Returns:
        Tuple ``(posts, replies)``.
    """
    post_cache = Path(post_path)
    reply_cache = Path(reply_path)

    if post_cache.is_file():
        posts = helpers.load_from_disk(post_path)
    else:
        every_second_post = df['post'][0::2]
        posts = pp.run_spacy_pipeline(every_second_post)
        helpers.save_to_disk(posts, post_path)

    if reply_cache.is_file():
        replies = helpers.load_from_disk(reply_path)
    else:
        replies = pp.run_spacy_pipeline(df['reply'])
        helpers.save_to_disk(replies, reply_path)

    return posts, replies


def apply_token_to_x(post_path, reply_path, posts, replies, type_):
    """Convert processed posts/replies to token text, using dumps on disk as a cache.

    Posts and replies are handled independently. For whichever dump file is
    missing, a stop-word definition is registered on ``nlp`` (the "full" list
    for posts, the "cut" list for replies), the documents are passed through
    ``pp.filter_spacy_tokens`` (stop words and punctuation are kept) and
    ``pp.convert_token_docs_text``, and the result is written to the dump path.

    Args:
        post_path: path of the converted-post dump file.
        reply_path: path of the converted-reply dump file.
        posts: processed posts (only read when the post dump is missing).
        replies: processed replies (only read when the reply dump is missing).
        type_: forwarded as ``token_kind`` to ``pp.convert_token_docs_text``.

    Returns:
        Tuple ``(post_pcd, reply_pcd)``.
    """
    post_cache = Path(post_path)
    reply_cache = Path(reply_path)

    if post_cache.is_file():
        post_pcd = helpers.load_from_disk(post_path)
    else:
        nlp.add_stop_word_def(sw_full_file)
        kept_posts = pp.filter_spacy_tokens(
            posts, no_stop_words=False, no_punctuation=False)
        post_pcd = pp.convert_token_docs_text(
            kept_posts, token_kind=type_, transform_specials=True)
        helpers.save_to_disk(post_pcd, post_path)

    if reply_cache.is_file():
        reply_pcd = helpers.load_from_disk(reply_path)
    else:
        nlp.add_stop_word_def(sw_cut_file)
        kept_replies = pp.filter_spacy_tokens(
            replies, no_stop_words=False, no_punctuation=False)
        reply_pcd = pp.convert_token_docs_text(
            kept_replies, token_kind=type_, transform_specials=True)
        helpers.save_to_disk(reply_pcd, reply_path)

    return post_pcd, reply_pcd


def conv_str_to_emb_idx(post_path, reply_path, posts, replies, word_idx, max_len=1000):
    """Map token lists to embedding indices, using dumps on disk as a cache.

    The cache is all-or-nothing: both dump files must exist for anything to be
    loaded. If either one is missing, posts and replies are both converted with
    ``nh.convert_str_to_emb_idx`` and both dumps are (re)written.

    Args:
        post_path: path of the post index dump file.
        reply_path: path of the reply index dump file.
        posts: token lists of the posts.
        replies: token lists of the replies.
        word_idx: token-to-index lookup forwarded to the converter.
        max_len: forwarded to the converter as the maximum length.

    Returns:
        Tuple ``(post_emb, reply_emb)``.
    """
    post_cache = Path(post_path)
    reply_cache = Path(reply_path)

    if post_cache.is_file() and reply_cache.is_file():
        return helpers.load_from_disk(post_path), helpers.load_from_disk(reply_path)

    post_emb = nh.convert_str_to_emb_idx(posts, word_idx, max_len)
    reply_emb = nh.convert_str_to_emb_idx(replies, word_idx, max_len)
    helpers.save_to_disk(post_emb, post_path)
    helpers.save_to_disk(reply_emb, reply_path)
    return post_emb, reply_emb


def get_labels(train_path, test_path, train, test):
    """Return the train/test labels as 64-bit integer tensors, cached on disk.

    The cache is all-or-nothing: both dump files must exist for anything to be
    loaded. If either one is missing, both label columns are converted and both
    dumps are (re)written. When the cache is used, ``train`` and ``test`` are
    ignored.

    Args:
        train_path: path of the train label dump file.
        test_path: path of the test label dump file.
        train: object exposing ``.values`` as a NumPy array (e.g. a pandas
            Series) with the training labels.
        test: same as ``train`` for the evaluation labels.

    Returns:
        Tuple ``(train, test)`` of ``torch`` tensors.
    """
    train_dump = Path(train_path)
    test_dump = Path(test_path)
    if not (train_dump.is_file() and test_dump.is_file()):
        # np.int64 instead of np.long: the alias does not exist in NumPy
        # 1.24-1.26 and otherwise follows the platform's C long (32 bit on
        # Windows), whereas a torch LongTensor always needs 64-bit integers.
        train = train.values.astype(dtype=np.int64, copy=False)
        test = test.values.astype(dtype=np.int64, copy=False)
        train = torch.from_numpy(train)
        test = torch.from_numpy(test)
        helpers.save_to_disk(train, train_path)
        helpers.save_to_disk(test, test_path)
    else:
        train = helpers.load_from_disk(train_path)
        test = helpers.load_from_disk(test_path)
    return train, test


def get_length_tensor(replies, posts):
    """Build length tensors for replies and posts via ``helpers.create_length_tensor``.

    Mind the ordering: the arguments are ``(replies, posts)`` but the result is
    ``(post_length, reply_length)``.
    """
    reply_length, post_length = [
        helpers.create_length_tensor(sequences) for sequences in (replies, posts)
    ]
    return post_length, reply_length

In [None]:
%%time
# Parse the training posts/replies, or load them from the dumps if present.
posts_train, reply_train = apply_spacy_pipeline('data/posts.pkl', 'data/replies.pkl', train)
# Alternative kept for reference: placeholders instead of the parsed documents
# (presumably usable when the downstream token dumps already exist -- TODO confirm).
# posts_train = None
# reply_train = None

In [None]:
%%time
# Same as for the training data, with dump names derived from `test_file`.
posts_test, reply_test = apply_spacy_pipeline('data/posts_' + test_file + '.pkl'
                                              , 'data/replies_' + test_file + '.pkl', test)
# Alternative kept for reference: placeholders instead of the parsed documents
# (presumably usable when the downstream token dumps already exist -- TODO confirm).
# reply_test = None
# posts_test = None

In [None]:
%%time
# Token texts using the 'lower_' token kind for train and test; each result is
# cached under the given dump path (test dumps are named after `test_file`).
post_conv_train, reply_conv_train = apply_token_to_x('data/post_lower.pkl', 'data/reply_lower.pkl'
                                                     , posts_train, reply_train, 'lower_')
post_conv_test, reply_conv_test = apply_token_to_x('data/post_lower_' + test_file + '.pkl'
                                                   , 'data/reply_lower_' + test_file + '.pkl'
                                                   , posts_test, reply_test, 'lower_')

%%time
# NOTE(review): this block follows a `%%time` line without its own cell marker,
# so it appears to be an inactive (raw) cell -- confirm before enabling.
# If executed it rebinds post_conv_* / reply_conv_* to the 'text' token kind,
# and its test dump paths are fixed names that do not depend on `test_file`.
post_conv_train, reply_conv_train = apply_token_to_x('data/post_text_train.pkl', 'data/reply_text_train.pkl'
                                                           , posts_train, reply_train, 'text')
post_conv_test, reply_conv_test = apply_token_to_x('data/post_text_test.pkl'
                                                         , 'data/reply_text_test.pkl'
                                                         , posts_test, reply_test, 'text')

%%time
# NOTE(review): this block follows a `%%time` line without its own cell marker,
# so it appears to be an inactive (raw) cell -- confirm before enabling.
# Frequency-filter each of the four token collections with min_freq=3; only the
# first element of each returned pair is kept.
post_feats_tr, _ = pp.filter_by_frequency(post_conv_train, min_freq=3)
reply_feats_tr, _ = pp.filter_by_frequency(reply_conv_train, min_freq=3)
post_feats_te, _ = pp.filter_by_frequency(post_conv_test, min_freq=3)
reply_feats_te, _ = pp.filter_by_frequency(reply_conv_test, min_freq=3)
complete_filtered = post_feats_te + post_feats_tr + reply_feats_te + reply_feats_tr
complete_filtered = helpers.flatten(complete_filtered)
counter_filtered = Counter(complete_filtered)
# Sums a 1 per distinct key, i.e. prints the number of distinct filtered tokens
# (the same value as len(counter_filtered)).
print(np.asarray([1 for k in counter_filtered]).sum())

In [None]:
# Token frequencies over all four collections (test and train, posts and replies).
complete_tokens = post_conv_test + reply_conv_test + post_conv_train + reply_conv_train
complete_tokens = helpers.flatten(complete_tokens)
counter = Counter(complete_tokens)
# Written unconditionally, so an existing dump is replaced on every run.
helpers.save_to_disk(counter, 'data/counter_lower.pkl')

In [None]:
# Preview only the first few converted posts; displaying the whole collection
# floods the notebook output and bloats the saved file.
post_conv_train[:5]

In [None]:
# Number of distinct tokens across the combined train and test collections.
print("Word types: ", len(counter))

In [None]:
%%time
# Pre-trained fastText vectors stored as CSV.
vector_file = 'data/word_vectors/fastText/ft_2M_300.csv'
# NOTE(review): int(2e6 - 1) == 1999999 and 300 are presumably the number of
# vectors to read and their dimensionality -- confirm against
# DataHandler.load_word_vectors.
word_list, vectors = dh.load_word_vectors(vector_file, int(2e6 - 1), 300)
# Lookup from word to its position in word_list, and the vectors as a tensor.
word_idx = helpers.idx_lookup_from_list(word_list)
vector_t = dh.conv_inner_to_tensor(vectors)

%%time
# Alternative embedding source: GoogleNews word2vec vectors loaded via gensim.
# NOTE(review): this block follows a `%%time` line without its own cell marker,
# so it appears to be an inactive (raw) cell. `gensim` is not imported in the
# import cell at the top, so enabling it as-is would raise NameError; it would
# also rebind word_idx / vector_t from the fastText cell.
vector_file = 'data/word_vectors/word2vec/GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(vector_file, binary=True)
word_idx = helpers.idx_lookup_from_list(model.index2word)
vector_t = dh.conv_inner_to_tensor(model.vectors)

In [None]:
%%time
# Build the vocabulary object from the token counter, the word-to-index lookup
# and the vector tensor; no size cap, and min_freq=1.
vocab = nh.create_tt_vocab_obj(counter, word_idx, vector_t, max_size=None, min_freq=1)

In [None]:
# Every vocabulary entry must have exactly one vector.
assert len(vocab.itos) == len(vocab.vectors)
# The vocabulary may hold at most one entry more than the number of counted
# tokens (presumably a single special token such as unknown/padding -- TODO confirm).
assert len(vocab.itos) <= 1 + len({w for w in counter if counter[w] >= 1})

In [None]:
# Count the vocabulary entries whose index differs from -1.
dict_ = vocab.stoi
length = sum(1 for k in dict_ if dict_[k] != -1)
length

# Compare the train and test vocabularies.
# NOTE(review): there is no cell marker in front of this block, so it may be an
# inactive (raw) cell or the tail of the previous cell -- confirm.
train_ = helpers.flatten(post_conv_train + reply_conv_train)
train_c = Counter(train_)
# train_ is rebound from the flattened tokens to the set of distinct tokens.
train_ = set(train_)
test_ = helpers.flatten(post_conv_test + reply_conv_test)
test_c = Counter(test_)
test_ = set(test_)
# Number of distinct training tokens that occur at least 3 times.
len({w for w in train_c if train_c[w] >=3})
# Tokens that occur in the test data but never in the training data.
test_.difference(train_)

In [None]:
%%time
# Convert token lists to vocabulary indices (vocab.stoi), passing max_post_len
# as max_len. The results are cached on disk; the train dumps use fixed names
# while the test dumps are named after `test_file`.
post_emb_train, reply_emb_train = conv_str_to_emb_idx('data/post_emb_train_lower.pkl'
                                                      , 'data/reply_emb_train_lower.pkl'
                                                      , post_conv_train
                                                      , reply_conv_train, vocab.stoi
                                                      , max_len=max_post_len)
post_emb_test, reply_emb_test = conv_str_to_emb_idx('data/post_emb_' + test_file + '_lower.pkl'
                                                    , 'data/reply_emb_' + test_file + '_lower.pkl'
                                                    , post_conv_test
                                                    , reply_conv_test, vocab.stoi
                                                    , max_len=max_post_len)

In [None]:
# Sanity check on the sample counts; only evaluated for the "test" split.
if test_file == "test":
    # NOTE(review): 196526 is an undocumented magic number (presumably a dataset
    # row count) -- confirm and move it to the configuration cell.
    data_count = 196526 + test_file_data_num
    assert len(reply_emb_train) + len(reply_emb_test) == data_count
    # Half as many posts as replies are expected (posts are taken from every
    # second row in apply_spacy_pipeline).
    assert len(post_emb_train) + len(post_emb_test) == data_count // 2

In [None]:
%%time
# Label tensors built from the 'sarcasm' column (or loaded from the dumps); the
# test dump is named after `test_file`.
train_labels, test_labels = get_labels('data/train_labels.pkl'
                                       , 'data/' + test_file + '_labels.pkl'
                                       , train['sarcasm'], test['sarcasm'])

In [None]:
# get_length_tensor returns (post_length, reply_length): index 0 holds the post
# lengths and index 1 the reply lengths.
train_dims = get_length_tensor(reply_emb_train, post_emb_train)
test_dims = get_length_tensor(reply_emb_test, post_emb_test)

In [None]:
# Histogram of the training reply lengths (index 1 of train_dims). The upper
# range bound 50 equals the current max_post_len but is hard-coded here.
_ = plt.hist(train_dims[1], bins=100, range=[0, 50])

In [None]:
# Pre-allocate the padded index tensors: one row per sample, max_post_len columns.
# .zero_() is required because torch.LongTensor(rows, cols) on its own returns
# uninitialised memory, so any position that is never written would otherwise
# hold arbitrary values instead of zero padding.
train_post_emb = torch.LongTensor(len(post_emb_train), max_post_len).zero_()
train_reply_emb = torch.LongTensor(len(reply_emb_train), max_post_len).zero_()

In [None]:
# Copy the post index sequences row by row into a zero-padded tensor.
# NOTE(review): the tensor is re-created here with a hard-coded 10 rows, which
# replaces the len(post_emb_train)-row allocation made earlier, so only the
# first 10 posts are copied. This looks like a debugging sample -- confirm
# whether the full set is intended.
train_post_emb = torch.LongTensor(10, max_post_len).zero_()
for i in range(len(train_post_emb)):
    end = len(post_emb_train[i])
    # Left-aligned copy; positions from `end` onward keep the zero padding.
    train_post_emb[i][0:end] = post_emb_train[i]

In [None]:
# Number of rows in the pre-allocated training reply tensor.
len(train_reply_emb)

In [None]:
# Number of reply index sequences in the evaluation set.
len(reply_emb_test)