In [None]:
import os

from scipy.stats import spearmanr, ttest_ind

# Make the project root the working directory so that the `src` package
# imported below can be resolved.
# NOTE(review): assumes the kernel starts two levels below the root and that
# `src` sits directly in the root -- confirm against the repository layout.
# The guard keeps the cell idempotent: once `src` is visible from the current
# directory, re-running the cell no longer climbs two more levels.
path = os.path.realpath(os.curdir if os.path.isdir('src') else os.path.join('..', '..'))
os.chdir(path)

from src.preprocessing.datahandler import DataHandler
from spacy.lang.en.stop_words import STOP_WORDS
from src.tools.config import Config
from spacy.tokens import Token
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import spacy


In [None]:
# Resolve the two stop-word output files inside the configured data folder.
# Both file names are kept as plain strings.
data_dir = Path(Config.path.data_folder)
sw_file_full, sw_file_cut = (
    str(data_dir.joinpath(file_name))
    for file_name in ('stop_words_full_ultra.txt', 'stop_words_cut_ultra.txt')
)

In [None]:
# Load the spaCy pipeline that is used further down to parse the replies.
nlp = spacy.load('en_core_web_md')
# Project data handler; it is pointed at the data folder resolved above.
dh = DataHandler()
dh.load_train_test(str(data_dir))
# NOTE(review): deep_copy=False presumably hands back the handler's own frame
# rather than a copy, so column assignments on `train` may also be visible
# through `dh` -- confirm against DataHandler.
train = dh.get_train_df(deep_copy=False)

In [None]:
# Replace HTML-escaped ampersands by the word "and" in both text columns.
# The full entity ('&amp;') is handled first, then any remainder that lost
# its leading '&' ('amp;').
for column in ('post', 'reply'):
    train[column] = train[column].map(
        lambda text: text.replace('&amp;', ' and ').replace('amp;', ' and ')
    )

In [None]:
# Replace HTML non-breaking-space entities by a plain space in both text
# columns.  The full entity ('&nbsp;') is handled first, then any remainder
# that lost its leading '&' ('nbsp;').
for column in ('post', 'reply'):
    train[column] = train[column].map(
        lambda text: text.replace('&nbsp;', ' ').replace('nbsp;', ' ')
    )

In [None]:
# Hand-maintained list of additional stop words (contraction fragments,
# single letters, function words, ...).  It is merged with spaCy's default
# stop words further down.  Spellings are reproduced exactly as given.
stop_words_ultra = {
    "n't", "'s", "'m", "'re", "'ve", "'ll", "'d", "a", "a's", "able", "about", "above", "abroad", "acc", "acc.",
    "according", "accordingly", "across", "actually", "ad.", "after", "afterwards", "again",
    "against", "ago", "ah", "aha", "ahead", "ain't", "all", "allow", "allows",
    "almost", "alone", "along", "alongside", "already", "also", "although", "always",
    "am", "amid", "amidst", "among", "amongst", "amoungst", "amount", "an", "and",
    "another", "any", "anybody", "anyhow", "anyone", "anything", "anyway", "anyways",
    "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "aren't",
    "around", "as", "aside", "ask", "asking", "associated", "at", "available", "away",
    "awfully", "b", "back", "backward", "backwards", "be", "became", "because",
    "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "behind",
    "being", "believe", "below", "beside", "besides", "best", "better", "between",
    "beyond", "bill", "both", "bottom", "brief", "but", "by", "c", "c'mon", "c's",
    "call", "came", "can", "can't", "cannot", "cant", "caption", "cause", "causes",
    "certain", "certainly", "cetera", "changes", "clearly", "co", "co-", "co.", "com",
    "come", "comes", "computer", "con", "concerning", "consequently", "consider",
    "considering", "contain", "containing", "contains", "corresponding", "could",
    "could've", "couldn't", "couldnt", "course", "cry", "currently", "d", "dare",
    "daren't", "de", "dear", "definitely", "describe", "described", "despite", "detail",
    "did", "didn't", "different", "directly", "do", "does", "doesn't", "doing", "don",
    "don't", "done", "down", "downwards", "due", "during", "e", "e.g.", "each", "edu",
    "eg", "eight", "eighty", "either", "eleven", "else", "elsewhere", "empty", "end",
    "ending", "enough", "entirely", "especially", "est", "et", "etc", "etc.", "etcetera",
    "even", "ever", "evermore", "every", "everybody", "everyone", "everything",
    "everywhere", "ex", "exactly", "example", "except", "f", "fairly", "far", "farther",
    "few", "fewer", "fifteen", "fifth", "fifty", "fify", "fill", "find", "fire", "first",
    "five", "followed", "following", "follows", "for", "forever", "former", "formerly",
    "forth", "forty", "forward", "found", "four", "from", "front", "full", "further",
    "furthermore", "g", "get", "gets", "getting", "give", "given", "gives", "go", "goes",
    "going", "gone", "got", "gotten", "greetings", "h", "had", "hadn't", "half",
    "happens", "hardly", "has", "hasn't", "hasnt", "have", "haven't", "having", "he",
    "he'd", "he'll", "he's", "hello", "help", "hence", "her", "here", "here's",
    "hereafter", "hereby", "herein", "hereupon", "hers", "herse", "herself", "hi",
    "him", "himse", "himself", "his", "hither", "hm", "hmm", "hmmm", "hopefully",
    "how", "how'd", "how'll", "how's", "howbeit", "however", "hundred", "i", "i.e.",
    "i'd", "i'll", "i'm", "i've", "id", "ie", "if", "ignored", "immediate", "in",
    "inasmuch", "inc", "inc.", "indeed", "indicate", "indicated", "indicates", "inner",
    "inside", "insofar", "instead", "interest", "into", "inward", "is", "isn't", "it",
    "it'd", "it'll", "it's", "its", "itse", "itself", "j", "just", "k", "keep",
    "keeps", "kept", "know", "known", "knows", "l", "last", "lately", "later", "latter",
    "latterly", "least", "less", "lest", "let", "let's", "like", "liked", "likely",
    "likewise", "little", "ll", "look", "looking", "looks", "low", "lower", "ltd",
    "m", "made", "mainly", "make", "makes", "many", "may", "maybe", "mayn't", "me",
    "mean", "meantime", "meanwhile", "merely", "might", "might've", "mightn't",
    "mill", "mine", "minus", "miss", "more", "moreover", "most", "mostly", "move",
    "mr", "mrs", "much", "must", "must've", "mustn't", "my", "myse", "myself", "n",
    "name", "namely", "nd", "near", "nearly", "necessary", "need", "needn't", "needs",
    "neither", "never", "neverf", "neverless", "nevertheless", "new", "next", "nine",
    "ninety", "no", "no-one", "nobody", "non", "none", "nonetheless", "noone", "nor",
    "normally", "not", "nothing", "notwithstanding", "novel", "now", "nowhere", "o",
    "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on", "once", "one",
    "one's", "ones", "only", "onto", "opposite", "or", "other", "others", "otherwise",
    "ought", "oughtn't", "our", "ours", "ourselves", "out", "outside", "over", "overall",
    "own", "p", "part", "particular", "particularly", "past", "per", "perhaps", "placed",
    "please", "plus", "possible", "presumably", "probably", "provided", "provides", "put",
    "q", "que", "quite", "qv", "r", "rather", "rd", "re", "really", "reasonably",
    "recent", "recently", "regarding", "regardless", "regards", "relatively",
    "respectively", "right", "round", "s", "said", "same", "saw", "say", "saying",
    "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming",
    "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously",
    "seven", "several", "shall", "shan", "shan't", "she", "she'd", "she'll", "she's",
    "should", "should've", "shouldn", "shouldn't", "show", "side", "since", "sincere",
    "six", "sixty", "so", "some", "somebody", "someday", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry",
    "specified", "specify", "specifying", "still", "sub", "such", "sup", "sure",
    "system", "t", "t's", "take", "taken", "taking", "tell", "ten", "tends", "th",
    "than", "thank", "thanks", "thanx", "that", "that'll", "that's", "that've",
    "thats", "the", "thee", "their", "theirs", "them", "themselves", "then", "thence",
    "there", "there'd", "there'll", "there're", "there's", "there've", "thereafter",
    "thereby", "therefore", "therein", "theres", "thereupon", "these", "they",
    "they'd", "they'll", "they're", "they've", "thick", "thin", "thing", "things",
    "think", "third", "thirty", "this", "thorough", "thoroughly", "those", "thou",
    "though", "three", "through", "throughout", "thru", "thus", "thx", "thy", "till",
    "tis", "to", "together", "too", "took", "top", "toward", "towards", "tried", "tries",
    "truly", "try", "trying", "twas", "twelve", "twenty", "twice", "two", "u", "un",
    "under", "underneath", "undoing", "unfortunately", "unless", "unlike", "unlikely",
    "until", "unto", "up", "upon", "upwards", "us", "use", "used", "useful", "uses",
    "using", "usually", "uucp", "v", "value", "various", "ve", "versus", "very", "via",
    "viz", "vs", "w", "want", "wants", "was", "wasn't", "way", "we", "we'd", "we'll",
    "we're", "we've", "welcome", "well", "went", "were", "weren't", "what", "what'd",
    "what'll", "what's", "what've", "whatever", "when", "when'd", "when'll", "when's",
    "whence", "whenever", "where", "where'd", "where'll", "where's", "whereafter",
    "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which",
    "whichever", "while", "whilst", "whither", "who", "who'd", "who'll", "who's",
    "whoever", "whole", "whom", "whomever", "whose", "why", "why'd", "why'll", "why's",
    "will", "willing", "wish", "with", "within", "without", "won't", "wonder", "would",
    "would've", "wouldn't", "x", "y", "yeah", "yes", "yet", "you", "you'd", "you'll",
    "you're", "you've", "your", "yours", "yourself", "yourselves", "z", "zero",
}

In [None]:
# Merge spaCy's stop words with the custom list.  The result is a new set
# bound to the notebook-level name `STOP_WORDS`.
STOP_WORDS = STOP_WORDS | stop_words_ultra

In [None]:
# Number of entries currently in the stop-word set.
len(STOP_WORDS)

In [None]:
def stop_words_getter(token):
    """Tell whether ``token`` counts as a stop word.

    A token qualifies when spaCy's built-in ``is_stop`` flag is set, or when
    its lower-cased text or its lemma is contained in the notebook-level
    ``STOP_WORDS`` set.  ``STOP_WORDS`` is looked up at call time, so later
    changes to that name are reflected in the result.
    """
    return token.is_stop or token.lower_ in STOP_WORDS or token.lemma_ in STOP_WORDS


# Exposed as `token._.is_stop`.  force=True allows the cell to be re-run:
# without it spaCy refuses to register an extension name that already exists.
Token.set_extension('is_stop', getter=stop_words_getter, force=True)

In [None]:
# Restrict the training frame to the rows whose `sarcasm` label equals 1.
is_sarcastic = train['sarcasm'].eq(1)
df = train[is_sarcastic]

In [None]:
%%time
doc_list = [doc for doc in nlp.pipe(df['reply'], n_threads=-1, batch_size=500)]

In [None]:
# One zero-initialised counter per stop word; filled from the sarcastic
# replies in the following cell.
sarc_stop_words = dict.fromkeys(STOP_WORDS, 0)

In [None]:
# Walk over every token of the parsed replies.  `sarc_words` is the total
# token count; each stop-word token is credited to its lower-cased text when
# that is a known key, otherwise to its lemma.  Stop-word tokens matching
# neither key only contribute to the total.
sarc_words = 0
for parsed_reply in doc_list:
    for tok in parsed_reply:
        sarc_words += 1
        if not tok._.is_stop:
            continue
        if tok.lower_ in sarc_stop_words:
            sarc_stop_words[tok.lower_] += 1
            continue
        if tok.lemma_ in sarc_stop_words:
            sarc_stop_words[tok.lemma_] += 1

In [None]:
# Restrict the training frame to the rows whose `sarcasm` label equals 0.
# Note that this rebinds `df`, which previously held the sarcastic subset.
is_not_sarcastic = train['sarcasm'].eq(0)
df = train[is_not_sarcastic]

In [None]:
%%time
doc_list = [doc for doc in nlp.pipe(df['reply'], n_threads=-1, batch_size=500)]

In [None]:
# One zero-initialised counter per stop word; filled from the non-sarcastic
# replies in the following cell.
norm_stop_words = dict.fromkeys(STOP_WORDS, 0)

In [None]:
# Walk over every token of the parsed replies.  `norm_words` is the total
# token count; each stop-word token is credited to its lower-cased text when
# that is a known key, otherwise to its lemma.
# Membership is tested against `norm_stop_words` itself -- the dictionary
# that is incremented -- so this cell does not depend on the sarcastic
# counter having the same keys.
norm_words = 0
for doc in doc_list:
    for token in doc:
        norm_words += 1
        if token._.is_stop:
            if token.lower_ in norm_stop_words:
                norm_stop_words[token.lower_] += 1
            elif token.lemma_ in norm_stop_words:
                norm_stop_words[token.lemma_] += 1

In [None]:
# Number of whitespace-separated words in the replies of the current subset.
# Each reply is split first: iterating a string directly yields characters,
# which would make this a character count rather than a word count.
word_count = sum(len(row.split()) for row in df['reply'])

In [None]:
# Total number of tokens seen by the `norm_words` counting loop.
norm_words

In [None]:
# Total number of tokens seen by the `sarc_words` counting loop.
sarc_words

In [None]:
# Fixed ordering of the stop words.  The arrays and the keep mask built later
# are aligned with it by position, so a snapshot list is taken instead of a
# live dictionary view that would follow later changes to the dictionary.
key_list = list(sarc_stop_words)

In [None]:
# Per-stop-word counts as float arrays, both ordered like `key_list` so that
# equal positions refer to the same stop word.
sarc = np.fromiter((sarc_stop_words[stop_word] for stop_word in key_list), dtype=np.float64)
norm = np.fromiter((norm_stop_words[stop_word] for stop_word in key_list), dtype=np.float64)

In [None]:
# Mean and (population) standard deviation of the raw counts of both groups.
norm_mean, norm_std = np.mean(norm), np.std(norm)
sarc_mean, sarc_std = np.mean(sarc), np.std(sarc)

In [None]:
def min_max_scale(values):
    """Rescale ``values`` linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    values : numpy.ndarray
        Non-empty numeric array.

    Returns
    -------
    numpy.ndarray
        Array of the same shape.  When all entries are equal the range is
        zero; an all-zero array is returned in that case instead of dividing
        by zero, which would fill the result with NaN.
    """
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        return np.zeros_like(values)
    return (values - lowest) / span


# Min-max normalised counts, so both groups live on the same [0, 1] scale.
sarc_nld = min_max_scale(sarc)
norm_nld = min_max_scale(norm)

In [None]:
# Spearman rank correlation and independent two-sample t-test between the raw
# per-stop-word counts of the two groups; each result is printed as computed.
for label, statistic in (("Spearman:  ", spearmanr), ("t-test ind:", ttest_ind)):
    print(label, statistic(sarc, norm))

In [None]:
# Histogram of the raw per-stop-word counts in `sarc`.  Title and axis labels
# are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(sarc, bins='auto')
ax.set(title='Stop-word counts (sarcastic replies)',
       xlabel='occurrences of a stop word',
       ylabel='number of stop words')
plt.show()

In [None]:
# Histogram of the min-max normalised counts in `sarc_nld`.  Title and axis
# labels are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(sarc_nld, bins='auto')
ax.set(title='Min-max normalised stop-word counts (sarcastic replies)',
       xlabel='normalised occurrences of a stop word',
       ylabel='number of stop words')
plt.show()

In [None]:
# Histogram of the raw per-stop-word counts in `norm`.  Title and axis labels
# are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(norm, bins='auto')
ax.set(title='Stop-word counts (non-sarcastic replies)',
       xlabel='occurrences of a stop word',
       ylabel='number of stop words')
plt.show()

In [None]:
# Histogram of the min-max normalised counts in `norm_nld`.  Title and axis
# labels are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(norm_nld, bins='auto')
ax.set(title='Min-max normalised stop-word counts (non-sarcastic replies)',
       xlabel='normalised occurrences of a stop word',
       ylabel='number of stop words')
plt.show()

In [None]:
# Absolute gap, per stop word, between the normalised counts of both groups,
# followed by its histogram.  Title and axis labels are set so the figure can
# be read on its own.
sn_diff = np.abs(sarc_nld - norm_nld)
fig, ax = plt.subplots()
ax.hist(sn_diff, bins='auto')
ax.set(title='Absolute difference of normalised stop-word counts',
       xlabel='|sarcastic - non-sarcastic| (normalised)',
       ylabel='number of stop words')
plt.show()

In [None]:
# Mean of the normalised sarcastic counts; the same quantity is used as the
# cut-off when `keep_ar` is built.
sarc_nld.mean()

In [None]:
# Mean of the normalised non-sarcastic counts, shown for comparison.
norm_nld.mean()

In [None]:
# Flag a stop word when the gap between its normalised sarcastic and
# non-sarcastic counts exceeds the mean normalised sarcastic count.
# - The cut-off is computed once instead of once per element.
# - The gap is derived from the normalised arrays directly rather than read
#   from `sn_diff`, because a later cell rebinds `sn_diff` to the difference
#   of the filtered arrays; re-running this cell afterwards would otherwise
#   produce a mask that no longer lines up with `key_list`.
# The result stays a plain list of Python booleans, ordered like `key_list`.
keep_threshold = sarc_nld.mean()
keep_ar = (np.abs(sarc_nld - norm_nld) > keep_threshold).tolist()

In [None]:
def _flagged(values):
    """Yield the entries of ``values`` whose position is marked in ``keep_ar``."""
    for position, value in enumerate(values):
        if keep_ar[position]:
            yield value


# Raw counts restricted to the flagged stop words, in unchanged order.
norm2 = np.fromiter(_flagged(norm), dtype=np.float64)
sarc2 = np.fromiter(_flagged(sarc), dtype=np.float64)

In [None]:
# Histogram of the raw counts in `norm2` (flagged stop words only).  Title and
# axis labels are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(norm2, bins='auto')
ax.set(title='Counts of flagged stop words (non-sarcastic replies)',
       xlabel='occurrences of a stop word',
       ylabel='number of stop words')
plt.show()

In [None]:
# Histogram of the raw counts in `sarc2` (flagged stop words only).  Title and
# axis labels are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(sarc2, bins='auto')
ax.set(title='Counts of flagged stop words (sarcastic replies)',
       xlabel='occurrences of a stop word',
       ylabel='number of stop words')
plt.show()

In [None]:
# Same two statistics as for the full arrays, now restricted to the flagged
# stop words; each result is printed as soon as it is computed.
for label, statistic in (("Spearman:  ", spearmanr), ("t-test ind:", ttest_ind)):
    print(label, statistic(sarc2, norm2))

In [None]:
# Absolute gap between the raw counts of the flagged stop words, followed by
# its histogram.  NOTE: this rebinds `sn_diff`, which until here held the gap
# between the normalised counts of *all* stop words.
sn_diff = np.abs(sarc2 - norm2)
fig, ax = plt.subplots()
ax.hist(sn_diff, bins='auto')
ax.set(title='Absolute count difference of flagged stop words',
       xlabel='|sarcastic - non-sarcastic| occurrences',
       ylabel='number of stop words')
plt.show()

In [None]:
# Collect the stop words whose position is flagged in `keep_ar`, preserving
# the order of `key_list`.
words_to_keep = []
for position, stop_word in enumerate(key_list):
    if keep_ar[position]:
        words_to_keep.append(stop_word)

In [None]:
# Show the flagged stop words; a later cell removes them from STOP_WORDS.
words_to_keep

In [None]:
# Write the stop-word set to `sw_file_full`, one word per line.
# - sorted(): set iteration order is not stable between runs, so sorting
#   makes the file reproducible.
# - encoding='utf-8': np.savetxt otherwise encodes as Latin-1, which fails as
#   soon as a stop word contains a character outside that range (e.g. a
#   typographic apostrophe).
np.savetxt(sw_file_full, np.asarray(sorted(STOP_WORDS)), fmt='%s', encoding='utf-8')

In [None]:
# Size of the stop-word set before the flagged words are removed.
len(STOP_WORDS)

In [None]:
# Remove the flagged words from the stop-word set, in place.
# `difference_update` simply skips words that are not (or no longer) present,
# so the cell can be re-run safely and needs no catch-all `except` that could
# hide unrelated errors.
STOP_WORDS.difference_update(words_to_keep)

In [None]:
# Size of the stop-word set after the flagged words have been removed.
len(STOP_WORDS)

In [None]:
# Write the current stop-word set to `sw_file_cut`, one word per line.
# - sorted(): set iteration order is not stable between runs, so sorting
#   makes the file reproducible.
# - encoding='utf-8': np.savetxt otherwise encodes as Latin-1, which fails as
#   soon as a stop word contains a character outside that range (e.g. a
#   typographic apostrophe).
np.savetxt(sw_file_cut, np.asarray(sorted(STOP_WORDS)), fmt='%s', encoding='utf-8')

In [None]:
# Display the per-stop-word counters of the non-sarcastic replies.
# NOTE(review): this prints the whole dictionary (one entry per stop word).
norm_stop_words