In [55]:
from parseit.data import load_pickle, save_pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
from functools import lru_cache

pd.options.display.min_rows = 650
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 20

# ranksnl stopwords (large set) was chosen; read about stopwords here (and why nltk stopwords should not be chosen):
# https://scikit-learn.org/stable/modules/feature_extraction.html#stop-words
# https://www.aclweb.org/anthology/W18-2502/
def stopwords():
    filename = os.path.join(os.getcwd(), "datasets", "ranksnl_large.csv")
    stop_words = []
    with open(filename) as f:
        for line in f:
            for word in line.split(","):
                stop_words.append(word.lower().strip())
    return set(stop_words)

def ignoretokens():
    tokens = [',', '.', ';', ':', '"', '``', "''", '`', '[removed]', '>', '*', '_', "&", "$", "!", "#", "%", "'", "”", "“", "’", "‘", "―", "—", "~", "–", "+", "/", "-", "\\", "(", ")", " ", "  " , "\n", "\t", "\r\n", "\r", "	"]
    return ''.join(tokens)

# Implement our own tokenizer compatible with sklearn; we want to be able to define stopwords
class LemmaTokenizer:
    ignoretokens = ""
    stopwords = []
    def __init__(self, stopwords=[], ignoretokens=""):
        self.wnl = WordNetLemmatizer()
        self.stopwords = stopwords
        self.ignoretokens = ignoretokens
    def __call__(self, document):
        sig_words = []
        for word, tag in pos_tag(word_tokenize(document)):
            lower_cased_tag = tag[0].lower()
            word = word.strip(self.ignoretokens)
            wn_tag = lower_cased_tag if lower_cased_tag in ['a', 'r', 'n', 'v'] else None
            if not wn_tag:
                lemma = word
            else:
                lemma = self.wnl.lemmatize(word, wn_tag)
            if lemma not in list(self.stopwords):
                sig_words.append(lemma.lower())
        return sig_words

In [56]:
pickle_file_name = "comments"
pickle_df = load_pickle(f"{pickle_file_name}.p")
pickle_df = pickle_df[pickle_df["body"] != "[removed]"]
pickle_df = pickle_df.reset_index()

In [62]:
df = pickle_df.copy()

swords = stopwords()
itokens = ignoretokens()

# bag of words
cvb = CountVectorizer(stop_words=swords, tokenizer=LemmaTokenizer(stopwords=list(swords), ignoretokens=itokens))
bow_data = cvb.fit_transform(df.get("body"))
bow_df = pd.DataFrame(bow_data.toarray(), columns=cvb.get_feature_names())

print(f"Features (bow): {len(cvb.get_feature_names())}")
df_bow = pd.concat([df, bow_df], axis=1, sort=False)


# bigram of words
cvbi = CountVectorizer(ngram_range=(2,2), stop_words=swords, tokenizer=LemmaTokenizer(stopwords=list(swords), ignoretokens=itokens))
bobi_data = cvbi.fit_transform(df.get("body"))
bobi_df = pd.DataFrame(bobi_data.toarray(), columns=cvbi.get_feature_names())

print(f"Features (bobigrams): {len(cvbi.get_feature_names())}")
df_bobi = pd.concat([df, bobi_df], axis=1, sort=False)

#df_bow
#df_bowi


Features (bow): 5021
Features (bobigrams): 18924
