In [8]:
from parseit.data import load_pickle, save_pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
from functools import lru_cache

pd.options.display.min_rows = 650
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 40

# ranksnl stopwords (large set) was chosen; read about stopwords here (and why nltk stopwords should not be chosen):
# https://scikit-learn.org/stable/modules/feature_extraction.html#stop-words
# https://www.aclweb.org/anthology/W18-2502/
def stopwords():
    filename = os.path.join(os.getcwd(), "datasets", "ranksnl_large.csv")
    stop_words = []
    with open(filename) as f:
        for line in f:
            for word in line.split(","):
                stop_words.append(word.lower().strip())
    return set(stop_words)

def ignoretokens():
    common_fractals = ["1/2", "1/3", "1/4"]
    low_numbers = [str(int) for int in range(0,10)]
    mid_numbers = [str(int) for int in range(10,100,10)]
    high_numbers = [str(int) for int in range(100,100100,100)]
    tokens = [',', '.', ';', ':', '"', '``', "''", '`', '[removed]', '>', '*', '_', "&", "$", "!", "#", "%", "'", "”", "“", "’", "‘", "―", "—", "~", "–", "+", "/", "-", "\\", "(", ")", " ", "  ", "   ", "    ", "\n", "\t", "\r\n", "\r", "	", "<", ">", "?", "label", "="] + low_numbers + mid_numbers + high_numbers
    return ''.join(tokens)

# Implement our own tokenizer compatible with sklearn; we want to be able to define stopwords
class LemmaTokenizer:
    ignoretokens = ""
    stopwords = []
    def __init__(self, stopwords=[], ignoretokens=""):
        self.wnl = WordNetLemmatizer()
        self.stopwords = stopwords
        self.ignoretokens = ignoretokens
    def __call__(self, document):
        sig_words = []
        for word, tag in pos_tag(word_tokenize(document)):
            lower_cased_tag = tag[0].lower()
            word = word.strip(self.ignoretokens)
            wn_tag = lower_cased_tag if lower_cased_tag in ['a', 'r', 'n', 'v'] else None
            if not wn_tag:
                lemma = word
            else:
                lemma = self.wnl.lemmatize(word, wn_tag)
            if lemma not in list(self.stopwords):
                sig_words.append(lemma.lower())
        return sig_words

In [9]:
pickle_file_name = "comments"
pickle_df = load_pickle(f"{pickle_file_name}.p")
pickle_df = pickle_df[pickle_df["body"] != "[removed]"]
pickle_df = pickle_df.reset_index()

In [12]:
df = pickle_df.copy()

# Import stopwords and ignoretokens
swords = stopwords()
itokens = ignoretokens()

# bag of words
def create_bows_pickle(df):
    cvb = CountVectorizer(stop_words=swords, tokenizer=LemmaTokenizer(stopwords=list(swords), ignoretokens=itokens))
    bow_data = cvb.fit_transform(df.get("body"))
    bow_df = pd.DataFrame(bow_data.toarray(), columns=cvb.get_feature_names())
    print(f"Number of features (words): {len(cvb.get_feature_names())}")
    df = pd.concat([df, bow_df], axis=1, sort=False)
    return df, "bows"


# bag of bigrams
def create_bigrams_pickle(df):
    cvbi = CountVectorizer(ngram_range=(2,2), stop_words=swords, tokenizer=LemmaTokenizer(stopwords=list(swords), ignoretokens=itokens))
    bobi_data = cvbi.fit_transform(df.get("body"))
    bobi_df = pd.DataFrame(bobi_data.toarray(), columns=cvbi.get_feature_names())
    print(f"Number of features (bigrams): {len(cvbi.get_feature_names())}")
    df = pd.concat([df, bobi_df], axis=1, sort=False)
    return df, "bigrams"


# bag of TFIDF
def create_tfidf_pickle(df):
    tfidfv = TfidfVectorizer(stop_words=swords, tokenizer=LemmaTokenizer(stopwords=list(swords), ignoretokens=itokens))
    tfidf_data = tfidfv.fit_transform(df.get("body")) # this is a numpy sparse matrix
    tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_data, columns=tfidfv.get_feature_names()) # we can create a datafame from a sparse matrix directly 
    print(f"Number of features (tfidf): {len(tfidfv.get_feature_names())}")
    df = pd.concat([df, tfidf_df], axis=1, sort=False)
    return df, "tfidf"


# Uncomment one of below to save features to a new pickle:

df, filename = create_bows_pickle(df)
#df, filename = create_bigrams_pickle(df)
#df, filename = create_tfidf_pickle(df)

# -------

save_pickle(df, f"{pickle_file_name}-{filename}.p")
print(f"Pickle for {filename} saved.")

df



Number of features (words): 4172
Pickle for bows saved.


Unnamed: 0,index,body,subreddit,submission,label,Unnamed: 6,cabin,cable,cacy,cades,cage,cajun,calamity,calculat,californi,call,calmly,cals,cambodi,camera,...,zapp,zark,zealan,zen,zing,zip,zombi,zombie,zon,zores,zos,zoufaly,°,♥️,❤️,🎵,🖕trump,🤔,🤦🏻‍♀️,🥰🙌
0,0,This is the canonical explanation for the TP h...,AskReddit,You go to sleep on the 31st of December 2020 a...,20399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,Sell all the stocks in February and buy Tesla ...,AskReddit,You go to sleep on the 31st of December 2020 a...,37791,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,"idk, cry, invest, reset, repeat.",AskReddit,You go to sleep on the 31st of December 2020 a...,43314,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,Don't have sex with so-and-so because they hav...,AskReddit,You go to sleep on the 31st of December 2020 a...,9216,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,Start a twitch/youtube channel that predicts 2...,AskReddit,You go to sleep on the 31st of December 2020 a...,45018,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,5,I would go to the doctor in early January to c...,AskReddit,You go to sleep on the 31st of December 2020 a...,10035,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,6,I'm gonna be real sad because I just spent the...,AskReddit,You go to sleep on the 31st of December 2020 a...,31027,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,7,Get a haircut in February.,AskReddit,You go to sleep on the 31st of December 2020 a...,63821,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,8,Tell Nevada to start counting mail in ballots ...,AskReddit,You go to sleep on the 31st of December 2020 a...,12816,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,9,Call my brother since he died in the first wee...,AskReddit,You go to sleep on the 31st of December 2020 a...,8362,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
