In [1]:
from parseit.data import load_pickle, save_pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
from functools import lru_cache

pd.options.display.min_rows = 650
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 40

# ranksnl stopwords (large set) was chosen; read about stopwords here (and why nltk stopwords should not be chosen):
# https://scikit-learn.org/stable/modules/feature_extraction.html#stop-words
# https://www.aclweb.org/anthology/W18-2502/
def stopwords():
    filename = os.path.join(os.getcwd(), "datasets", "ranksnl_large.csv")
    stop_words = []
    with open(filename) as f:
        for line in f:
            for word in line.split(","):
                stop_words.append(word.lower().strip())
    return set(stop_words)

def ignoretokens():
    common_fractals = ["1/2", "1/3", "1/4"]
    low_numbers = [str(int) for int in range(0,10)]
    mid_numbers = [str(int) for int in range(10,100,10)]
    high_numbers = [str(int) for int in range(100,100100,100)]
    tokens = [',', '.', ';', ':', '"', '``', "''", '`', '*', '_', "&", "$", "!", "#", "%", "'", "”", "“", "’", "‘", "―", "—", "~", "–", "+", "/", "-", "\\", "(", ")", " ", "  ", "   ", "    ", "\n", "\t", "\r\n", "\r", "	", "<", ">", "?", "label", "="] + low_numbers + mid_numbers + high_numbers
    return ''.join(tokens)


def commonwords():
    filename = os.path.join(os.getcwd(), "datasets", "common-words.csv")
    common_words = []
    with open(filename) as f:
        for line in f:
            for word in line.split(","):
                common_words.append(word.lower().strip())
    return set(common_words)

# Implement our own tokenizer compatible with sklearn; we want to be able to define stopwords
class LemmaTokenizer:
    ignoretokens = ""
    stopwords = []
    def __init__(self, stopwords=[], ignoretokens=""):
        self.wnl = WordNetLemmatizer()
        self.stopwords = stopwords
        self.ignoretokens = ignoretokens
    def __call__(self, document):
        sig_words = []
        for word, tag in pos_tag(word_tokenize(document)):
            lower_cased_tag = tag[0].lower()
            word = word.strip(self.ignoretokens)
            wn_tag = lower_cased_tag if lower_cased_tag in ['a', 'r', 'n', 'v'] else None
            if not wn_tag:
                lemma = word
            else:
                lemma = self.wnl.lemmatize(word, wn_tag)
            if lemma not in list(self.stopwords):
                sig_words.append(lemma.lower())
        return sig_words

In [2]:
pickle_file_name = "data-16k-dec-3"
#pickle_file_name = "comments"
pickle_df = load_pickle(f"{pickle_file_name}.p")
print(f"This pickle has {len(pickle_df)} comments")

This pickle has 16840 comments


In [6]:
df = pickle_df.copy()

# Import stopwords and ignoretokens
swords = stopwords()
itokens = ignoretokens()
cwords = commonwords()

# Vocabulary
cva = CountVectorizer(lowercase=True, tokenizer=LemmaTokenizer())
asdf = cva.fit_transform(list(cwords))
vocabs10k = cva.get_feature_names()
#print(vocabs10k)

# bag of words
def create_bows_pickle(df):
    cvb = CountVectorizer(stop_words=swords, tokenizer=LemmaTokenizer(stopwords=list(swords), ignoretokens=itokens))
    bow_data = cvb.fit_transform(df.get("body"))
    bow_df = pd.DataFrame(bow_data.toarray(), columns=cvb.get_feature_names())
    print(f"Number of features (words): {len(cvb.get_feature_names())}")
    df = pd.concat([df, bow_df], axis=1, sort=False)
    return df, "bows"


# bag of bigrams
def create_bigrams_pickle(df):
    cvbi = CountVectorizer(ngram_range=(2,2), stop_words=swords)
    bobi_data = cvbi.fit_transform(df.get("body"))
    bobi_df = pd.DataFrame(bobi_data.toarray(), columns=cvbi.get_feature_names())
    print(f"Number of features (bigrams): {len(cvbi.get_feature_names())}")
    df = pd.concat([df, bobi_df], axis=1, sort=False)
    return df, "bigrams"


# bag of bigrams - 10000 common
def create_bigrams_6000_pickle(df):
    cvbi = CountVectorizer(ngram_range=(2,2), stop_words=swords, max_features=6000)
    #print(cvbi.get_feature_names())
    bobi_data = cvbi.fit_transform(df.get("body"))
    bobi_df = pd.DataFrame(bobi_data.toarray(), columns=cvbi.get_feature_names())
    print(f"Number of features (bigrams): {len(cvbi.get_feature_names())}")
    df = pd.concat([df, bobi_df], axis=1, sort=False)
    return df, "bigrams-6000"


# bag of TFIDF
def create_tfidf_pickle(df):
    tfidfv = TfidfVectorizer(stop_words=swords, tokenizer=LemmaTokenizer(stopwords=list(swords), ignoretokens=itokens))
    tfidf_data = tfidfv.fit_transform(df.get("body")) # this is a numpy sparse matrix
    tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_data, columns=tfidfv.get_feature_names()) # we can create a datafame from a sparse matrix directly 
    print(f"Number of features (tfidf): {len(tfidfv.get_feature_names())}")
    df = pd.concat([df, tfidf_df], axis=1, sort=False)
    return df, "tfidf"


# Uncomment one of below to save features to a new pickle:

#new_df, filename = create_bows_pickle(df)
#new_df, filename = create_bigrams_pickle(df)
#new_df, filename = create_bigrams_6000_pickle(df)
new_df, filename = create_tfidf_pickle(df)

display(new_df)

# -------

save_pickle(new_df, f"{pickle_file_name}-{filename}.p")
print(f"Pickle for {filename} saved.")





Number of features (tfidf): 29211


Unnamed: 0,body,subreddit,submission,label,Unnamed: 5,,@,[,],^,^^,^_^,^golf,^mor,^nic,^t,c'est,c'mon,c-list,c-section,...,🙄,🙄🙄🙄🙄,🙌👏,🤔,🤘,🤣,🤦‍♂️,🤦🏻‍♀️,🤦🏻‍♂️,🤮,🤯,🤷,🤷‍♀️,🤷🏻‍♀️,🥰,🥳,🥳🥳🥳,🥴,🥺,🥺😭🥺😭🥺💕
0,*What if Jod was one of us?*,askreddit,"What if God came down one day and said ""It's p...",429,0.327539,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A large chunk of my taking the lord's name in ...,askreddit,"What if God came down one day and said ""It's p...",41259,0.156054,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Do you want a Holy War? Because that's how you...,askreddit,"What if God came down one day and said ""It's p...",26069,0.253232,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"He would also say, “my name isn’t God” then he...",askreddit,"What if God came down one day and said ""It's p...",6449,0.324086,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"We’d be like, “No way!” and he’d be like, “Yah...",askreddit,"What if God came down one day and said ""It's p...",20046,0.746708,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,I would be converted\n\n\nEdit: thanks for the...,askreddit,"What if God came down one day and said ""It's p...",6577,0.249733,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,What if Satan came down one day and said ''It'...,askreddit,"What if God came down one day and said ""It's p...",5263,0.177232,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Then all of Scandinavia will pronounce it like...,askreddit,"What if God came down one day and said ""It's p...",15799,0.198076,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,You'd have different religious factions arguin...,askreddit,"What if God came down one day and said ""It's p...",15158,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Does that mean Jesus would be pronounced Geezus?,askreddit,"What if God came down one day and said ""It's p...",22978,0.144981,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Pickle for tfidf saved.
