In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, sigmoid_kernel, rbf_kernel, polynomial_kernel, laplacian_kernel, cosine_similarity
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
from parseit.data import load_pickle, save_pickle
import pandas as pd

pd.options.display.min_rows = 650
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 20

# ranksnl stopwords (large set) was chosen; read about stopwords here (and why nltk stopwords should not be chosen):
# https://scikit-learn.org/stable/modules/feature_extraction.html#stop-words
# https://www.aclweb.org/anthology/W18-2502/
def stopwords():
    filename = os.path.join(os.getcwd(), "datasets", "ranksnl_large.csv")
    print(filename)
    stop_words = []
    with open(filename) as f:
        for line in f:
            for word in line.split(","):
                stop_words.append(word.lower().strip())
    return set(stop_words)

# Not the same ignore tokens as in nb-features - because we want to check for smilies here
def ignoretokens():
    common_fractals = ["1/2", "1/3", "1/4"]
    low_numbers = [str(int) for int in range(0,10)]
    mid_numbers = [str(int) for int in range(10,100,10)]
    high_numbers = [str(int) for int in range(100,100100,100)]
    tokens = [',', '.', '"', '``', "''", '`', '*', '_', "&", "$", "!", "#", "%", "'", "”", "“", "’", "‘", "―", "—", "~", "–", "/", "  ", "   ", "    ", "\n", "\t", "\r\n", "\r", "	", "?"] + low_numbers + mid_numbers + high_numbers
    return tokens

# There's many lists for bad words (for spam deteciton, moderation, etc). This is one of the largest I've found so far:
# https://www.freewebheaders.com/bad-words-list-and-page-moderation-words-list-for-facebook/
def badwords():
    filename = os.path.join(os.getcwd(), "datasets", "fb-bad-words.csv")
    bad_words = []
    with open(filename) as f:
        for line in f:
            for word in line.split(","):
                bad_words.append(word.lower().strip())
    return set(bad_words)

def commonwords():
    import os
    filename = os.path.join(os.getcwd(), "datasets", "common-words.csv")
    common_words = []
    with open(filename) as f:
        for line in f:
            for word in line.split(","):
                common_words.append(word.lower().strip())
    return set(common_words)


def names():
    import os
    root_path = f"/home/halpdesk/CODE/reddit-parser"
    filename = os.path.join(os.getcwd(), "datasets", "first_names_all.csv")
    first_names = []
    with open(filename) as f:
        for line in f:
            for word in line.split(","):
                first_names.append(word.strip())
    filename = os.path.join(os.getcwd(), "datasets", "last_names_all.csv")
    last_names = []
    with open(filename) as f:
        for line in f:
            for word in line.split(","):
                last_names.append(word.strip())
        
    return set(first_names + last_names)

# From https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0144296:
# http://kt.ijs.si/data/Emoji_sentiment_ranking/
# Other: https://research.utwente.nl/files/5482763/sac13-senticon.pdf
# http://emojitracker.com/
def emoticons():
    filename = os.path.join(os.getcwd(), "datasets", "emoticons.csv")
    emoticons = {
        "pos": [":)", ":))", ":D", ":DD", "xD", "xDD", ":d" "=)", "=))", ":')", "=')", ":}", ":}}", ":]", ":]]", "(:", "C:", ":P", ":>", "<:"],
        "neu": [":|", ":/", ":\\", ""],
        "neg": [":(", ":((", ":(((", ":((((", ":C", ":CC", ":'C", "=(", "=((", ":'(", "='(", ":[", ":{", "):", ":<", ">:"]
    }
    with open(filename) as f:
        for line in f:
            word = line.split(",")
            smil = word[0]
            #unicodesmil = chr(int(word[1], 0))
            neg = float(word[3])
            neu = float(word[4])
            pos = float(word[5])
            if neu > pos and neu > neg:
                emoticons["neu"].append(smil)
            elif pos > neg:
                emoticons["pos"].append(smil)
                #emoticons["pos"].append(unicodesmil)
            else:
                emoticons["neg"].append(smil)
    return emoticons

# Implement our own tokenizer compatible with sklearn; we want to be able to define stopwords and vocabulary

class LemmaTokenizer:
    ignoretokens = []
    stopwords = []
    vocab = []
    lcase = True
    def __init__(self, stopwords=[], vocabulary=[], ignoretokens=[], lcase=True):
        self.wnl = WordNetLemmatizer()
        self.stopwords = stopwords
        self.vocab = vocabulary
        self.ignoretokens = ignoretokens
        self.lcase = lcase
    def __call__(self, document):
        lemmas = []
        tokenized_words = document.split(" ")
        tokenized_words = list(filter(lambda x: x != "", tokenized_words))
        for word, tag in pos_tag(tokenized_words):
            lower_cased_tag = tag[0].lower()
            stripped_word = word.strip(''.join(self.ignoretokens))
            if (word != stripped_word):
                #print(f"{word} -> {stripped_word}")
                pass
            wn_tag = lower_cased_tag if lower_cased_tag in ['a', 'r', 'n', 'v'] else None
            if not wn_tag:
                lemma = stripped_word
            else:
                lemma = self.wnl.lemmatize(stripped_word, wn_tag)
            if lemma not in list(self.stopwords): # and word in self.vocab:
                lemmas.append(lemma.lower() if self.lcase else lemma)
        return lemmas

In [2]:
pickle_file_name = "data-16k-dec-3"
#pickle_file_name = "comments"
pickle_df = load_pickle(f"{pickle_file_name}.p")
print(f"This pickle has {len(pickle_df)} comments")
pickle_df = pickle_df[pickle_df["body"] != "[removed]"]
pickle_df = pickle_df[pickle_df["body"] != "[deleted]"]
pickle_df = pickle_df[pickle_df["body"] != ""].reset_index()

This pickle has 16840 comments


In [3]:
df = pickle_df.copy()

swords = stopwords()
bwords = badwords()
smil = emoticons()
cwords = commonwords()
itokens = ignoretokens()
nam = names()

#display(df.get("body")[507])
#display(df.get("body")[508])
#display(df.get("body")[509])
#display(df.get("body")[510])
#display(df.get("body")[511])

s_length = len(df.index)
submissions = set(df.get("submission"))
print(f"{s_length} comments in {len(submissions)} submissions")

# All vocabs
cva = CountVectorizer(lowercase=True, tokenizer=LemmaTokenizer())
asdf = cva.fit_transform(df.get("body")[508:])
vocabs = cva.get_feature_names()
print("Created vocabs")

# Filtered vocabs
cvf = CountVectorizer(lowercase=True, tokenizer=LemmaTokenizer(stopwords=list(swords)+list(bwords)))
asdf = cvf.fit_transform(df.get("body"))
filtered_vocabs = cvf.get_feature_names()
print("Created filtered vocabs")

# Significant words count
cv = CountVectorizer(lowercase=True, tokenizer=LemmaTokenizer(stopwords=list(swords)+list(bwords), ignoretokens=itokens))
wc_data = cv.fit_transform(df.get("body"))
wc_df = pd.DataFrame(wc_data.sum(axis=1))
wc_df.columns = ["wc"]
print("Feature: Word count [DONE]")

# Bad words count
cv = CountVectorizer(vocabulary=bwords, stop_words=None, lowercase=True, tokenizer=LemmaTokenizer(), ngram_range=(1,2)) # finds "old bag"
bw_data = cv.fit_transform(df.get("body"))
bw_df = pd.DataFrame(bw_data.sum(axis=1))
bw_df.columns = ["bw"]
print("Feature: Bad words [DONE]")

# Stop words count
cv = CountVectorizer(vocabulary=swords, stop_words=None, lowercase=True, tokenizer=LemmaTokenizer())
sw_data = cv.fit_transform(df.get("body"))
sw_df = pd.DataFrame(sw_data.sum(axis=1))
sw_df.columns = ["sw"]
print("Feature: Stop words [DONE]")

# Names count
cv = CountVectorizer(vocabulary=nam, stop_words=None, lowercase=False, tokenizer=LemmaTokenizer(lcase=False))
nam_data = cv.fit_transform(df.get("body"))
nam_df = pd.DataFrame(nam_data.sum(axis=1))
nam_df.columns = ["nam"]
print("Feature: Names [DONE]")

# Positive smilies
cv = CountVectorizer(vocabulary=smil["pos"], analyzer="char", ngram_range=(1,2), stop_words=None, lowercase=False) # char + 2 ngram = ":D"
smilp_data = cv.fit_transform(df.get("body"))
smilp_df = pd.DataFrame(smilp_data.sum(axis=1))
smilp_df.columns = ["smil+"]
print("Feature: positive smiles [DONE]")

# Negative smilies count
cv = CountVectorizer(vocabulary=smil["neg"], analyzer="char", ngram_range=(1,2), stop_words=None, lowercase=False) # char + 2 ngram = ":D"
sniln_data = cv.fit_transform(df.get("body"))
smiln_df = pd.DataFrame(sniln_data.sum(axis=1))
smiln_df.columns = ["smil-"]
print("Feature: Negative smiles [DONE]")

# Neutral smilies count
cv = CountVectorizer(vocabulary=smil["neu"], analyzer="char", ngram_range=(1,2), stop_words=None, lowercase=False) # char + 2 ngram = ":D"
smile_data = cv.fit_transform(df.get("body"))
smile_df = pd.DataFrame(smile_data.sum(axis=1))
smile_df.columns = ["smil&"]
print("Feature: Neutral smiles [DONE]")


# TF-IDF cosine similarity toawrd topic
df["top-cos-sim"] = pd.Series(np.zeros(s_length), index=df.index)
for submission in submissions:
    sub_df = df[df.submission == submission][["body"]]
    submission_with_comments = [submission] + list(sub_df.get("body").array)
    tfidfv = TfidfVectorizer(vocabulary=filtered_vocabs, lowercase=True, ngram_range=(1,1), smooth_idf=True, tokenizer=LemmaTokenizer(stopwords=list(swords)))
    tfidf_data = tfidfv.fit_transform(submission_with_comments)
    cosine_similarities = cosine_similarity(tfidf_data[0:1], tfidf_data[1:]).flatten()
    top_simi_df = pd.DataFrame(cosine_similarities, index=sub_df.index)
    top_simi_df.columns = ["top-cos-sim"]
    sub_df = pd.concat([sub_df, top_simi_df], axis=1)
    df.update(sub_df)
print("Feature: Cosine similarity with topic [DONE]")


# TF-IDF cosine similarity towards all documents in a submission
df["cos-sim"] = pd.Series(np.zeros(s_length), index=df.index)
for submission in submissions:
    sub_df = df[df.submission == submission][["body"]]
    submission_with_comments = [" ".join(vocabs)] + list(sub_df.get("body").array)
    tfidfv = TfidfVectorizer(vocabulary=filtered_vocabs, lowercase=True, ngram_range=(1,1), smooth_idf=True, tokenizer=LemmaTokenizer(stopwords=list(swords)))
    tfidf_data = tfidfv.fit_transform(submission_with_comments)
    cosine_similarities = cosine_similarity(tfidf_data[0:1], tfidf_data[1:]).flatten()
    all_simi_df = pd.DataFrame(cosine_similarities, index=sub_df.index)
    all_simi_df.columns = ["cos-sim"]
    sub_df = pd.concat([sub_df, all_simi_df], axis=1)
    df.update(sub_df)
print("Feature: Cosine similarity with rest [DONE]")


# TF-IDF mean value (checks across all documents in a submission)
df["tfidf-mean"] = pd.Series(np.zeros(s_length), index=df.index)
for submission in submissions:
    sub_df = df[df.submission == submission][["body"]]
    tfidfv = TfidfVectorizer(vocabulary=vocabs, lowercase=True, ngram_range=(1,1), smooth_idf=True, tokenizer=LemmaTokenizer(stopwords=list(swords)))
    tfidf_data = tfidfv.fit_transform(sub_df.get("body")).todense()
    means = [0]*tfidf_data.shape[0]
    for i in range(0, tfidf_data.shape[0]):
        word_count = wc_df.get("wc")[i] + bw_df.get("bw")[i] + sw_df.get("sw")[i]
        means[i] = tfidf_data[i].sum()/word_count
    tfidf_df = pd.DataFrame(means, index=sub_df.index)
    tfidf_df.columns = ["tfidf-mean"]
    sub_df = pd.concat([sub_df, tfidf_df], axis=1)
    df.update(sub_df)
print("Feature: Mean TF-IDF [DONE]")


# Has link
df['lnk'] = [1 if "http" in row[['body']].to_string() else 0 for i,row in df.iterrows()]
print("Feature: Link [DONE]")

# Update the the rest
df = pd.concat([df, wc_df, sw_df, bw_df, nam_df, smilp_df, smiln_df, smile_df], axis=1)
print("Updated word counts")

filename = "other-features"
save_pickle(df, f"{pickle_file_name}-{filename}.p")
print(f"Pickle for {filename} saved.")

df



/home/halpdesk/CODE/reddit-parser/datasets/ranksnl_large.csv
16840 comments in 1233 submissions
Created vocabs
Created filtered vocabs
Feature: Word count [DONE]
Feature: Bad words [DONE]
Feature: Stop words [DONE]
Feature: Names [DONE]
Feature: positive smiles [DONE]
Feature: Negative smiles [DONE]
Feature: Neutral smiles [DONE]
Feature: Cosine similarity with topic [DONE]
Feature: Cosine similarity with rest [DONE]
Feature: Mean TF-IDF [DONE]
Feature: Link [DONE]
Updated word counts
Pickle for other-features saved.


Unnamed: 0,index,body,subreddit,submission,label,top-cos-sim,cos-sim,tfidf-mean,lnk,wc,sw,bw,nam,smil+,smil-,smil&
0,0,*What if Jod was one of us?*,askreddit,"What if God came down one day and said ""It's p...",429,0.000000,0.001759,0.200000,0,1,4,0,0,0,0,0
1,1,A large chunk of my taking the lord's name in ...,askreddit,"What if God came down one day and said ""It's p...",41259,0.000000,0.005759,0.153846,0,4,9,0,0,0,0,0
2,2,Do you want a Holy War? Because that's how you...,askreddit,"What if God came down one day and said ""It's p...",26069,0.000000,0.005984,0.136651,0,5,9,0,2,0,0,0
3,3,"He would also say, “my name isn’t God” then he...",askreddit,"What if God came down one day and said ""It's p...",6449,0.076923,0.007537,0.151364,0,5,11,0,1,0,0,0
4,4,"We’d be like, “No way!” and he’d be like, “Yah...",askreddit,"What if God came down one day and said ""It's p...",20046,0.111522,0.007880,0.224878,0,6,6,0,0,0,0,0
5,5,I would be converted\n\n\nEdit: thanks for the...,askreddit,"What if God came down one day and said ""It's p...",6577,0.000000,0.003105,0.128565,0,4,7,0,0,0,0,0
6,6,What if Satan came down one day and said ''It'...,askreddit,"What if God came down one day and said ""It's p...",5263,0.501827,0.004321,0.140944,0,6,8,0,0,0,0,0
7,7,Then all of Scandinavia will pronounce it like...,askreddit,"What if God came down one day and said ""It's p...",15799,0.136062,0.005027,0.112265,0,8,9,0,1,0,0,0
8,8,You'd have different religious factions arguin...,askreddit,"What if God came down one day and said ""It's p...",15158,0.000000,0.012257,0.163299,0,7,8,0,0,0,0,0
9,9,Does that mean Jesus would be pronounced Geezus?,askreddit,"What if God came down one day and said ""It's p...",22978,0.142536,0.002640,0.174018,0,3,5,0,1,0,0,0
