In [1]:
import re
from collections import Counter
from tqdm.auto import tqdm
from tqdm.notebook import trange, tqdm

In [2]:
tqdm.pandas()

In [121]:
# from https://gist.github.com/whdc/a67e7447e72df94d5fa7851a88124b73
from dataclasses import dataclass
import pandas as pd
import numpy as np

wordcount = pd.read_csv('./data/unigram_freq.csv.zip').set_index('word')['count']

denom = np.log(wordcount.sum())

capboost = 1.

@dataclass
class State:
    score: float
    parse: list[str]

def tokenize(s):
    best = [State(0., [])]

    for j in range(1, len(s)+1):
        best_j = State(-float('inf'), None)
        for i in range(j):
            # Use s[i:j] as a word?

            boost = capboost if s[i].isupper() else 0.
            word = s[i:j].lower()

            if word in wordcount.index:
                newscore = best[i].score + np.log(wordcount.loc[word]) - denom + boost
                if newscore > best_j.score:
                    best_j = State(newscore, best[i].parse + [word])

            if j - i == 1:
                newscore = best[i].score - denom + boost
                if newscore > best_j.score:
                    best_j = State(newscore, best[i].parse + [word])

        best.append(best_j)

    return best[-1].parse

print(tokenize('natureismetal'))
# ['nature', 'is', 'metal']

print(tokenize('penisland'))
# ['penis', 'land']

print(tokenize('penIsland'))
# ['pen', 'island']

['nature', 'is', 'metal']
['penis', 'land']
['pen', 'island']


## Load and parse data

In [122]:
core_1k_df = pd.read_csv("./output/df_070723_1k_with_images_3.csv")

In [123]:
core_1k_df.head(5)

Unnamed: 0.1,Unnamed: 0,name,desc,page_id,num_members,page_nr,date_retrieved,age_num,age_word,created_dt,human_num_members,subreddit_url,image_url,flourish_img_html,rank_str
0,0,r/Home,"Everything home related: interior design, home...",5_2qs0k,135237,0,2023-07-07,14,years,2009,135.2 thousand,https://www.reddit.com/r/Home,,,0
1,1,r/AskReddit,r/AskReddit is the place to ask and answer tho...,5_2qh1i,41833971,0,2023-07-07,15,years,2008,41.8 million,https://www.reddit.com/r/AskReddit,,,1
2,2,r/mildlyinfuriating,jugkfmghgug,5_2ubgg,5961250,0,2023-07-07,11,years,2012,6.0 million,https://www.reddit.com/r/mildlyinfuriating,https://styles.redditmedia.com/t5_2ubgg/styles...,"<img src=""https://styles.redditmedia.com/t5_2u...",2
3,3,r/facepalm,/r/facepalm has gone private in protest of the...,5_2r5rp,7469361,0,2023-07-07,13,years,2010,7.5 million,https://www.reddit.com/r/facepalm,https://styles.redditmedia.com/t5_2r5rp/styles...,"<img src=""https://styles.redditmedia.com/t5_2r...",3
4,4,r/diablo4,Welcome to the un official Diablo 4 subreddit!...,5_2rzx9,746468,0,2023-07-07,12,years,2011,746.5 thousand,https://www.reddit.com/r/diablo4,https://styles.redditmedia.com/t5_2rzx9/styles...,"<img src=""https://styles.redditmedia.com/t5_2r...",4


In [124]:
core_1k_df["stripped_name"] = core_1k_df["name"].apply(lambda x: "'" + x[2:] + "'")
# this is for the page's search box
", ".join(core_1k_df["stripped_name"])

"'Home', 'AskReddit', 'mildlyinfuriating', 'facepalm', 'diablo4', 'therewasanattempt', 'AmItheAsshole', 'Damnthatsinteresting', 'worldnews', 'WhitePeopleTwitter', 'antiwork', 'LivestreamFail', 'NoStupidQuestions', 'gaming', 'leagueoflegends', 'funny', 'AITAH', 'nba', 'pics', 'news', 'doordash', 'PublicFreakout', 'movies', 'UkraineWarVideoReport', 'explainlikeimfive', 'politics', 'pcmasterrace', 'Piracy', 'nextfuckinglevel', 'todayilearned', 'Minecraft', 'soccer', 'Genshin_Impact_Leaks', 'CrazyFuckingVideos', 'Genshin_Impact', 'memes', 'meirl', 'TikTokCringe', 'ukraine', 'Unexpected', 'ChatGPT', 'CombatFootage', 'LifeProTips', 'WTF', 'unpopularopinion', 'buildapc', 'tifu', 'personalfinance', 'shittytattoos', 'relationship_advice', 'Serverlife', 'BestofRedditorUpdates', 'OnePiece', 'anime', 'TwoXChromosomes', 'HonkaiStarRail', 'ffxiv', '2007scape', 'wow', 'wallstreetbets', 'Tinder', 'OldSchoolCool', 'doordash_drivers', 'MadeMeSmile', 'Steam', 'tumblr', 'pcgaming', 'StupidFood', 'legaladv

In [125]:
core_1k_df["tokens"] = core_1k_df["name"].progress_apply(lambda x: tokenize(x))

  0%|          | 0/1000 [00:00<?, ?it/s]

## Count word frequency

In [128]:
def add_to_counter(tokens_list, base_counter):
    new_c = Counter(tokens_list)
    base_counter += new_c

In [129]:
main_counter = Counter()
core_1k_df.progress_apply(lambda x: add_to_counter(x["tokens"], main_counter), axis=1)

  0%|          | 0/1000 [00:00<?, ?it/s]

0      None
1      None
2      None
3      None
4      None
       ... 
995    None
996    None
997    None
998    None
999    None
Length: 1000, dtype: object

In [130]:
main_counter.most_common(20)

[('r', 1000),
 ('/', 1000),
 ('the', 38),
 ('_', 28),
 ('ask', 22),
 ('i', 18),
 ('memes', 16),
 ('of', 15),
 ('advice', 15),
 ('2', 15),
 ('4', 14),
 ('game', 12),
 ('3', 12),
 ('a', 11),
 ('gaming', 10),
 ('to', 10),
 ('0', 9),
 ('no', 8),
 ('irl', 8),
 ('anime', 8)]

## Remove stopwords

In [131]:
import nltk
from nltk.corpus import stopwords

In [132]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sandrews/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [133]:
stop_words = stopwords.words("english")
# print(stop_words)

In [134]:
stop_words_count = {}
core_words_count = {}
for word in main_counter.keys():
    if word in stop_words:
        stop_words_count[word] = main_counter[word]
    else:
        core_words_count[word] = main_counter[word]

In [135]:
stop_words_count

{'there': 1,
 'was': 1,
 'an': 5,
 'am': 7,
 'i': 18,
 'the': 38,
 'no': 8,
 'of': 15,
 'a': 11,
 'it': 6,
 'me': 6,
 'to': 10,
 'or': 3,
 'up': 1,
 'what': 5,
 'be': 2,
 'off': 2,
 'my': 5,
 'by': 4,
 'too': 1,
 'out': 1,
 'at': 2,
 'is': 5,
 'and': 3,
 'over': 4,
 'from': 2,
 'this': 5,
 'in': 5,
 'as': 4,
 'd': 4,
 'just': 3,
 'into': 1,
 'under': 2,
 'when': 1,
 'that': 1,
 'its': 1,
 's': 3,
 'your': 1,
 'being': 1,
 'for': 3,
 'not': 2,
 'how': 1,
 't': 2,
 'you': 3,
 'should': 3,
 'who': 1,
 'more': 2,
 'all': 1,
 'on': 1,
 'do': 1,
 'why': 1,
 'y': 1,
 'we': 1,
 'are': 1}

In [136]:
sorted_core_words_count = sorted(core_words_count.items(), key=lambda x:x[1], reverse=True)
sorted_core_words_count = [tup for tup in sorted_core_words_count if not (re.search(r"\d", tup[0]) or tup[0] in ("r", "/", "_"))]
sorted_core_words_count

[('ask', 22),
 ('memes', 16),
 ('advice', 15),
 ('game', 12),
 ('gaming', 10),
 ('irl', 8),
 ('anime', 8),
 ('home', 6),
 ('war', 6),
 ('marvel', 6),
 ('competitive', 6),
 ('world', 5),
 ('pc', 5),
 ('k', 5),
 ('life', 5),
 ('finance', 5),
 ('uk', 5),
 ('games', 5),
 ('circle', 5),
 ('pokemon', 5),
 ('girls', 5),
 ('reddit', 4),
 ('news', 4),
 ('people', 4),
 ('twitter', 4),
 ('questions', 4),
 ('legends', 4),
 ('nba', 4),
 ('ukraine', 4),
 ('gen', 4),
 ('cringe', 4),
 ('tips', 4),
 ('one', 4),
 ('two', 4),
 ('kai', 4),
 ('star', 4),
 ('school', 4),
 ('food', 4),
 ('legal', 4),
 ('new', 4),
 ('canada', 4),
 ('go', 4),
 ('watch', 4),
 ('dank', 4),
 ('porn', 4),
 ('sky', 4),
 ('career', 4),
 ('shit', 4),
 ('character', 4),
 ('science', 4),
 ('jerk', 4),
 ('meme', 4),
 ('buy', 4),
 ('linux', 4),
 ('ok', 4),
 ('buddy', 4),
 ('interesting', 3),
 ('work', 3),
 ('stupid', 3),
 ('league', 3),
 ('pics', 3),
 ('report', 3),
 ('master', 3),
 ('race', 3),
 ('fucking', 3),
 ('shin', 3),
 ('leaks', 

In [137]:
with open("./output/most_common_core_words_over_3.csv", "w") as scwcfile:
    scwcfile.write("word, freq\n")
    for x, y in sorted_core_words_count:
        scwcfile.write(f"{x},{y}\n")

## Flourish chart (partial screenshot), created with the data output from above

So many "gam*"-related words.  I guess lemmatization is really going to help here.

<!-- ![Screenshot%202023-07-11%20at%208.52.28%20AM.png](attachment:Screenshot%202023-07-11%20at%208.52.28%20AM.png)
 -->
<img src="./flourish_bar_no_lemmas.png" data-canonical-src="./flourish_bar_no_lemmas.png" width="75%" />

## Lemmatize/Stem then count

### First stemming for the non-vectorizer method above^

In [138]:
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
print(porter_stemmer.stem("fishes"))

fish


Using data from this section => [Count Word Frequency](#Count-word-frequency)<br>
change all tokens to lemmas, and repeat prior steps for lemmatized tokens

In [139]:
def stem_all_tokens_from_list(tokens_list):
    return [porter_stemmer.stem(token) for token in tokens_list]

In [140]:
core_1k_df["token_stems"] = core_1k_df["tokens"].progress_apply(lambda x: stem_all_tokens_from_list(x))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [157]:
core_1k_df[core_1k_df["name"].str.contains("ask", case=False)][["name","token_stems"]]

Unnamed: 0,name,token_stems
1,r/AskReddit,"[r, /, ask, reddit]"
77,r/ask,"[r, /, ask]"
89,r/AskMen,"[r, /, ask, men]"
124,r/TooAfraidToAsk,"[r, /, too, afraid, to, ask]"
144,r/AskUK,"[r, /, ask, uk]"
321,r/AskNYC,"[r, /, ask, nyc]"
385,r/askscience,"[r, /, ask, scienc]"
426,r/askTO,"[r, /, ask, to]"
466,r/AskMechanics,"[r, /, ask, mechan]"
467,r/AskAnAustralian,"[r, /, ask, an, australian]"


In [144]:
main_stems_counter = Counter()
core_1k_df.progress_apply(lambda x: add_to_counter(x["token_stems"], main_stems_counter), axis=1)

stop_words_count = {}
core_stems_count = {}
for word in main_stems_counter.keys():
    if word in stop_words:
        stop_words_count[word.lower()] = main_stems_counter[word]
    else:
        core_stems_count[word.lower()] = main_stems_counter[word]
        
print(core_stems_count)

  0%|          | 0/1000 [00:00<?, ?it/s]

{'r': 1000, '/': 1000, 'home': 6, 'ask': 22, 'reddit': 4, 'mildli': 1, 'infuri': 1, 'face': 3, 'palm': 1, 'diablo': 2, '4': 14, 'wa': 2, 'attempt': 1, 'asshol': 2, 'damn': 1, 'interest': 3, 'world': 5, 'news': 4, 'white': 1, 'peopl': 4, 'twitter': 4, 'anti': 1, 'work': 3, 'live': 3, 'stream': 1, 'fail': 1, 'stupid': 3, 'question': 4, 'game': 27, 'leagu': 3, 'legend': 4, 'funni': 2, 'ah': 1, 'nba': 4, 'pic': 3, 'door': 2, 'dash': 2, 'public': 1, 'freakout': 1, 'movi': 3, 'ukrain': 4, 'war': 6, 'video': 4, 'report': 3, 'explain': 3, 'like': 1, 'im': 2, 'five': 2, 'polit': 5, 'pc': 5, 'master': 3, 'race': 3, 'piraci': 2, 'next': 2, 'fuck': 6, 'level': 1, 'today': 1, 'learn': 7, 'mine': 1, 'craft': 1, 'soccer': 2, 'gen': 4, 'shin': 3, '_': 28, 'impact': 2, 'leak': 3, 'crazi': 1, 'meme': 20, 'irl': 8, 'tik': 2, 'k': 5, 'cring': 4, 'unexpect': 1, 'chat': 2, 'gpt': 1, 'combat': 1, 'footag': 1, 'life': 5, 'pro': 2, 'tip': 5, 'wtf': 1, 'unpopular': 2, 'opinion': 2, 'build': 4, 'apc': 2, 'tif': 

In [159]:
sorted_core_stems_count = sorted(core_stems_count.items(), key=lambda x:x[1], reverse=True)
sorted_core_stems_count = [tup for tup in sorted_core_stems_count if not (re.search(r"\d", tup[0]) or tup[0] in ("r", "/", "_"))]
sorted_core_stems_count

with open("./output/most_common_core_stems_over_3.csv", "w") as scwcfile:
    scwcfile.write("word, freq, category\n")
    for x, y in sorted_core_stems_count:
        if int(y) >= 5:
            z = "most common"
        elif int(y) > 1:
            z = "less common"
        else:
            z = "just once"
        scwcfile.write(f"{x},{y},{z}\n")

In [147]:
# ok that didn't really work as expected... trying a lemmatizer

In [148]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [149]:
def lemmatize_all_tokens_from_list(tokens_list):
    return [lemmatizer.lemmatize(token) for token in tokens_list]

In [150]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sandrews/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [151]:
core_1k_df["token_lemmas_wordnet"] = core_1k_df["tokens"].progress_apply(lambda x: lemmatize_all_tokens_from_list(x))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [152]:
core_1k_df[["token_lemmas_wordnet"]]

Unnamed: 0,token_lemmas_wordnet
0,"[r, /, home]"
1,"[r, /, ask, reddit]"
2,"[r, /, mildly, infuriating]"
3,"[r, /, face, palm]"
4,"[r, /, diablo, 4]"
...,...
995,"[r, /, beyond, the, bump]"
996,"[r, /, structural, engineering]"
997,"[r, /, boxing]"
998,"[r, /, hy, pixel, sky, block]"


In [153]:
main_lemmas_counter = Counter()
core_1k_df.progress_apply(lambda x: add_to_counter(x["token_lemmas_wordnet"], main_lemmas_counter), axis=1)

stop_words_count = {}
core_lemmas_count = {}
for word in main_lemmas_counter.keys():
    if word in stop_words:
        stop_words_count[word.lower()] = main_lemmas_counter[word]
    else:
        core_lemmas_count[word.lower()] = main_lemmas_counter[word]
        
print(core_lemmas_count)

  0%|          | 0/1000 [00:00<?, ?it/s]

{'r': 1001, '/': 1000, 'home': 6, 'ask': 22, 'reddit': 4, 'mildly': 1, 'infuriating': 1, 'face': 3, 'palm': 1, 'diablo': 2, '4': 14, 'wa': 2, 'attempt': 1, 'asshole': 2, 'damn': 1, 'thats': 2, 'interesting': 3, 'world': 5, 'news': 4, 'white': 1, 'people': 4, 'twitter': 4, 'anti': 1, 'work': 3, 'live': 2, 'stream': 1, 'fail': 1, 'stupid': 3, 'question': 4, 'gaming': 10, 'league': 3, 'legend': 4, 'funny': 2, 'ah': 1, 'nba': 4, 'pic': 3, 'door': 2, 'dash': 2, 'public': 1, 'freakout': 1, 'movie': 3, 'ukraine': 4, 'war': 6, 'video': 4, 'report': 3, 'explain': 2, 'like': 1, 'im': 2, 'five': 2, 'politics': 2, 'pc': 5, 'master': 3, 'race': 3, 'piracy': 2, 'next': 2, 'fucking': 3, 'level': 1, 'today': 1, 'learned': 1, 'mine': 1, 'craft': 1, 'soccer': 2, 'gen': 4, 'shin': 3, '_': 28, 'impact': 2, 'leak': 3, 'crazy': 1, 'meme': 20, 'irl': 8, 'tik': 2, 'k': 5, 'cringe': 4, 'unexpected': 1, 'chat': 2, 'gpt': 1, 'combat': 1, 'footage': 1, 'life': 5, 'pro': 2, 'tip': 5, 'wtf': 1, 'unpopular': 2, 'opi

In [200]:
sorted_core_lemmas_count = sorted(core_lemmas_count.items(), key=lambda x:x[1], reverse=True)
sorted_core_lemmas_count = [tup for tup in sorted_core_lemmas_count if not (re.search(r"\d", tup[0]) or tup[0] in ("r", "/", "_"))]
sorted_core_lemmas_count

less_common_dict, just_once_dict = {}, {}
with open("./output/most_common_core_lemmas.csv", "w") as scwcfile:
    scwcfile.write("word,freq\n")        
    for x, y in sorted(sorted_core_lemmas_count, key=lambda x: (x[1], x), reverse=True):
        if int(y) >= 5:
            scwcfile.write(f"{x},{y}\n")
        elif int(y) > 1:
            less_common_dict[x] = int(y)
        else:
            just_once_dict[x] = int(y)
    print(len(less_common_dict))
    print(len(just_once_dict))

with open("./output/less_common_core_lemmas_alphasorted.csv", "w") as less_common_lemmas_file:
    less_common_lemmas_file.write("less common (less than 5, greater than 2)\n")    
#     for k, v in sorted(less_common_dict.items(), key=lambda x:x[1], reverse=True): # valuesorted
    for k, v in sorted(less_common_dict.items()): # alphasorted
        less_common_lemmas_file.write(f"{k} ({v}), ")
    less_common_lemmas_file.write("\n")
    less_common_lemmas_file.write("just one\n")
    for k, v in sorted(just_once_dict.items()):
        less_common_lemmas_file.write(f"{k}, ")

271
886


In [201]:
", ".join([f"{k} ({v})" for k, v in sorted(sorted_core_lemmas_count, key=lambda x: (x[1], x), reverse=True)])   

'ask (22), meme (20), game (17), advice (15), gaming (10), irl (8), anime (8), war (6), marvel (6), home (6), girl (6), competitive (6), world (5), watch (5), uk (5), tip (5), star (5), pokemon (5), pc (5), life (5), k (5), finance (5), circle (5), career (5), car (5), video (4), ukraine (4), u (4), two (4), twitter (4), studio (4), sky (4), shit (4), science (4), school (4), reddit (4), question (4), porn (4), people (4), p (4), one (4), ok (4), news (4), new (4), nba (4), mod (4), linux (4), legend (4), legal (4), kai (4), joke (4), jerk (4), go (4), gen (4), food (4), dank (4), cringe (4), character (4), canada (4), buy (4), buddy (4), xbox (3), wow (3), work (3), window (3), tv (3), true (3), thought (3), tattoo (3), switch (3), stupid (3), shitty (3), shin (3), self (3), satisfying (3), sale (3), rim (3), report (3), red (3), reality (3), race (3), political (3), play (3), pirate (3), piece (3), pic (3), personal (3), online (3), oddly (3), n (3), movie (3), maybe (3), master (3),

### Now lemmatizing for the CountVectorizer implementation

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
def convert_list_into_string_with_spaces(l):
    # first two elements are r and /
    return " ".join(l[2:])

In [26]:
core_1k_df["joined_tokens"] = core_1k_df["tokens"].progress_apply(lambda x: convert_list_into_string_with_spaces(x))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [27]:
core_1k_df["joined_tokens"][:3]

0                  home
1            ask reddit
2    mildly infuriating
Name: joined_tokens, dtype: object

In [28]:
count_vectorizer = CountVectorizer(stop_words="english")

In [29]:
X = count_vectorizer.fit_transform(core_1k_df["joined_tokens"])

In [30]:
count_vectorizer.get_feature_names_out()[:10]

array(['abrupt', 'absolute', 'abu', 'academia', 'academy', 'accounting',
       'action', 'actual', 'add', 'addiction'], dtype=object)

In [31]:
pd.DataFrame(X.toarray(), columns=count_vectorizer.get_feature_names_out())

Unnamed: 0,abrupt,absolute,abu,academia,academy,accounting,action,actual,add,addiction,...,yeah,youtube,yugioh,yuzu,zealand,zelda,zombies,zombo,zone,zoro
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
from nltk.stem.porter import PorterStemmer

In [33]:
porter_stemmer = PorterStemmer()
print(porter_stemmer.stem("fishes"))

fish


In [34]:
def stemming_tokenizer(str_input):
    words = str_input.lower().split()
    return [porter_stemmer.stem(word) for word in words]

In [35]:
stem_count_vectorizer = CountVectorizer(stop_words='english', tokenizer=stemming_tokenizer)

In [36]:
X = stem_count_vectorizer.fit_transform(core_1k_df["joined_tokens"])



In [37]:
stem_count_vectorizer.get_feature_names_out()[:20]

array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '_', 'abrupt',
       'absolut', 'abu', 'academi', 'academia', 'account', 'action',
       'actual', 'add'], dtype=object)

In [38]:
stem_count_df = pd.DataFrame(X.toarray(), columns=stem_count_vectorizer.get_feature_names_out())

In [39]:
sums = {}
for col in stem_count_df.columns:
    sums[col] = stem_count_df[col].sum()

In [40]:
sums_counter = Counter(sums)

In [41]:
sums_counter.most_common(200)

[('_', 28),
 ('game', 27),
 ('ask', 22),
 ('meme', 20),
 ('2', 15),
 ('advic', 15),
 ('4', 14),
 ('3', 12),
 ('anim', 10),
 ('0', 9),
 ('irl', 8),
 ('learn', 7),
 ('1', 6),
 ('competit', 6),
 ('fuck', 6),
 ('girl', 6),
 ('home', 6),
 ('marvel', 6),
 ('war', 6),
 ('car', 5),
 ('career', 5),
 ('circl', 5),
 ('financ', 5),
 ('k', 5),
 ('life', 5),
 ('pc', 5),
 ('person', 5),
 ('pokemon', 5),
 ('polit', 5),
 ('star', 5),
 ('thi', 5),
 ('tip', 5),
 ('uk', 5),
 ('watch', 5),
 ('world', 5),
 ('6', 4),
 ('7', 4),
 ('app', 4),
 ('buddi', 4),
 ('build', 4),
 ('buy', 4),
 ('canada', 4),
 ('charact', 4),
 ('cring', 4),
 ('d', 4),
 ('dank', 4),
 ('engin', 4),
 ('food', 4),
 ('gen', 4),
 ('jerk', 4),
 ('joke', 4),
 ('kai', 4),
 ('legal', 4),
 ('legend', 4),
 ('linux', 4),
 ('mod', 4),
 ('nba', 4),
 ('new', 4),
 ('news', 4),
 ('ok', 4),
 ('peopl', 4),
 ('pirat', 4),
 ('porn', 4),
 ('question', 4),
 ('reddit', 4),
 ('school', 4),
 ('scienc', 4),
 ('shit', 4),
 ('sky', 4),
 ('studio', 4),
 ('twitter', 

## Determine categories per subreddit

Subreddits were grouped together into categories with corresponding regex patterns using word frequencies (output above). 
The following regexes categorizes a hundred subreddit names successfully.

When run on a thousand or more, there are many that were assigned the "other" category.
A future task is to further refine the regex patterns to move "other" strings into more meaningful categories.
I also attempted to do topic modeling and classification on names and descriptions but had dubious results.
Will investigate^ further later, as writing regex patterns was a painful process. 

In [43]:
# create new df, dropping a random column that was not supposed to be there
# TODO: double-check what that column was!
new_1k_df = core_1k_df.drop(core_1k_df.columns[0], axis=1)
new_1k_df.head(2)

Unnamed: 0,name,desc,page_id,num_members,page_nr,date_retrieved,age_num,age_word,created_dt,human_num_members,subreddit_url,image_url,flourish_img_html,rank_str,stripped_name,tokens,joined_tokens
0,r/Home,"Everything home related: interior design, home...",5_2qs0k,135237,0,2023-07-07,14,years,2009,135.2 thousand,https://www.reddit.com/r/Home,,,0,'Home',"[r, /, home]",home
1,r/AskReddit,r/AskReddit is the place to ask and answer tho...,5_2qh1i,41833971,0,2023-07-07,15,years,2008,41.8 million,https://www.reddit.com/r/AskReddit,,,1,'AskReddit',"[r, /, ask, reddit]",ask reddit


In [92]:
regexes = {
    #0
    "advice and opinions": r"ask|question|thoughts|tip|suggestion|advice|opinion|help|asshole|aita|aitah|amiugly|trueoffmychest|roastme|offmychest|whatisthisthing|truerateme|whatisthisbug|amithedevil|confessions|amiwrong|confession|whatisthiscar|iamatotalpieceofshit|whatcarshouldibuy|whatsthisplant",
    
    #1
    "dank gifs and memes": r"meme|cringe|humor|funny|joke|smile|gif|dank|holup|aww|cursedcomments|clevercomebacks|oddlyspecific",
   
    #2
    "fails": r"facepalm|fail|wtf|tifu|shittytattoos|shitposting|therewasanattempt|whatcouldgowrong|awfuleverything|wellthatsucks",
    
    #3
    "mood": r"infuriating|interesting|unexpected|holdup|terrifying|fulfilling|satisfying|publicfreakout|pettyrevenge|beamazed|thatsinsane|flying",
    
    #4
    "science and growth": r"grow|learn|engineering|explain|improve|scien|financ|career|build|data|work|weird|jobs|legal|premed|realestate|stocks|entrepreneur|biology|mycology|accounting|diy|investing|trees|atheism|linkedinlunatics|creditcards|aviation|mcat|college|construction|amcstock|natureisfuckinglit|space|resumes|smallbusiness|unresolvedmysteries|futurology|autism|studentloans|productivity|insects|electrical|polls|economics|nursing|aliens|diwhy",
    
    #5
    "tech and software": r"tech|linux|pc|program|mac|rust|android|app|sysadmin|data|chatgpt|wallstreetbets|stablediffusion|hololive|3dprinting|webdev|chatgpt|characterai|3dprinting|cryptocurrency|selfhosted|mechanicalkeyboards|blender|headphones|monitors|mousereview|talesfromyourserver|iphone|googlepixel|teslamodel3|nikkemobile|tools|windows10|unrealengine|unity3d|oculusquest|photoshoprequest|bitcoin|binichdasarschloch|teslamotors|localllama|electricvehicles|windows11|playstation|reactjs|notion|firefox|oculus|3ds|ps4|cybersecurity|windows|golang|ipad|fl_studio|hacking|experienceddevs|ios|chrome|kotakuinaction",
    
    #6
    "world news and places": r"news|war|place|uk|canada|china|india|netherlands|singapore|russia|austrialia|polit|world|seattle|europe|philippines|france|melbourne|argentina|sweden|ireland|brasil|australia|de|unitedkingdom|geography|vancouver|london|ontario|sanfrancisco|arbeitsleben|romania|liverpoolfc|toronto|portugal|newzealand|denmark|hungary|bayarea|mexico|boston|shitamericanssay|austin|calgary|turkey|sydney|austria|ottawa|poland|washingtondc|edmonton|losangeles|chicago|serbia|malaysia|korea|oceangatetitan|imaginarymaps|denver|italia|nyc|texas|portland|croatia|military|finland|airforce|philadelphia|chile|houston|italy|army|montreal|dubai|auckland|britishcolumbia|ukrain.*",
    
    #7
    "games and media": r"gam|media|leagueoflegends|poke|anim|sims|reality|switch|batman|marvel|elden|movie|music|harry potter|fire emblem|video|comic|minecraft|diablo|leagueoflegends|genshin|onepiece|honkaistarrail|ffxiv|2007scape|wow|steam|manga|globaloffensive|dnd|dota|television|entertainment|zelda|spiderman|onepunchman|skyrim|chess|PS5|starfield|ffxvi|hearthstone|finanfantasy|battlefield2042|maplestory|fortnite|xbox|civ|midjourney|streetfighter|magictcg|overwatch|blackdesertonline|4chan|gtaonline|books|ps5|finalfantasy|fortnitebr|horror|reddeadredemption|kimetsunoyaiba|chainsawman|fallout|gundam|breath_of_the_wild|rupaulsdragrace|roblox|titanic|naruto|fivenightsatfreddys|halo|magicarena|simracing|loveislandtv|assassinscreed|successiontv|yugioh|dc_cinematic|mmorpg|ironscape|bleach|azurelane|monsterhunter|rpg|fiftyfifty|thelastairbender|bokunoheroacademia|manhwa|mtg|paydaytheheist|darksouls3|elderscrollsonline|masterduel|mortalkombat|residentevil|codcompetitive|bravorealhousewives|falloutmods|harrypotter|hollowknight|darksouls|startrek|thebear|bettercallsaul|fireemblemheroes|legendsofruneterra|sonicthehedgehog|metalgearsolid|competitiveoverwatch|fireemblem|undertale|unearthedarcana|breakingbad|codzombies|edmproduction|visualnovels|lotr|thesopranos|smashbros|raidshadowlegends|xenoblade_chronicles|pikabu|splatoon",
    
    #8
    "porny non-porn things": r"porn|uncensored",
    
    #9
    "culture and daily life": r"home|life|kpop|fitness|school|travel|cooking|fashion|culture|sports|food|nba|soccer|formula1|baseball|nfl|golf|boxing|hockey|tennis|pics|plumbing|frugal|mma|skincareaddiction|cricket|casualconversation|bjj|art|cycling|adulting|tattoodesigns|guitar|audiophile|drumkits|nosleep|30plusskincare|running|writing|peloton|baking|minipainting|beauty|steak|supplements|futebol|fragrance|weed|gardening|smoking|wwe|stopdrinking|instacelebsgossip|handbags|progresspics|orangetheory|bostonceltics",
    
    #10
    "people and relationships": r"buddy|relation|wedding|tinder|people|girls|sex|twoxchromosomes|teenagers|taylorswift|cats|apstudents|conservative|ufc|lgbt|iama|selfie|csmajors|parenting|teachers|humansbeingbros|dating|nofap|joerogan|saintmeghanmarkle|torontoraptors|celebs|furry_irl|lakers|truckers|childfree|socialskills|malelivingspace|trans|actuallesbians|irrationalmadness|kanye|vegan|raisedbynarcissists|electricians|neoliberal|virtualyoutubers|justneckbeardthings|2meirl4meirl|marriage|dogs|digitalnomad",
    
    #11
    "companies": r"doordash|discord|facebook|netflix|reddit|twitter|tumblr|instacart|twitch|tiktok|costco|walmart|dundermifflin|ubereats|nvidia|lego|youtube|bbby|intel|rolex|airbnb_hosts|bmw|limbuscompany|unitedairlines",
    
    #12
#     "other": r"|piracy|nextfuckinglevel|meirl|combatfootage|twohottakes|noncredibledefense|feedthebeast|maliciouscompliance|fauxmoi|maybemaybemaybe|valorant|squaredcircle|tearsofthekingdom|greentext|crackwatch|deadbydaylight|ufos|destiny|pathofexile|outoftheloop|bald|amd|atbge|projectzomboid|superstonk|stardewvalley|terraria|grandorder|escapefromtarkov|conspiracy|imthemaincharacter|deeprockgalactic|cps|competitivetft|baldursgate3|teamfighttactics|crusaderkings|oshinoko|gunners|asmongold|coolguides|xqcow|cars|roms|reddevils|runescape|hoi4|stellaris|battlebitremastered|totk|eu4|firstimpression|justrolledintotheshop|leopardsatemyface|citiesskylines|starterpacks|contagiouslaughter|ich_iel|ksi|kgbtr|40klore|changemyview|starcitizen|arknights|valorantcompetitive|valheim|battlestations|twosentencehorror|recruitinghell|whenthe|loseit|apexlegends|boxoffice|chelseafc|fuckcars|cracksupport|choosingbeggars|adhd|iasip|thesilphroad|polska|bollyblindsngossip|destiny2|finanzen|sipstea|watches|factorio|hiphopheads|celebhub|motorcycles|huntshowdown|redscarepod|h3h3productions|rainbow6|shitpostcrusaders|rareinsults|iamthemaincharacter|plex|jrpg|twobestfriendsplay|tf2|newgreentexts|edh|tressless|tjournal_refugees|jeeneetards|fo76|singularity|hfy|berserk|fantasy|youshouldknow|nonononoyes|epicseven|persona|residency|pathfinder_kingmaker|latestagecapitalism|fo4|privacy|delta|sneakers|btd6|rocketleague|whowouldwin|pathfinder2e|trashy|popheads|pandabuy|noahgettheboat|mtf|jujutsushi|bluearchive|clashroyale|cfb|7daystodie|rogally|persona5|csgo|trashtaste|themajorityreport|winstupidprizes|ultrawidemasterrace|repsneakers|yuzu|coys|eulamains|ark|moreplatesmoredates|namenerds|197|blankies|wohnen|daddit|divinityoriginalsin|bindingofisaac|godot|fixedbytheduet|collapse|synology|playboicarti|vexillologycirclejerk|fantasypl|allthemods|hydrohomies|clashofclans|assettocorsa|fut|darktide|forsen|ubiquiti|overclocking|bogleheads|seaofthieves|masseffect|zorozone|gundeals|ibo|overemployed|beichtstuhl|subnautica|vaushv|urbanhell|nattyorjuice|absoluteunits|ratschlag|highstrangeness|madlads|vexillology|bannerlord|bluelock|thefighterandthekid|emulation|neovim|calamitymod|abruptchaos|lowsodiumcyberpunk|guiltygear|stunfisk|eve|razer|thedivision|19684|wordington|forzahorizon|msp|hoggit|piratefolk|nier|enoughmuskspam|genp|tekken|lastimages|kitchenconfidential|nova|outfits|pikmin|airsoft|thinkpad|justunsubbed|criticalrole|competitiveapex|bambulab|fallenorder|kiddions|dayz|onebag|mtb|gunpla|swtor|propagandaposters|characterrant|assetto_pirate|bocchitherock|kengan_ashura|stardustcrusaders|conanexiles|omori|riskofrain|aoe2|needforspeed|brawlstars|blackmagicfuckery|youmo|faces|wrasslin|obsidianmd|helluvaboss|beyondthebump|hypixelskyblock"
}

In [93]:
new_1k_df[:100]

Unnamed: 0,name,desc,page_id,num_members,page_nr,date_retrieved,age_num,age_word,created_dt,human_num_members,...,science and growth,tech and software,world news and places,games and media,porny non-porn things,culture and daily life,people and relationships,companies,all_cols,hyperlink
0,r/Home,"Everything home related: interior design, home...",5_2qs0k,135237,0,2023-07-07,14,years,2009,135.2 thousand,...,,,,,,"<re.Match object; span=(2, 6), match='home'>",,,[culture and daily life],<a href='https://www.reddit.com/r/Home'>r/Home...
1,r/AskReddit,r/AskReddit is the place to ask and answer tho...,5_2qh1i,41833971,0,2023-07-07,15,years,2008,41.8 million,...,,,,,,,,"<re.Match object; span=(5, 11), match='reddit'>","[advice and opinions, companies]",<a href='https://www.reddit.com/r/AskReddit'>r...
2,r/mildlyinfuriating,jugkfmghgug,5_2ubgg,5961250,0,2023-07-07,11,years,2012,6.0 million,...,,,,,,,,,[mood],<a href='https://www.reddit.com/r/mildlyinfuri...
3,r/facepalm,/r/facepalm has gone private in protest of the...,5_2r5rp,7469361,0,2023-07-07,13,years,2010,7.5 million,...,,,,,,,,,[fails],<a href='https://www.reddit.com/r/facepalm'>r/...
4,r/diablo4,Welcome to the un official Diablo 4 subreddit!...,5_2rzx9,746468,0,2023-07-07,12,years,2011,746.5 thousand,...,,,,"<re.Match object; span=(2, 8), match='diablo'>",,,,,[games and media],<a href='https://www.reddit.com/r/diablo4'>r/d...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,r/television,,5_2qh6e,16926100,0,2023-07-07,15,years,2008,16.9 million,...,,,,"<re.Match object; span=(2, 12), match='televis...",,,,,[games and media],<a href='https://www.reddit.com/r/television'>...
96,r/PersonalFinanceCanada,This subreddit is a place to discuss anything ...,5_2tasy,1208852,0,2023-07-07,11,years,2012,1.2 million,...,"<re.Match object; span=(10, 16), match='financ'>",,"<re.Match object; span=(17, 23), match='canada'>",,,,,,"[science and growth, world news and places]",<a href='https://www.reddit.com/r/PersonalFina...
97,r/travel,r/travel is a community about exploring the wo...,5_2qh41,7912406,0,2023-07-07,15,years,2008,7.9 million,...,,,,,,"<re.Match object; span=(2, 8), match='travel'>",,,[culture and daily life],<a href='https://www.reddit.com/r/travel'>r/tr...
98,r/FashionReps,Reddit's largest community for the discussion ...,5_31hcv,1324942,0,2023-07-07,9,years,2014,1.3 million,...,,,,,,"<re.Match object; span=(2, 9), match='fashion'>",,,[culture and daily life],<a href='https://www.reddit.com/r/FashionReps'...


In [94]:
for k, v in regexes.items():
    new_1k_df[k] = new_1k_df["name"].apply(lambda x: re.search(v, x.lower()))

In [95]:
new_1k_df

Unnamed: 0,name,desc,page_id,num_members,page_nr,date_retrieved,age_num,age_word,created_dt,human_num_members,...,science and growth,tech and software,world news and places,games and media,porny non-porn things,culture and daily life,people and relationships,companies,all_cols,hyperlink
0,r/Home,"Everything home related: interior design, home...",5_2qs0k,135237,0,2023-07-07,14,years,2009,135.2 thousand,...,,,,,,"<re.Match object; span=(2, 6), match='home'>",,,[culture and daily life],<a href='https://www.reddit.com/r/Home'>r/Home...
1,r/AskReddit,r/AskReddit is the place to ask and answer tho...,5_2qh1i,41833971,0,2023-07-07,15,years,2008,41.8 million,...,,,,,,,,"<re.Match object; span=(5, 11), match='reddit'>","[advice and opinions, companies]",<a href='https://www.reddit.com/r/AskReddit'>r...
2,r/mildlyinfuriating,jugkfmghgug,5_2ubgg,5961250,0,2023-07-07,11,years,2012,6.0 million,...,,,,,,,,,[mood],<a href='https://www.reddit.com/r/mildlyinfuri...
3,r/facepalm,/r/facepalm has gone private in protest of the...,5_2r5rp,7469361,0,2023-07-07,13,years,2010,7.5 million,...,,,,,,,,,[fails],<a href='https://www.reddit.com/r/facepalm'>r/...
4,r/diablo4,Welcome to the un official Diablo 4 subreddit!...,5_2rzx9,746468,0,2023-07-07,12,years,2011,746.5 thousand,...,,,,"<re.Match object; span=(2, 8), match='diablo'>",,,,,[games and media],<a href='https://www.reddit.com/r/diablo4'>r/d...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,r/beyondthebump,"A place for new parents, new parents to be, an...",5_2u06v,634170,9,2023-07-07,11,years,2012,634.2 thousand,...,,,,,,,,,[],<a href='https://www.reddit.com/r/beyondthebum...
996,r/StructuralEngineering,Structural engineering is the use of applied p...,5_2ttcu,56876,9,2023-07-07,11,years,2012,56.9 thousand,...,"<re.Match object; span=(12, 23), match='engine...",,,,,,,,[science and growth],<a href='https://www.reddit.com/r/StructuralEn...
997,r/Boxing,"Welcome to /r/boxing, reddit's home for pro bo...",5_2qj0l,1729819,9,2023-07-07,15,years,2008,1.7 million,...,,,,,,"<re.Match object; span=(2, 8), match='boxing'>",,,[culture and daily life],<a href='https://www.reddit.com/r/Boxing'>r/Bo...
998,r/HypixelSkyblock,The official unofficial Hypixel Skyblock subre...,5_13bsg7,90364,9,2023-07-07,4,years,2019,90.4 thousand,...,,,,,,,,,[],<a href='https://www.reddit.com/r/HypixelSkybl...


In [96]:
new_1k_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   name                      1000 non-null   object
 1   desc                      991 non-null    object
 2   page_id                   1000 non-null   object
 3   num_members               1000 non-null   object
 4   page_nr                   1000 non-null   int64 
 5   date_retrieved            1000 non-null   object
 6   age_num                   1000 non-null   int64 
 7   age_word                  1000 non-null   object
 8   created_dt                1000 non-null   int64 
 9   human_num_members         1000 non-null   object
 10  subreddit_url             1000 non-null   object
 11  image_url                 666 non-null    object
 12  flourish_img_html         666 non-null    object
 13  rank_str                  1000 non-null   int64 
 14  stripped_name            

In [97]:
new_1k_df["all_cols"] = new_1k_df.iloc[:, 17:29].apply(lambda row: new_1k_df.columns[17:29].to_numpy()[row.notna().to_numpy()].tolist(), axis=1)

In [98]:
new_1k_df["hyperlink"] = new_1k_df.apply(lambda x: "<a href='" + x["subreddit_url"] + "'>" + x["name"] + "</a>", axis=1)

In [99]:
new_1k_df[:100].to_csv("./top_100_with_categories_3.csv")

In [100]:
new_1k_df[:1000].to_csv("./top_1000_with_categories_3.csv")

In [101]:
new_1k_df

Unnamed: 0,name,desc,page_id,num_members,page_nr,date_retrieved,age_num,age_word,created_dt,human_num_members,...,science and growth,tech and software,world news and places,games and media,porny non-porn things,culture and daily life,people and relationships,companies,all_cols,hyperlink
0,r/Home,"Everything home related: interior design, home...",5_2qs0k,135237,0,2023-07-07,14,years,2009,135.2 thousand,...,,,,,,"<re.Match object; span=(2, 6), match='home'>",,,[culture and daily life],<a href='https://www.reddit.com/r/Home'>r/Home...
1,r/AskReddit,r/AskReddit is the place to ask and answer tho...,5_2qh1i,41833971,0,2023-07-07,15,years,2008,41.8 million,...,,,,,,,,"<re.Match object; span=(5, 11), match='reddit'>","[advice and opinions, companies]",<a href='https://www.reddit.com/r/AskReddit'>r...
2,r/mildlyinfuriating,jugkfmghgug,5_2ubgg,5961250,0,2023-07-07,11,years,2012,6.0 million,...,,,,,,,,,[mood],<a href='https://www.reddit.com/r/mildlyinfuri...
3,r/facepalm,/r/facepalm has gone private in protest of the...,5_2r5rp,7469361,0,2023-07-07,13,years,2010,7.5 million,...,,,,,,,,,[fails],<a href='https://www.reddit.com/r/facepalm'>r/...
4,r/diablo4,Welcome to the un official Diablo 4 subreddit!...,5_2rzx9,746468,0,2023-07-07,12,years,2011,746.5 thousand,...,,,,"<re.Match object; span=(2, 8), match='diablo'>",,,,,[games and media],<a href='https://www.reddit.com/r/diablo4'>r/d...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,r/beyondthebump,"A place for new parents, new parents to be, an...",5_2u06v,634170,9,2023-07-07,11,years,2012,634.2 thousand,...,,,,,,,,,[],<a href='https://www.reddit.com/r/beyondthebum...
996,r/StructuralEngineering,Structural engineering is the use of applied p...,5_2ttcu,56876,9,2023-07-07,11,years,2012,56.9 thousand,...,"<re.Match object; span=(12, 23), match='engine...",,,,,,,,[science and growth],<a href='https://www.reddit.com/r/StructuralEn...
997,r/Boxing,"Welcome to /r/boxing, reddit's home for pro bo...",5_2qj0l,1729819,9,2023-07-07,15,years,2008,1.7 million,...,,,,,,"<re.Match object; span=(2, 8), match='boxing'>",,,[culture and daily life],<a href='https://www.reddit.com/r/Boxing'>r/Bo...
998,r/HypixelSkyblock,The official unofficial Hypixel Skyblock subre...,5_13bsg7,90364,9,2023-07-07,4,years,2019,90.4 thousand,...,,,,,,,,,[],<a href='https://www.reddit.com/r/HypixelSkybl...


In [102]:
# just checking...
new_1k_df[:100][["name", "all_cols", "tokens"]].tail(50)

Unnamed: 0,name,all_cols,tokens
50,r/Serverlife,[culture and daily life],"[r, /, server, life]"
51,r/BestofRedditorUpdates,[companies],"[r, /, best, of, reddit, or, updates]"
52,r/OnePiece,[games and media],"[r, /, one, piece]"
53,r/anime,[games and media],"[r, /, anime]"
54,r/TwoXChromosomes,[people and relationships],"[r, /, two, x, chromosomes]"
55,r/HonkaiStarRail,[games and media],"[r, /, hon, kai, star, rail]"
56,r/ffxiv,[games and media],"[r, /, ffxi, v]"
57,r/2007scape,[games and media],"[r, /, 2, 0, 0, 7, scape]"
58,r/wow,[games and media],"[r, /, wow]"
59,r/wallstreetbets,[tech and software],"[r, /, wallstreet, bets]"


In [103]:
def add_to_counter(tokens_list, main_counter):
    new_c = Counter(tokens_list)
    main_counter += new_c

In [104]:
# find rows with no categories assigned
new_1k_df[new_1k_df["all_cols"].str.len() == 0]

Unnamed: 0,name,desc,page_id,num_members,page_nr,date_retrieved,age_num,age_word,created_dt,human_num_members,...,science and growth,tech and software,world news and places,games and media,porny non-porn things,culture and daily life,people and relationships,companies,all_cols,hyperlink
27,r/Piracy,⚓ A community devoted to in-depth debate on to...,5_2qmox,1202524,0,2023-07-07,14,years,2009,1.2 million,...,,,,,,,,,[],<a href='https://www.reddit.com/r/Piracy'>r/Pi...
28,r/nextfuckinglevel,A subreddit for gifs and videos that are on an...,5_m0bnr,8172280,0,2023-07-07,4,years,2019,8.2 million,...,,,,,,,,,[],<a href='https://www.reddit.com/r/nextfuckingl...
36,r/meirl,"you, me, us, irl, reddit style",5_2s5ti,2468805,0,2023-07-07,12,years,2011,2.5 million,...,,,,,,,,,[],<a href='https://www.reddit.com/r/meirl'>r/mei...
41,r/CombatFootage,A forum for combat footage and photos from his...,5_2v0c6,1458861,0,2023-07-07,10,years,2013,1.5 million,...,,,,,,,,,[],<a href='https://www.reddit.com/r/CombatFootag...
80,r/TwoHotTakes,COPYRIGHT NOTICE: Things posted on this page a...,5_4hlyni,184290,0,2023-07-07,2,years,2021,184.3 thousand,...,,,,,,,,,[],<a href='https://www.reddit.com/r/TwoHotTakes'...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985,r/Wrasslin,Welcome to /r/Wrasslin! The dumping grounds fo...,5_2wzap,130718,9,2023-07-07,10,years,2013,130.7 thousand,...,,,,,,,,,[],<a href='https://www.reddit.com/r/Wrasslin'>r/...
986,r/ObsidianMD,Subreddit for the Obsidian notes app https://o...,5_2mz3dr,78958,9,2023-07-07,3,years,2020,79.0 thousand,...,,,,,,,,,[],<a href='https://www.reddit.com/r/ObsidianMD'>...
994,r/HelluvaBoss,For the VivziePop animation property Helluva B...,5_282xv9,132956,9,2023-07-07,3,years,2020,133.0 thousand,...,,,,,,,,,[],<a href='https://www.reddit.com/r/HelluvaBoss'...
995,r/beyondthebump,"A place for new parents, new parents to be, an...",5_2u06v,634170,9,2023-07-07,11,years,2012,634.2 thousand,...,,,,,,,,,[],<a href='https://www.reddit.com/r/beyondthebum...


In [105]:
# find most common tokens, of those without assigned categories
# this is used to determine additional regex patterns

In [106]:
tagging_counter = Counter()

In [107]:
new_1k_df[new_1k_df["all_cols"].str.len() == 0].progress_apply(lambda x: add_to_counter(x["tokens"], tagging_counter), axis=1)

  0%|          | 0/224 [00:00<?, ?it/s]

27     None
28     None
36     None
41     None
80     None
       ... 
985    None
986    None
994    None
995    None
998    None
Length: 224, dtype: object

In [108]:
tagging_counter.most_common(100)

[('r', 224),
 ('/', 224),
 ('the', 17),
 ('of', 7),
 ('4', 5),
 ('to', 5),
 ('_', 4),
 ('i', 4),
 ('6', 4),
 ('maybe', 3),
 ('character', 3),
 ('competitive', 3),
 ('3', 3),
 ('it', 3),
 ('heads', 3),
 ('h', 3),
 ('7', 3),
 ('no', 3),
 ('two', 2),
 ('valor', 2),
 ('ant', 2),
 ('circle', 2),
 ('green', 2),
 ('crack', 2),
 ('main', 2),
 ('in', 2),
 ('as', 2),
 ('cars', 2),
 ('just', 2),
 ('my', 2),
 ('ich', 2),
 ('hell', 2),
 ('apex', 2),
 ('n', 2),
 ('2', 2),
 ('t', 2),
 ('fo', 2),
 ('fantasy', 2),
 ('you', 2),
 ('person', 2),
 ('a', 2),
 ('sneakers', 2),
 ('win', 2),
 ('blue', 2),
 ('clash', 2),
 ('more', 2),
 ('1', 2),
 ('9', 2),
 ('vexillology', 2),
 ('asset', 2),
 ('for', 2),
 ('or', 2),
 ('pirate', 2),
 ('piracy', 1),
 ('next', 1),
 ('fucking', 1),
 ('level', 1),
 ('me', 1),
 ('irl', 1),
 ('combat', 1),
 ('footage', 1),
 ('hot', 1),
 ('takes', 1),
 ('feed', 1),
 ('beast', 1),
 ('malicious', 1),
 ('compliance', 1),
 ('faux', 1),
 ('moi', 1),
 ('squared', 1),
 ('tears', 1),
 ('kingdo

In [110]:
# get more regexes
# iteration 2
new_1k_df[new_1k_df["all_cols"].str.len() == 0][["name", "all_cols"]].to_csv("./data/get_more_regex.csv")

In [111]:
# file with unclassified subreddits was loaded into vs code,
# manually tagged with integers corresponding to the categories specified above,
# then reloaded into a new dataframe

# this was run previously, on a prior `get_more_regex.csv`
# the following cells reflect the first iteration of this process
get_more_regex_df = pd.read_csv("./data/get_more_regex_tagged.csv")

In [112]:
# strip off r/
get_more_regex_df["regex"] = get_more_regex_df["name"].apply(lambda x: x[2:])

In [113]:
get_more_regex_df["regex"]

0      therewasanattempt
1                   pics
2         PublicFreakout
3                 Piracy
4       nextfuckinglevel
             ...        
573          HelluvaBoss
574        beyondthebump
575      HypixelSkyblock
576        bostonceltics
577             splatoon
Name: regex, Length: 578, dtype: object

In [114]:
# create new regex_list to merge into regexes dict
regex_list = [""] * 13
def add_to_regexes(row, regex_list):
    print(int(row["all_cols"]))
    regex_list[int(row["all_cols"])] += f"|{row['regex'].lower()}"

In [116]:
get_more_regex_df.apply(lambda x: add_to_regexes(x, regex_list), axis=1)

2
9
3
12
12
12
5
12
10
1
12
12
12
12
12
12
2
3
3
0
12
12
12
12
12
12
10
12
4
12
12
6
12
12
12
10
12
7
7
11
7
7
10
6
11
0
12
7
4
6
12
12
12
9
7
12
0
1
12
12
12
5
12
7
0
12
12
12
9
7
12
12
12
7
4
3
12
12
12
12
6
12
12
12
5
6
5
9
10
10
12
12
12
12
12
12
12
12
12
10
2
12
12
12
12
5
12
7
12
7
5
1
7
12
7
12
12
12
12
12
12
12
12
12
0
12
12
2
10
12
10
12
12
12
9
10
7
12
12
7
5
6
7
5
6
7
7
6
12
7
4
5
5
4
6
0
5
0
5
6
12
6
5
9
9
5
12
4
10
5
7
6
6
10
5
9
7
6
7
12
12
12
9
12
12
4
10
12
10
12
11
0
0
12
7
0
4
7
7
7
12
12
12
5
6
6
6
9
12
7
7
5
10
5
7
12
12
9
0
12
12
1
12
12
6
6
4
12
11
7
7
12
12
4
7
4
0
4
4
7
10
12
12
12
12
12
12
12
12
10
7
12
5
5
12
12
12
4
12
6
12
7
12
12
6
10
12
12
12
12
7
12
12
7
12
5
5
10
12
12
6
7
6
7
12
12
12
7
12
7
5
7
10
4
6
9
4
5
7
7
9
6
6
9
12
0
12
12
7
7
1
12
12
6
12
11
10
12
12
10
5
12
6
9
6
12
12
4
7
6
4
11
9
9
4
12
6
12
12
12
5
12
7
10
12
6
12
12
11
12
6
7
9
7
12
7
12
10
12
5
6
6
4
5
12
12
7
12
10
12
10
10
7
12
12
12
7
9
12
3
6
12
12
9
5
5
0
12
7
12
6
7
5
6
12
4
12
4
12

0      None
1      None
2      None
3      None
4      None
       ... 
573    None
574    None
575    None
576    None
577    None
Length: 578, dtype: object

In [117]:
# Merge list below with regexes dict, then re-run category assignment and repeat if needed
regex_list

['|trueoffmychest|roastme|offmychest|whatisthisthing|truerateme|whatisthisbug|amithedevil|confessions|amiwrong|confession|whatisthiscar|iamatotalpieceofshit|whatcarshouldibuy|whatsthisplant|trueoffmychest|roastme|offmychest|whatisthisthing|truerateme|whatisthisbug|amithedevil|confessions|amiwrong|confession|whatisthiscar|iamatotalpieceofshit|whatcarshouldibuy|whatsthisplant',
 '|holup|aww|cursedcomments|clevercomebacks|oddlyspecific|holup|aww|cursedcomments|clevercomebacks|oddlyspecific',
 '|therewasanattempt|whatcouldgowrong|awfuleverything|wellthatsucks|therewasanattempt|whatcouldgowrong|awfuleverything|wellthatsucks',
 '|publicfreakout|pettyrevenge|beamazed|thatsinsane|flying|publicfreakout|pettyrevenge|beamazed|thatsinsane|flying',
 '|weird|jobs|legal|premed|realestate|stocks|entrepreneur|biology|mycology|accounting|diy|investing|trees|atheism|linkedinlunatics|creditcards|aviation|mcat|college|construction|amcstock|natureisfuckinglit|space|resumes|smallbusiness|unresolvedmysteries|