# Imports

In [19]:
#misc
import logging
import os
import pprint
logging.basicConfig(level=logging.INFO)

# env vars
import os
from dotenv import load_dotenv

# data viz
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

# data
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re
# Widen pandas' display limits so wide frames and long text cells are shown in full.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
# None means "never truncate cell text"; the former -1 sentinel is deprecated
# since pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)

#nlp
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
# Leaving this in for future work, but couldn't get current Sense2Vec 1.0.0a2 to work 
# from sense2vec import Sense2VecComponent
from tqdm import tqdm_notebook, notebook

import gensim
import gensim.corpora as corpora
from gensim import matutils
from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases, Phraser

# LDA ended up not being very effective
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
import pyLDAvis.gensim
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation

import nltk
from nltk import tokenize
#nltk.download('stopwords')
from nltk.corpus import stopwords

In [22]:
# Constants
# Read settings from the local .env file; values found there override the process environment.
load_dotenv(verbose=True, override=True)

DATA_PATH = os.getenv("DATA_PATH")
PICKLE_PATH = os.getenv("PICKLE_PATH")

# os.getenv always returns a string (or None when the variable is unset), but
# DataFrame.sample and the scikit-learn estimators below need an integer seed.
# An unset/empty variable keeps the previous behaviour (None -> unseeded).
_random_state_raw = os.getenv("RANDOM_STATE")
RANDOM_STATE = int(_random_state_raw) if _random_state_raw else None

nltk_stop_words = stopwords.words('english')
SAMPLE_SIZE = 2000
WORKERS = 3

# Original data

In [None]:
# Load the raw dump from DATA_PATH/nosleep.txt, parsed as line-delimited JSON (one record per line).
df_full = pd.read_json(f'{DATA_PATH}/nosleep.txt',lines=True)

# Clean

In [None]:
def clean_frame(frame):
    """Return a filtered, column-trimmed copy of the raw submissions frame.

    The input frame is never modified. Rows are kept only when they
      * are the first occurrence of their ``id``,
      * have a non-null ``selftext`` longer than 300 characters,
      * have a ``score`` above zero, and
      * do not contain 'voting thread' or 'NoSleep' in the ``title``.

    ``amp;<non-space run>`` fragments are stripped from ``selftext``,
    ``created_utc`` (epoch seconds) is converted to a timestamp, and
    ``created_year`` / ``created_month`` are derived from it.

    Args:
        frame: DataFrame holding at least the columns ``id``, ``selftext``,
            ``score``, ``title``, ``created_utc``, ``author``,
            ``num_comments`` and ``full_link``.

    Returns:
        A new DataFrame restricted to the nine columns listed in
        ``keep_columns`` below, in that order.
    """
    keep_columns = [
        'author',
        'created_utc',
        'created_year',
        'created_month',
        'score',
        'num_comments',
        'full_link',
        'title',
        'selftext',
    ]

    posts = frame.copy()

    # --- duplicates and missing bodies ---------------------------------
    posts = posts.drop_duplicates(subset='id').dropna(subset=['selftext'])

    # --- minimum length and score ---------------------------------------
    # (removed/deleted placeholders are short, so the length filter covers them)
    is_long_enough = posts['selftext'].str.len() > 300
    posts = posts[is_long_enough]
    has_positive_score = posts['score'] > 0
    posts = posts[has_positive_score]

    # --- domain-specific filtering ---------------------------------------
    posts['selftext'] = posts['selftext'].replace(to_replace='amp;\\S+', value='', regex=True)
    for marker in ('voting thread', 'NoSleep'):
        # `!= True` keeps rows whose title is missing (NaN compares unequal to True).
        title_has_marker = posts['title'].str.contains(marker)
        posts = posts[title_has_marker != True]

    # --- date processing ---------------------------------------------------
    posts['created_utc'] = pd.to_datetime(posts['created_utc'], unit='s')
    created_index = pd.DatetimeIndex(posts['created_utc'])
    posts['created_year'] = created_index.year
    posts['created_month'] = created_index.month

    return posts[keep_columns]

In [None]:
# Clean the raw frame and order it by score, highest first, in a single expression.
df_clean = clean_frame(df_full).sort_values(by='score', ascending=False)
# Cache the cleaned, sorted frame under DATA_PATH for the later full-data cells.
df_clean.to_pickle(f'{DATA_PATH}/reddit_df_clean.pkl')

# Sample to build initial model

I doubled the sample size iteratively until the resulting topics stopped changing: 200 -> 400 -> 800 -> 1600 -> 3200.

In [None]:
# Draw 3200 rows at random from the first 25,000 rows of df_clean, seeded with RANDOM_STATE.
# NOTE(review): the sample size is hard-coded here while the SAMPLE_SIZE constant (2000) is
# never used — confirm which value is intended.
df_clean_sub = df_clean.head(25000).sample(3200, random_state=RANDOM_STATE)
# this is small enough to commit to github
df_clean_sub.to_pickle(f'{PICKLE_PATH}/reddit_df_clean_sub.pkl')

In [2]:
# Reload the sampled frame from PICKLE_PATH (the file written by the sampling cell).
df_sample = pd.read_pickle(f'{PICKLE_PATH}/reddit_df_clean_sub.pkl')

# NLP on Sample Data

## Build Stopwords List

In [59]:
# Read the extra stop-word file, discard rows without a 'term',
# and append the remaining terms to the NLTK list.
stop_words_pd = pd.read_csv('news-stopwords/sw1k.csv').dropna(subset=['term'])
nltk_stop_words += stop_words_pd['term'].tolist()

english_lit = 'faye, mel, morrison, natasha, solomon, josie, liz, tess, cameron, aiden, steph, luke, lucas, josh, matt, amelia, dylan, jake, pete, ethan, gabe, sandler, jessica, sander, dee, nate, tasha, trevor, candice, janie, cameron, willy, mikey, tess, alexs, avi, zack, lindsay, angie, avery, patty, chrissy, jude, finn, spencer, riley, kelsey, alyssa, meredith, savannah, evan, stan, sophie, timmy, jess, joey, liam, alex, nick, angela, alex, emma, lily, phil, sam, oliver, eric, claire, Mary, Katherine, Josephine, Ana, Heidi, Patricia, Joan, Thelma, Renee, Glenda, Linda, Ashley, Shannon, Ida, Lydia, Barbara, Judith, Sheila, Vivian, Viola, Elizabeth, Rose, Ethel, Roberta, Courtney, Jennifer, Janice, Ellen, Holly, Marian, Maria, Kelly, Elaine, Brittany, Stella, Susan, Nicole, Marjorie, Melanie, Caroline, Margaret, Judy, Carrie, Loretta, Dora, Dorothy, Christina, Charlotte, Yolanda, Jo, Lisa, Kathy, Monica, Jeanette, Vickie, Nancy, Theresa, Esther, Laurie, Mattie, Karen, Beverly, Pauline, Katie, Terry, Betty, Denise, Emma, Kristen, Maxine, Helen, Tammy, Juanita, Vanessa, Irma, Sandra, Irene, Anita, Alma, Mabel, Donna, Jane, Rhonda, Sue, Marsha, Carol, Lori, Hazel, Elsie, Myrtle, Ruth, Rachel, Amber, Beth, Lena, Sharon, Marilyn, Eva, Jeanne, Christy, Michelle, Andrea, Debbie, Vicki, Deanna, Laura, Kathryn, April, Carla, Patsy, Sarah, Louise, Leslie, Tara, Hilda, Kimberly, Sara, Clara, Rosemary, Gwendolyn, Deborah, Anne, Lucille, Eileen, Jennie, Jessica, Jacqueline, Jamie, Terri, Nora, Shirley, Wanda, Joanne, Gertrude, Margie, Cynthia, Bonnie, Eleanor, Lucy, Nina, Angela, Julia, Valerie, Tonya, Cassandra, Melissa, Ruby, Danielle, Ella, Leah, Brenda, Lois, Megan, Stacey, Penny, Amy, Tina, Alicia, Wilma, Kay, Anna, Phyllis, Suzanne, Gina, Priscilla, Rebecca, Norma, Michele, Kristin, Naomi, Virginia, Paula, Gail, Jessie, Carole, Kathleen, Diana, Bertha, Natalie, Brandy, Pamela, Annie, Darlene, Agnes, Olga, Martha, Lillian, Veronica, Vera, Billie, Debra, Emily, Jill, 
Willie, Dianne, Amanda, Robin, Erin, Charlene, Tracey, Stephanie, Peggy, Geraldine, Bessie, Leona, Carolyn, Crystal, Lauren, Delores, Jenny, Christine, Gladys, Cathy, Melinda, Felicia, Marie, Rita, Joann, Pearl, Sonia, Janet, Dawn, Lorraine, Arlene, Miriam, Catherine, Connie, Lynn, Maureen, Velma, Frances, Florence, Sally, Colleen, Becky, Ann, Tracy, Regina, Allison, Bobbie, Joyce, Edna, Erica, Tamara, Violet, Diane, Tiffany, Beatrice, Joy, Kristina, Alice, Carmen, Dolores, Georgia, Toni, Julie, Rosa, Bernice, Constance, Misty, Heather, Cindy, Audrey, Lillie, Mae, Teresa, Grace, Yvonne, Claudia, Shelly, Doris, Wendy, Annette, Jackie, Daisy, Gloria, Victoria, June, Marcia, Ramona, Evelyn, Edith, Samantha, Tanya, Sherri, Jean, Kim, Marion, Nellie, Erika, Cheryl, Sherry, Dana, Minnie, Katrina, Mildred, Sylvia, Stacy, Marlene, Claire , James, Willie, Chad, Zachary, Mathew, John, Ralph, Jacob, Corey, Tyrone, Robert, Lawrence, Lee, Herman, Darren, Michael, Nicholas, Melvin, Maurice, Lonnie, William, Roy, Alfred, Vernon, Lance, David, Benjamin, Kyle, Roberto, Cody, Richard, Bruce, Francis, Clyde, Julio, Charles, Brandon, Bradley, Glen, Kelly, Joseph, Adam, Jesus, Hector, Kurt, Thomas, Harry, Herbert, Shane, Allan, Christopher, Fred, Frederick, Ricardo, Nelson, Daniel, Wayne, Ray, Sam, Guy, Paul, Billy, Joel, Rick, Clayton, Mark, Steve, Edwin, Lester, Hugh, Donald, Louis, Don, Brent, Max, George, Jeremy, Eddie, Ramon, Dwayne, Kenneth, Aaron, Ricky, Charlie, Dwight, Steven, Randy, Troy, Tyler, Armando, Edward, Howard, Randall, Gilbert, Felix, Brian, Eugene, Barry, Gene, Jimmie, Ronald, Carlos, Alexander, Marc, Everett, Anthony, Russell, Bernard, Reginald, Jordan, Kevin, Bobby, Mario, Ruben, Ian, Jason, Victor, Leroy, Brett, Wallace, Matthew, Martin, Francisco, Angel, Ken, Gary, Ernest, Marcus, Nathaniel, Bob, Timothy, Phillip, Micheal, Rafael, Jaime, Jose, Todd, Theodore, Leslie, Casey, Larry, Jesse, Clifford, Edgar, Alfredo, Jeffrey, Craig, Miguel, Milton, Alberto, Frank, 
Alan, Oscar, Raul, Dave, Scott, Shawn, Jay, Ben, Ivan, Eric, Clarence, Jim, Chester, Johnnie, Stephen, Sean, Tom, Cecil, Sidney, Andrew, Philip, Calvin, Duane, Byron, Raymond, Chris, Alex, Franklin, Julian, Gregory, Johnny, Jon, Andre, Isaac, Joshua, Earl, Ronnie, Elmer, Morris, Jerry, Jimmy, Bill, Brad, Clifton, Dennis, Antonio, Lloyd, Gabriel, Willard, Walter, Danny, Tommy, Ron, Daryl, Patrick, Bryan, Leon, Mitchell, Ross, Peter, Tony, Derek, Roland, Virgil, Harold, Luis, Warren, Arnold, Andy, Douglas, Mike, Darrell, Harvey, Marshall, Henry, Stanley, Jerome, Jared, Salvador, Carl, Leonard, Floyd, Adrian, Perry, Arthur, Nathan, Leo, Karl, Kirk, Ryan, Dale, Alvin, Cory, Sergio, Roger, Manuel, Tim, Claude, Marion, Joe, Rodney, Wesley, Erik, Tracy, Juan, Curtis, Gordon, Darryl, Seth, Jack, Norman, Dean, Jamie, Kent, Albert, Allen, Greg, Neil, Terrance, Jonathan, Marvin, Jorge, Jessie, Rene, Justin, Vincent, Dustin, Christian, Eduardo, Terry, Glenn, Pedro, Javier, Terrence, Gerald, Jeffery, Derrick, Fernando, Enrique, Keith, Travis, Dan, Clinton, Freddie, Samuel, Jeff, Lewis, Ted, Wade'
# Split the name list on commas and trim every entry. Splitting on ', ' left
# entries with stray whitespace (e.g. "Claire , James" -> 'claire '), which can
# never match a token produced by the tokenizer.
english_lit_stopwords = [
    name.strip() for name in english_lit.lower().split(',') if name.strip()
]
nltk_stop_words.extend(english_lit_stopwords)

# Corpus-specific noise: reddit/meta vocabulary, fillers, number words, digits
# and tokenizer left-overs.
nltk_stop_words.extend([
    'know', 'like', 'look', 'nosleep', 'reddit', 'post', '’s', "\'s", "'s",
    'stuff', 'subreddit', 'gt', 'lt', 'en',  'r', 'comment', 'comments',
    'gon', 'na', 'shit', 'guys', 'yeah', 'damn', 'dude', 'guess', 'ass',
    'mods', 'update', 'part', 'one', 'two', 'three', 'four', 'five', 'six',
    'seven', 'eight', 'nine', 'nsfw', 'previous', 'events', 'information',
    'edit', 'edited', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 
    'simply', 'anyway', 'whatever', 'fine', 'do', 'not', 's', 'r', 'kinda',
    'so', 'sometimes', 'maybe', 'ok'
])

# De-duplicate (keeping first-seen order) so that re-running this cell, which
# extends nltk_stop_words in place, does not keep growing the final list.
# It stays a list because it is also handed to the scikit-learn vectorizers.
stop_words = list(dict.fromkeys(nltk_stop_words))

## Pipeline

In [60]:
def docs_to_bows(docs):
    """Lazily turn each document into a list of tokens.

    Every item of ``docs`` is converted with ``str()`` and tokenised by
    gensim's ``simple_preprocess`` with ``deacc=True`` (accent marks are
    stripped from the tokens). Token lists are yielded one at a time, in
    input order.
    """
    for raw_doc in docs:
        tokens = gensim.utils.simple_preprocess(str(raw_doc), deacc=True)
        yield tokens

def remove_stopwords(texts, stop_words):
    """Drop stop words from every document.

    Args:
        texts: sized iterable of documents. Each document is passed through
            ``str()`` and re-tokenised with gensim's ``simple_preprocess``
            before filtering (unchanged from the original behaviour), so a
            list of tokens is accepted as well as a plain string.
        stop_words: iterable of lower-case words to remove.

    Returns:
        A list with one list of surviving tokens per input document.
    """
    # Build the lookup once: testing membership against the (several thousand
    # entries long) list for every token was needlessly O(len(stop_words)).
    stop_set = set(stop_words)
    # notebook.tqdm is the replacement for the deprecated tqdm_notebook wrapper.
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in notebook.tqdm(texts)
    ]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'], nlp=None):
    """Lemmatize tokenised documents, keeping only selected parts of speech.

    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: sized iterable of documents, each an iterable of string tokens.
        allowed_postags: coarse spaCy POS tags (``token.pos_``) to keep.
        nlp: loaded spaCy pipeline used for tagging. When omitted,
            ``en_core_web_md`` is loaded with the parser and NER disabled.
            (The function used to read a global ``nlp`` that was never
            defined at notebook level.)

    Returns:
        A list with one list of lemmas per input document.
    """
    if nlp is None:
        nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])
    keep_tags = frozenset(allowed_postags)

    texts_out = []
    # notebook.tqdm is the replacement for the deprecated tqdm_notebook wrapper.
    for sent in notebook.tqdm(texts):
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in keep_tags])
    return texts_out

def process_data(docs, stop_words):
    """Run the full text-preprocessing pipeline on raw documents.

    Steps, in order: regex clean-up and lower-casing, tokenisation, bigram
    model training, stop-word removal, bigram merging, lemmatization with a
    POS filter, and a second stop-word pass on the lemmas.

    Args:
        docs: iterable of raw document strings.
        stop_words: iterable of lower-case words to remove.

    Returns:
        A list with one list of lemmas per input document. The first
        processed document is also printed as a quick sanity check.
    """
    cleaned_docs = []
    for doc in docs:
        # Remove Emails
        doc = re.sub(r'\S*@\S*\s?', '', doc)
        # Remove runs of two or more x's (redactions in the corpus). The old
        # pattern 'xxx?' consumed at most three characters per match and so
        # left a stray 'x' behind for runs such as 'xxxx'.
        doc = re.sub(r'xx+', '', doc)
        # Collapse a run of o's at the end of a word to a single 'o'.
        # NOTE(review): this also shortens ordinary words ending in 'oo'
        # (e.g. 'too' -> 'to'); confirm that is intended.
        doc = re.sub(r'oo+\b', 'o', doc)
        # Remove URLs
        doc = re.sub(r'http\S+', '', doc)
        # Collapse every whitespace run (including new lines) to one space
        doc = re.sub(r'\s+', ' ', doc)
        # Remove single quotes
        doc = doc.replace("'", "")
        # lowercase, this would break NER, but don't need it
        cleaned_docs.append(doc.lower())

    # turn docs into bags of words
    list_of_bows = list(docs_to_bows(cleaned_docs))

    # Train the bigram model on the full token stream (stop words included);
    # higher threshold -> fewer phrases.
    bigram = gensim.models.Phrases(list_of_bows, min_count=5, threshold=50)

    # Phraser is the lighter, faster wrapper for applying the trained model
    bigram_model = gensim.models.phrases.Phraser(bigram)

    # Remove stop words after training the phrase model but before spaCy, for efficiency
    list_of_bows_nostops = remove_stopwords(list_of_bows, stop_words)

    # Form Bigrams
    list_of_bows_w_bigrams = [bigram_model[doc] for doc in list_of_bows_nostops]

    # keeping only tagger component (for efficiency)
    nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])

    # Lemmatize, keeping only noun, adj, verb, adv and proper noun; the loaded
    # pipeline is passed in explicitly instead of relying on a global.
    data_lemmatized = lemmatization(
        list_of_bows_w_bigrams,
        allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN'],
        nlp=nlp,
    )

    # second stopword pass on lemmatized words
    data_lemmatized = remove_stopwords(data_lemmatized, stop_words)

    print(data_lemmatized[:1])
    return data_lemmatized


## Topic Modeling (random 3,200 docs of top upvoted 25,000 of total 250,000; 1.4%)


### Pass text column through pipeline. 
**Note:** Excluding titles — these will serve as a form of validation of the story-text topic modeling.

In [80]:
# Run the preprocessing pipeline on the 'selftext' column of the sample only (titles are not passed in).
data_lemmatized = process_data(df_sample['selftext'].tolist(), stop_words)

INFO:gensim.models.phrases:collecting all words and their counts
INFO:gensim.models.phrases:PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO:gensim.models.phrases:collected 1252940 word types from a corpus of 5425041 words (unigram + bigrams) and 3196 sentences
INFO:gensim.models.phrases:using 1252940 counts as vocab in Phrases<0 vocab, min_count=5, threshold=50, max_vocab_size=40000000>
INFO:gensim.models.phrases:source_vocab length 1252940
INFO:gensim.models.phrases:Phraser built with 2267 phrasegrams
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=3196), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=3196), HTML(value='')))




  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=3196), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=3196), HTML(value='')))


[['maddie', 'confuse', 'learn', 'type', 'legalize', 'marriage', 'ms', 'jone', 'confuse', 'nice', 'brother', 'sister', 'aunt', 'wait', 'mommy', 'dad', 'sister', 'ms', 'jone', 'type', 'exist', 'dinner', 'mommy', 'dad', 'cool', 'nice', 'mommy', 'dad', 'lie', 'sister', 'dad', 'dark', 'fast', 'concerned', 'whisper', 'impaginery', 'sikiatrist', 'dinner', 'upset', 'upstairs', 'sister', 'mad', 'crying', 'forget', 'sad', 'cry', 'ms', 'jone', 'kid', 'sad', 'kid', 'ms', 'jone', 'funny', 'smile', 'sister', 'breathe', 'sister', 'friend', 'sister', 'funny', 'touch', 'sister', 'whisper', 'whimper', 'pain', 'soooo', 'weird', 'ms', 'jone', 'sister', 'cry', 'wish', 'breathe', 'confused', 'dad', 'super', 'mad', 'mention', 'cry', 'sleep', 'anymore', 'basement', 'floor', 'shovel']]


In [81]:
# Re-join each document's lemmas into one space-separated string.
docs_list_joined = [' '.join(words) for words in data_lemmatized]

### Grid Search for Best Topic Model

In [136]:
def display_topics(model, feature_names, no_top_words=15, topic_names=None):
    """Print the highest-weighted terms of every topic of a fitted model.

    Args:
        model: fitted decomposition model exposing ``components_``
            (one row of term weights per topic).
        feature_names: sequence mapping a column index to its term.
        no_top_words: number of terms printed per topic.
        topic_names: optional sequence of labels, indexed by topic number.
            Topics without a (non-empty) label are printed with their index.
            The default used to be the notebook-level ``topics`` list, which
            does not exist yet when this cell runs on a fresh kernel and is
            shorter than the larger grid-search models.
    """
    for ix, topic in enumerate(model.components_):
        # Only look a name up when one exists for this index.
        name = topic_names[ix] if topic_names and ix < len(topic_names) else None
        if not name:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '", name, "'")
        # argsort is ascending; walk it backwards to get the strongest terms first.
        top_term_ids = topic.argsort()[:-no_top_words - 1:-1]
        print(", ".join([feature_names[i] for i in top_term_ids]))

In [65]:
def hyperparameter_topics(docs_list, vectorizer='count', num_topic=15,
                          binary=False, max_df=0.08, min_df=1, stop_words = 'english',
                         model='lda', no_top_words = 15):
    """Vectorize the documents, fit one topic model and print its topics.

    Args:
        docs_list: iterable of documents, each a single string.
        vectorizer: 'count' (CountVectorizer) or 'tfidf' (TfidfVectorizer).
        num_topic: number of topics/components to fit.
        binary, max_df, min_df, stop_words: forwarded to the vectorizer.
        model: 'lda' (LatentDirichletAllocation), 'lsa' (TruncatedSVD)
            or 'nmf' (NMF).
        no_top_words: number of terms printed per topic.

    Returns:
        Tuple ``(mod, vec, feature_names, doc_word_matrix, doc_topic)``:
        the fitted model, the fitted vectorizer, the list of vocabulary
        terms, the document-term matrix and the document-topic matrix.

    Raises:
        ValueError: if ``vectorizer`` or ``model`` is not one of the
            supported names.
    """
    vectorizer_classes = {'count': CountVectorizer, 'tfidf': TfidfVectorizer}
    model_classes = {'lda': LatentDirichletAllocation, 'lsa': TruncatedSVD, 'nmf': NMF}
    # Fail early with a clear message; an unknown vectorizer used to surface
    # as an AttributeError on None.
    if vectorizer not in vectorizer_classes:
        raise ValueError(f"vectorizer must be one of {sorted(vectorizer_classes)}, got {vectorizer!r}")
    if model not in model_classes:
        raise ValueError(f"model must be one of {sorted(model_classes)}, got {model!r}")

    print('\n\n', model, vectorizer, f'num_topic:{num_topic}', f'binary:{binary}' , f'max_df:{max_df}', f'min_df:{min_df}')

    vec = vectorizer_classes[vectorizer](stop_words=stop_words, binary=binary, max_df=max_df, min_df=min_df)
    doc_word_matrix = vec.fit_transform(docs_list)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when the installed version provides it.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = vec.get_feature_names_out().tolist()
    else:
        feature_names = vec.get_feature_names()

    mod = model_classes[model](n_components=num_topic, random_state=RANDOM_STATE)
    # Every branch now returns the document-topic matrix (LDA used to return None).
    doc_topic = mod.fit_transform(doc_word_matrix)

    if model == 'lsa':
        # Fixed: this used to reference an undefined name `lsa`.
        print('explained_variance_ratio_', mod.explained_variance_ratio_)

    # The LDA branch previously called pyLDAvis.sklearn.prepare with an
    # undefined `doc_words_matrix` and discarded the result; it now prints its
    # topics like the other models. An interactive view can still be built
    # from the returned (mod, doc_word_matrix, vec).
    # topic_names=None: models in the search have varying topic counts, so
    # topics are always labelled by index here.
    display_topics(mod, feature_names, no_top_words, topic_names=None)

    return (mod, vec, feature_names, doc_word_matrix, doc_topic)

In [66]:
# Number of top terms printed per topic during the search.
no_top_words = 13

# Exhaustive sweep: one model is fitted and printed for every combination of the
# value lists below (5 * 2 * 2 * 3 * 3 * 2 = 360 fits). Results are only printed,
# the return value of hyperparameter_topics is not kept.
# NOTE: the loop variables (num_topic, vectorizer, binary, max_df, min_df, model)
# remain defined at notebook level after the loop ends.
for num_topic in [5, 10, 15, 20, 25]: 
    for vectorizer in ['tfidf', 'count']: 
        for binary in [False, True]:
            for max_df in [.2,.35,.5]:
                for min_df in [3, 5, 15]:
                    for model in ['nmf','lda']:
                        hyperparameter_topics(docs_list_joined, 
                                             vectorizer=vectorizer, 
                                             binary=binary, 
                                             max_df=max_df, 
                                             min_df=min_df, 
                                             stop_words = stop_words,
                                             model=model,
                                             num_topic=num_topic,
                                             no_top_words = no_top_words
                                            )

       




 nmf tfidf num_topic:5 binary:True max_df:0.2 min_df:5

Topic  0
lip, tooth, grip, cheek, lean, lift, sharp, knee, slide, gasp, soft, bone, flesh, wet, nose, breathe, swallow, wrap, gaze, inch

Topic  1
text, freak, apartment, camera, screen, computer, type, click, definitely, creepy, hallway, tonight, laptop, upstairs, stair, movie, tomorrow, downstairs, couch, random

Topic  2
doctor, patient, attempt, learn, contact, subject, victim, effect, choose, perhaps, remove, truth, discover, truly, exist, murder, escape, suffer, trust, contain

Topic  3
baby, sister, dinner, daughter, brother, sad, hug, sweet, kiss, angry, upset, daddy, bear, beautiful, favorite, fun, perfect, husband, funny, mommy

Topic  4
tree, mile, forest, animal, distance, direction, path, sun, flashlight, dirt, surround, sky, yard, wind, truck, rock, mountain, pack, hill, branch


 nmf tfidf num_topic:5 binary:True max_df:0.2 min_df:15

Topic  0
lip, tooth, grip, cheek, lean, lift, sharp, knee, slide, gasp, soft, fl


Topic  0
breath, whisper, voice, darkness, ear, silence, gasp, shoulder, glance, grip, lean, gaze, chest, shake, echo, shadow, fear, slowly, slow, sigh

Topic  1
sleep, wake, bed, asleep, awake, bedroom, weird, bathroom, dream, freak, noise, scared, completely, scar, normal, dark, usually, figure, woke, nightmare

Topic  2
patient, perhaps, begin, attempt, doctor, discover, subject, learn, effect, exist, explain, victim, truly, suffer, choose, destroy, remove, truth, condition, none

Topic  3
mom, dad, cry, kid, sister, baby, girl, boy, brother, daughter, sad, anymore, daddy, hurt, bear, upset, hug, mommy, nice, smile

Topic  4
tree, wood, forest, mile, distance, path, animal, sun, sky, direction, wind, dirt, mountain, flashlight, surround, camp, foot, edge, rock, clearing

Topic  5
drink, seat, table, smile, laugh, nod, bar, finish, reply, store, nice, lunch, parking, shrug, apartment, wear, restaurant, conversation, pocket, dinner

Topic  6
stair, kitchen, lock, upstairs, bedroom, b


Topic  0
nod, sigh, lean, glance, seat, smile, pause, shoulder, whisper, nodded, voice, shrug, shake, chuckle, gaze, reply, tone, expression, frown, answer

Topic  1
freak, weird, creepy, ill, friend, scared, seriously, crazy, definitely, answer, honestly, yesterday, scar, explain, fuck, hell, mention, tomorrow, figure, freaking

Topic  2
soul, fear, truth, truly, exist, perhaps, earth, accept, existence, die, desire, presence, god, choose, evil, reality, imagine, grow, horror, suffer

Topic  3
mom, dad, cry, kid, sister, baby, brother, girl, boy, daddy, sad, daughter, hurt, anymore, mommy, bear, upset, grow, hug, smile

Topic  4
wake, asleep, sleep, bed, dream, awake, bedroom, nightmare, lie, tired, woke, bathroom, dark, slowly, fear, completely, lay, tonight, terrified, notice

Topic  5
tree, forest, wood, mile, distance, sky, path, direction, sun, camp, rock, mountain, wind, branch, foot, surround, edge, darkness, clearing, dirt

Topic  6
stair, kitchen, upstairs, lock, hallway, do


Topic  0
nod, sigh, lean, nodded, whisper, pause, smile, voice, shoulder, glance, shrug, shake, chuckle, sorry, laugh, frown, expression, answer, shout, speak

Topic  1
freak, weird, fuck, text, fucking, tonight, friend, seriously, ill, crazy, creepy, apartment, freaking, hell, girlfriend, scared, boyfriend, totally, tomorrow, definitely

Topic  2
location, station, locate, attempt, contact, radio, arrive, investigate, enter, transcript, identify, occur, weapon, equipment, base, begin, soldier, subject, discover, remove

Topic  3
baby, daughter, daddy, kiss, cry, girl, mommy, beautiful, hug, sweet, hair, boy, pregnant, smile, hurt, perfect, husband, giggle, bed, dress

Topic  4
tree, forest, wood, path, mile, branch, distance, clearing, camp, flashlight, hike, animal, dirt, direction, rock, sun, deer, edge, mountain, leaf

Topic  5
driver, seat, parking, truck, ride, passenger, highway, vehicle, mile, speed, tire, driving, brake, window, engine, headlight, wheel, rear, radio, backseat


Topic  0
sigh, lean, pause, shrug, glance, nodded, chuckle, frown, expression, gaze, grin, gasp, lip, softly, wave, interrupt, gesture, shout, pocket, seat

Topic  1
consider, apparently, mention, obviously, honestly, doubt, assume, conversation, absolutely, clearly, concern, surprise, odd, disturb, usual, honest, agree, definitely, chat, bother

Topic  2
soul, existence, accept, truly, earth, truth, desire, exist, choose, evil, suffer, forever, presence, fade, perhaps, reality, escape, fate, powerful, destroy

Topic  3
baby, daddy, daughter, mommy, kiss, hug, beautiful, sweet, sad, honey, crying, favorite, pregnant, toy, upset, bear, giggle, perfect, husband, sweetie

Topic  4
tree, forest, path, mile, branch, distance, camp, hike, dirt, clearing, sun, rock, direction, wind, animal, sky, mountain, leaf, bush, deer

Topic  5
hallway, awake, footstep, hall, freeze, shadow, breathing, creak, nightmare, louder, movement, closer, faint, slam, tired, slow, quietly, blanket, pound, closed




Topic  0
nod, sigh, lean, nodded, smile, shrug, shoulder, shake, pause, frown, chuckle, whisper, glance, expression, sorry, chair, lip, reply, seat, answer

Topic  1
freak, weird, creepy, ill, seriously, scared, crazy, friend, scar, honestly, definitely, yesterday, fuck, freaking, explain, tomorrow, advice, mention, totally, text

Topic  2
attempt, locate, subject, begin, location, occur, source, assume, remove, enter, effect, appear, contact, base, inform, experiment, equipment, contain, arrive, extremely

Topic  3
eat, dinner, cook, kitchen, meal, hungry, plate, meat, breakfast, table, cooking, dish, bowl, taste, delicious, egg, lunch, chicken, fridge, restaurant

Topic  4
darkness, glow, shadow, grip, gaze, echo, terror, breath, illuminate, dim, heavy, gasp, grasp, peer, frame, shriek, flicker, sharp, stretch, stumble

Topic  5
tree, forest, wood, path, mile, camp, clearing, branch, hike, distance, flashlight, animal, dirt, sun, deer, rock, bush, trail, direction, mountain

Topic  

# NLP on Full Dataset

## Full Data Preprocess 
(Caution — this took hours on a MacBook Pro.)

In [67]:
# Reload the cleaned, score-sorted frame from DATA_PATH/reddit_df_clean.pkl.
# NOTE: this rebinds df_full, which earlier in the notebook held the raw JSON dump.
df_full = pd.read_pickle(f'{DATA_PATH}/reddit_df_clean.pkl')

In [69]:
# process_data requires the stop-word list as its second argument; the call
# without it raises a TypeError.
data_full_lemmatized = process_data(df_full['selftext'].tolist(), stop_words)

INFO:gensim.models.phrases:collecting all words and their counts
INFO:gensim.models.phrases:PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO:gensim.models.phrases:PROGRESS: at sentence #10000, processed 17949912 words and 2871400 word types
INFO:gensim.models.phrases:PROGRESS: at sentence #20000, processed 34132921 words and 4383686 word types
INFO:gensim.models.phrases:PROGRESS: at sentence #30000, processed 49493398 words and 5575488 word types
INFO:gensim.models.phrases:PROGRESS: at sentence #40000, processed 63992630 words and 6576384 word types
INFO:gensim.models.phrases:PROGRESS: at sentence #50000, processed 77804005 words and 7437971 word types
INFO:gensim.models.phrases:PROGRESS: at sentence #60000, processed 91259683 words and 8231239 word types
INFO:gensim.models.phrases:PROGRESS: at sentence #70000, processed 104087411 words and 8940176 word types
INFO:gensim.models.phrases:PROGRESS: at sentence #80000, processed 116460087 words and 9583337 word types
INFO:

HBox(children=(IntProgress(value=0, max=165814), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=165814), HTML(value='')))




  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=165814), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=165814), HTML(value='')))


[['react', 'constricting', 'sit', 'baffle', 'unsure', 'internet', 'alone', 'rent', 'employ', 'army', 'door', 'dark', 'frame', 'sister', 'rot', 'floorboard', 'pile', 'trash', 'pale', 'glow', 'monitor', 'thinning', 'frame', 'concern', 'possibly', 'healthy', 'ignore', 'completely', 'raise', 'figure', 'raise', 'fund', 'greedy', 'habit', 'monitor', 'brightness', 'dimming', 'slowly', 'shut', 'sister', 'fragile', 'frame', 'weighing', 'option', 'puzzled', 'fee', 'shut', 'computer', 'settle', 'choice', 'display', 'slowly', 'fade', 'bath', 'shadow', 'precariously', 'switch', 'jagged', 'edge', 'slice', 'decaying', 'shoe', 'piece', 'flip', 'switch', 'shut', 'computer', 'sister', 'pull', 'lighter', 'tattered', 'pocket', 'combat', 'darkness', 'surrounding', 'shade', 'mold', 'downstairs', 'arrive', 'kitchen', 'confront', 'inaction', 'computer', 'fade', 'eat', 'computer', 'wage', 'stomach', 'hungry', 'eat', 'shy', 'confrontation', 'sit', 'trash', 'spot', 'floor', 'walk', 'store', 'eat', 'anyways', 'm

In [70]:
# Re-join each full-corpus document's lemmas into one space-separated string.
docs_full_list_joined = [' '.join(words) for words in data_full_lemmatized]

### Pickle

In [71]:
# too big for github
# Cache the joined, lemmatized documents under DATA_PATH using the highest pickle protocol.
with open(f'{DATA_PATH}/full_lemmatized.pickle', 'wb') as f:
    pickle.dump(docs_full_list_joined, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# with open(f'{DATA_PATH}/full_lemmatized.pickle', 'rb') as f:
#     docs_full_list_joined_data = pickle.load(f)

## Fit small model with final params

In [111]:
# Final params from grid search
num_topic = 22
min_df = 3
max_df = .5
no_top_words = 22
binary = True
# `vectorizer` used to be whatever value the grid-search loop variable happened
# to hold, so this cell failed on its own and depended on how far the search got.
# NOTE(review): 'count' is the value a completed grid-search loop leaves behind —
# confirm it matches the run that produced the topic labels below.
vectorizer = 'count'

final_model, final_vec, feature_names, doc_word_matrix, doc_topic_sub = hyperparameter_topics(
    docs_list_joined,
    vectorizer=vectorizer,
    binary=binary,
    max_df=max_df,
    min_df=min_df,
    stop_words=stop_words,
    model='nmf',
    num_topic=num_topic,
    no_top_words=no_top_words,
)



Topic  0
nod, sigh, lean, nodded, smile, shrug, glance, shoulder, frown, chuckle, pause, chair, whisper, shake, grin, expression, sorry, seat, gaze, laugh, lip, wave

Topic  1
freak, weird, scared, creepy, ill, scar, crazy, fuck, seriously, freaking, friend, fucking, text, yesterday, tomorrow, hell, tonight, scare, totally, scary, advice, definitely

Topic  2
locate, location, attempt, soldier, base, subject, source, experiment, weapon, enter, contact, begin, remove, contain, facility, effect, equipment, operation, occur, mission, size, appear

Topic  3
baby, daddy, cry, mommy, daughter, girl, kiss, boy, hug, hurt, beautiful, hair, sweet, favorite, smile, crying, toy, nice, giggle, arm, pregnant, sad

Topic  4
tree, forest, wood, path, mile, branch, distance, camp, clearing, flashlight, hike, dirt, animal, rock, sun, direction, edge, deer, leaf, trail, bush, wind

Topic  5
driver, seat, truck, parking, highway, passenger, ride, vehicle, mile, brake, speed, tire, window, wheel, driving

In [139]:
# Labels for the 22 topics of the final model; position i is used as the name of topic i.
# NOTE(review): presumably assigned by hand from the top words printed above — the
# labels are only valid for the exact model fitted in the previous cell.
topics = ['Subtle Cues',
       'True Confessional', 'Experimental Facility', 'Newborn Child', 'Wilderness', 'Automotive',
       "In The House", 'Violent Action', 'Bones & Flesh', 'Medical', 'Technology', 'Party Hard',
       'Neighborhood', "Food", 'Night Terrors', 'Family', 'What Was That Sound?', 'Evil Spirit', 'Schools Out Forever',
       'The Ocean', 'Contemplation', 'Crime & Punishment']

## save word strengths

In [159]:
def display_t(model, feature_names, no_top_words=15, topic_names=None):
    """Print each topic's top terms and return them with their weights.

    Args:
        model: fitted decomposition model exposing ``components_``
            (one row of term weights per topic).
        feature_names: sequence mapping a column index to its term.
        no_top_words: number of terms kept per topic.
        topic_names: optional sequence of labels, indexed by topic number.
            A topic without a (non-empty) label is printed with its index
            and stored under the label ``'Topic <index>'`` instead of
            raising.

    Returns:
        DataFrame with the columns ``word``, ``strength`` and ``topic``,
        holding ``no_top_words`` rows per topic, strongest term first. Each
        topic's rows keep their own 0..n-1 index, as before.
    """
    cols = ['word', 'strength', 'topic']
    per_topic_frames = []
    for ix, topic in enumerate(model.components_):
        # Only look a name up when one exists for this index.
        name = topic_names[ix] if topic_names and ix < len(topic_names) else None
        if not name:
            print("\nTopic ", ix)
            label = f"Topic {ix}"
        else:
            print("\nTopic: '", name, "'")
            label = name
        # argsort is ascending; walk it backwards to get the strongest terms first.
        top_term_ids = topic.argsort()[:-no_top_words - 1:-1]
        df2 = pd.DataFrame([[feature_names[i], topic[i], label] for i in top_term_ids], columns=cols)
        print(", ".join(df2['word'].tolist()))
        per_topic_frames.append(df2)

    if not per_topic_frames:
        return pd.DataFrame(columns=cols)
    # DataFrame.append was removed in pandas 2.0 and copied the accumulated
    # frame on every iteration; concatenate once instead.
    return pd.concat(per_topic_frames)

In [160]:
# Collect the top `no_top_words` terms and their weights for every labelled topic of the final model.
word_strengths = display_t(final_model, feature_names, no_top_words, topic_names=topics)


Topic: ' Subtle Cues '
nod, sigh, lean, nodded, smile, shrug, glance, shoulder, frown, chuckle, pause, chair, whisper, shake, grin, expression, sorry, seat, gaze, laugh, lip, wave

Topic: ' True Confessional '
freak, weird, scared, creepy, ill, scar, crazy, fuck, seriously, freaking, friend, fucking, text, yesterday, tomorrow, hell, tonight, scare, totally, scary, advice, definitely

Topic: ' Experimental Facility '
locate, location, attempt, soldier, base, subject, source, experiment, weapon, enter, contact, begin, remove, contain, facility, effect, equipment, operation, occur, mission, size, appear

Topic: ' Newborn Child '
baby, daddy, cry, mommy, daughter, girl, kiss, boy, hug, hurt, beautiful, hair, sweet, favorite, smile, crying, toy, nice, giggle, arm, pregnant, sad

Topic: ' Wilderness '
tree, forest, wood, path, mile, branch, distance, camp, clearing, flashlight, hike, dirt, animal, rock, sun, direction, edge, deer, leaf, trail, bush, wind

Topic: ' Automotive '
driver, seat,

In [162]:
# Save the word/strength/topic table under PICKLE_PATH.
word_strengths.to_pickle(f'{PICKLE_PATH}/word_strengths.pkl')

## Transform Full Data Set

In [115]:
# Vectorize with same vectorizer we used for data subset 
# (transform only — the vocabulary fitted on the sample is reused, not refitted)
doc_word_matrix_x = final_vec.transform(docs_full_list_joined)
# Transform with same model we used for data subset
doc_topic_x = final_model.transform(doc_word_matrix_x)


In [117]:
# Join the full dataframe with the topic strengths, whose rows are in the same order as df_full
# (both derive from df_full['selftext'] in order). One column per entry of `topics`.
doc_topic_df = pd.DataFrame(doc_topic_x, columns=topics)
# Reset to a 0..n-1 index so the column-wise concat aligns row for row with doc_topic_df.
df_full = df_full.reset_index(drop=True)
df_full_topics = pd.concat([df_full, doc_topic_df], axis=1)


### Pickle

In [131]:
# Same statement as in the join cell above; on an index that is already 0..n-1 it changes nothing.
df_full = df_full.reset_index(drop=True)

In [133]:
# Same statement as in the join cell above: rebuilds df_full_topics from the same two frames.
df_full_topics = pd.concat([df_full, doc_topic_df], axis=1)

In [24]:
# too big for github, store in datapath
# (this version keeps the 'selftext' column)
df_full_topics.to_pickle(f'{DATA_PATH}/full_docs_topics_w_text.pkl')

In [25]:
# save without text, for viz and github
# (drop returns a new frame, so df_full_topics itself keeps its 'selftext' column)
df_full_topics.drop('selftext', axis=1).to_pickle(f'{PICKLE_PATH}/full_docs_topics.pkl')

# Results continue in the [Viz Notebook](reddit_nosleep_serve_viz.ipynb)