In [2]:
import pandas as pd
import numpy as np
import pickle
import nltk
import re
import string
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import spacy
from collections import Counter
import scipy.sparse as ss
from sklearn import datasets
from corextopic import corextopic as ct
from corextopic import vis_topic as vt
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Load the raw transcripts from disk.
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
with open('transcripts.pickle', 'rb') as fh:
    transcripts = pickle.load(fh)

## Create a DataFrame with 5 minutes of text per row

In [73]:
def bin_5min_blocks(df, key, block_seconds=300):
    """Group consecutive transcript rows into fixed-length time blocks.

    Starting from the first row, every following row whose ``start`` is at
    most ``block_seconds`` after the block's first ``start`` is merged into
    that block; the next row then opens a new block.

    Parameters
    ----------
    df : pandas.DataFrame
        One episode's transcript with a ``start`` column (compared
        numerically; presumably seconds -- confirm against the transcript
        source) and a ``text`` column of strings, in playback order.
    key : object
        Episode identifier copied into every output row.
    block_seconds : float, default 300
        Largest allowed offset between a block's first ``start`` and the
        ``start`` of any other row merged into it.

    Returns
    -------
    pandas.DataFrame
        Columns ``episode`` (``key``), ``time`` (the ``start`` of the block's
        first row) and ``text`` (the block's texts joined by single spaces,
        with one leading space). Empty input yields an empty frame with the
        same columns.
    """
    columns = ['episode', 'time', 'text']
    n_rows = len(df)
    if n_rows == 0:
        return pd.DataFrame([], columns=columns)

    # Pull both columns out once; per-element .iloc lookups inside the loop
    # are far slower than plain list indexing.
    starts = df['start'].tolist()
    texts = df['text'].tolist()

    blocks = []
    i = 0
    while i < n_rows:
        block_start = starts[i]
        block_end = block_start + block_seconds
        # A block always consumes its own first row. Without this, a NaN
        # start (NaN <= NaN is False) would never advance `i` and the loop
        # would spin forever.
        j = i + 1
        while j < n_rows and starts[j] <= block_end:
            j += 1
        # Leading space kept so the output matches earlier pickled results.
        blocks.append((key, block_start, ' ' + ' '.join(texts[i:j])))
        i = j
    return pd.DataFrame(blocks, columns=columns)

In [80]:
# Bin every episode's transcript, then stack the per-episode frames once.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated frame on every iteration.
episode_frames = [
    bin_5min_blocks(pd.DataFrame(transcripts[key]), key)
    for key in transcripts.keys()
]
if episode_frames:
    # Like append, concat keeps each episode's own 0..n index.
    binned_df = pd.concat(episode_frames)
else:
    # pd.concat raises on an empty list; fall back to an empty frame.
    binned_df = pd.DataFrame(columns=['episode', 'time', 'text'])

In [81]:
# Sanity check: (number of time blocks across all episodes, 3 columns)
binned_df.shape

(2964, 3)

In [82]:
# Persist the 5-minute-block frame so later sections can start from it
with open('binned_df.pickle', 'wb') as out_file:
    pickle.dump(binned_df, out_file)

# spaCy

In [14]:
# Read in the bin df
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
with open('binned_df.pickle','rb') as read_file:
    binned_df = pickle.load(read_file)

# Replace whatever index was pickled with a fresh 0..N-1 range.
# drop=True discards the old index directly instead of materialising it as
# an 'index' column and dropping it again (which fails if the index is named).
binned_df = binned_df.reset_index(drop=True)
binned_df

  and should_run_async(code)


Unnamed: 0,episode,time,text
0,xXXIACCJ2io,60.399,welcome to the podcast that's dedicated to ma...
1,xXXIACCJ2io,360.479,i mean why not the floor is ours another thin...
2,xXXIACCJ2io,661.760,comprehensive understanding of my own injurie...
3,xXXIACCJ2io,962.399,that and you probably never did it again so t...
4,xXXIACCJ2io,1263.360,assumption of what's going on with my body bu...
...,...,...,...
2959,eWZVAazaDco,2711.030,like I need some something here and and so it...
2960,eWZVAazaDco,3012.300,of that is if you ever watched cross country ...
2961,eWZVAazaDco,3313.559,not like he was a tactical magician but peopl...
2962,eWZVAazaDco,3614.590,had a you know I had our day off within the l...


In [3]:
# Load spaCy's small English core model (the en_core_web_sm package)

nlp = spacy.load('en_core_web_sm')

### Tokenize

In [6]:
# Tokenization (takes 5-10 min to run)
# nlp.pipe processes the texts as a stream; list() materialises one result per row.

binned_df['spacy_doc'] = list(nlp.pipe(binned_df.text))

# to tokenize one string:
# doc = nlp(text)

In [7]:
# Cache the tokenized frame so the slow spaCy pass need not be repeated (file is ~1.59 GB)
with open('token_5min_df.pickle', 'wb') as out_file:
    pickle.dump(binned_df, out_file)

In [13]:
# Reload the cached tokenized frame.
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
with open('token_5min_df.pickle', 'rb') as fh:
    binned_df = pickle.load(fh)

  and should_run_async(code)


In [41]:
# Preview the frame, which now carries the parsed spacy_doc column
binned_df

Unnamed: 0,episode,time,text,spacy_doc
0,xXXIACCJ2io,60.399,welcome to the podcast that's dedicated to ma...,"( , welcome, to, the, podcast, that, 's, dedic..."
1,xXXIACCJ2io,360.479,i mean why not the floor is ours another thin...,"( , i, mean, why, not, the, floor, is, ours, a..."
2,xXXIACCJ2io,661.760,comprehensive understanding of my own injurie...,"( , comprehensive, understanding, of, my, own,..."
3,xXXIACCJ2io,962.399,that and you probably never did it again so t...,"( , that, and, you, probably, never, did, it, ..."
4,xXXIACCJ2io,1263.360,assumption of what's going on with my body bu...,"( , assumption, of, what, 's, going, on, with,..."
...,...,...,...,...
2959,eWZVAazaDco,2711.030,like I need some something here and and so it...,"( , like, I, need, some, something, here, and,..."
2960,eWZVAazaDco,3012.300,of that is if you ever watched cross country ...,"( , of, that, is, if, you, ever, watched, cros..."
2961,eWZVAazaDco,3313.559,not like he was a tactical magician but peopl...,"( , not, like, he, was, a, tactical, magician,..."
2962,eWZVAazaDco,3614.590,had a you know I had our day off within the l...,"( , had, a, you, know, I, had, our, day, off, ..."


### Join each doc's nouns and adjectives into a single string

In [40]:
def key_words(row):
    """Return the lemmas of a row's non-stop-word nouns and adjectives.

    ``row['spacy_doc']`` is iterated token by token. Every token that is not
    a stop word and whose ``pos_`` is ``'NOUN'`` or ``'ADJ'`` contributes its
    ``lemma_`` followed by one space, so a non-empty result ends with a
    trailing space and an input with no matching tokens yields ``''``.
    """
    wanted_pos = ('NOUN', 'ADJ')
    pieces = []
    for tok in row['spacy_doc']:
        if tok.is_stop:
            continue
        if tok.pos_ not in wanted_pos:
            continue
        pieces.append(tok.lemma_ + ' ')
    return ''.join(pieces)

In [42]:
# Build one space-separated string of noun/adjective lemmas per row
binned_df['key_words'] = binned_df.apply(key_words ,axis=1)

# Vectorize

In [44]:
# Count-vectorize the key_words strings into a document-term matrix.
# token_pattern keeps only purely alphabetic tokens of three or more letters.
vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b')
dtm = vectorizer.fit_transform(binned_df.key_words)

# don't actually need a df of the vectorized data...
#dtm_df = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names())

In [46]:
# Shape of the document-term matrix: (documents, vocabulary size).
# Uses dtm directly -- dtm_df only exists in commented-out code, so
# referencing it raises NameError on a fresh kernel.
dtm.shape

(2964, 13981)

# Fit LDA Model

In [47]:
# Fit a 20-topic LDA model on the document-term matrix; random_state is fixed for repeatable results
lda = LatentDirichletAllocation(n_components=20, random_state=0)
lda.fit(dtm)

LatentDirichletAllocation(n_components=20, random_state=0)

### Pickle Model, dtm and vectorizer

In [49]:
# Bundle the fitted LDA model, the document-term matrix and the vectorizer
# into a single list so they are saved (and later restored) together.
mmv = [lda, dtm, vectorizer]

with open('mmv_5min.pickle', 'wb') as out_file:
    pickle.dump(mmv, out_file)

In [3]:
# Read in the saved bundle: [LDA model, document-term matrix, vectorizer].
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
with open('mmv_5min.pickle', 'rb') as fh:
    mmv = pickle.load(fh)

# Unpack by position, matching the order the bundle was written in
lda, dtm, vectorizer = mmv[0], mmv[1], mmv[2]

### pyLDAvis

In [4]:
# Interactive topic visualisation rendered inline in the notebook.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.lda_model` rather than `pyLDAvis.sklearn` -- confirm against the
# installed version before upgrading.
import pyLDAvis
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()
# Pass the fitted model together with the matrix and vectorizer it was trained on
pyLDAvis.sklearn.prepare(lda, dtm, vectorizer, mds='mmds')

In [19]:
# Get topic weights for each document: one row per dtm row, one column per topic
weights = lda.transform(dtm)

# Readable labels for the topic columns, listed in topic-index order
topic_list = ['welcome', 'training', 'power_time', 'race', 'nutrition', 'terrain',
              'medical', 'road_tactics', 'road_equip', 'technique',
              'test_optim', 'heat', 'intervals', 'physiology', 'cadence',
              'strength', 'mtb_equip', 'marginal_gains', 'protein', 'warmup']
weights_df = pd.DataFrame(weights, columns=topic_list)

# concat(axis=1) aligns on index labels, not on position. weights_df has a
# fresh 0..N-1 index, so give the document columns the same index; otherwise
# a binned_df that was not re-indexed would silently produce NaN-padded,
# misaligned rows.
assert len(binned_df) == len(weights_df), \
    'binned_df and the document-term matrix must describe the same documents'
doc_topic = pd.concat(
    [binned_df[['episode', 'time']].reset_index(drop=True), weights_df],
    axis=1,
)


  and should_run_async(code)


In [50]:
# Preview the per-document topic weights alongside episode and block time
doc_topic

  and should_run_async(code)


Unnamed: 0,episode,time,welcome,training,power_time,race,nutrition,terrain,medical,road_tactics,...,test_optim,heat,intervals,physiology,cadence,strength,mtb_equip,marginal_gains,protein,warmup
0,xXXIACCJ2io,60.399,0.845051,0.000208,0.000208,0.000208,0.000208,0.000208,0.000208,0.000208,...,0.110485,0.040922,0.000208,0.000208,0.000208,0.000208,0.000208,0.000208,0.000208,0.000208
1,xXXIACCJ2io,360.479,0.519756,0.066363,0.000258,0.363240,0.000258,0.014569,0.000258,0.000258,...,0.000258,0.000258,0.000258,0.000258,0.032206,0.000258,0.000258,0.000258,0.000258,0.000258
2,xXXIACCJ2io,661.760,0.100954,0.000259,0.000259,0.894383,0.000259,0.000259,0.000259,0.000259,...,0.000259,0.000259,0.000259,0.000259,0.000259,0.000259,0.000259,0.000259,0.000259,0.000259
3,xXXIACCJ2io,962.399,0.000243,0.000243,0.000243,0.398645,0.207422,0.057193,0.000243,0.000243,...,0.000243,0.000243,0.000243,0.000243,0.213006,0.000243,0.056443,0.000243,0.000243,0.000243
4,xXXIACCJ2io,1263.360,0.309381,0.000269,0.158191,0.067313,0.112894,0.078968,0.000269,0.000269,...,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269,0.000269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2959,eWZVAazaDco,2711.030,0.186967,0.000251,0.000251,0.124950,0.060717,0.000251,0.000251,0.000251,...,0.000251,0.000251,0.000251,0.021842,0.000251,0.000251,0.000251,0.000251,0.004754,0.000251
2960,eWZVAazaDco,3012.300,0.074200,0.159856,0.000267,0.000267,0.000267,0.000267,0.000267,0.019113,...,0.164314,0.276957,0.000267,0.000267,0.000267,0.000267,0.000267,0.015913,0.000267,0.000267
2961,eWZVAazaDco,3313.559,0.000301,0.000301,0.000301,0.000301,0.000301,0.000301,0.018603,0.000301,...,0.000301,0.000301,0.000301,0.000301,0.767786,0.000301,0.000301,0.000301,0.000301,0.000301
2962,eWZVAazaDco,3614.590,0.000273,0.192212,0.000273,0.508347,0.000273,0.000273,0.000273,0.000273,...,0.000273,0.000273,0.000273,0.000273,0.294796,0.000273,0.000273,0.000273,0.000273,0.000273


In [None]:
# what are the words that make up each topic?

In [51]:

# Topic-word weights: one row per topic, one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Look at a 20-term slice (columns 40-59) of the vocabulary
tmp_df = pd.DataFrame(lda.components_, columns=feature_names).iloc[:, 40:60]
tmp_df

  and should_run_async(code)


Unnamed: 0,abut,academia,academic,academy,acai,accelerate,acceleration,accelerometer,accent,acceptable,acceptance,accepted,accepting,access,accessibility,accessible,accessory,accident,acclamation,acclimate
0,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,1.404892,1.266594,0.05,0.05,0.05,41.657685,0.05,0.05,2.05,11.142818,0.05,0.05
1,0.05,1.483498,0.05,0.05,0.05,0.05,0.05,0.05,0.05,4.505425,0.05,0.05,0.05,7.238983,4.05,8.452929,0.05,0.05,0.05,0.05
2,0.05,0.05,0.05,0.05,0.05,0.05,1.969407,0.05,1.504084,0.05,0.05,2.05,0.05,0.05,0.05,0.05,0.05,2.338619,0.05,2.05
3,0.05,0.05,1.358675,1.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,34.801591,0.05,2.879413,0.05,0.05,0.05,0.05
4,1.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,1.569244,0.05,0.05,0.05,5.763489,4.271124,0.05
5,0.05,0.05,0.05,0.05,0.05,0.05,5.184007,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
6,0.05,0.05,0.05,0.05,0.05,0.05,2.001627,1.759844,0.05,0.05,0.05,0.05,0.05,19.870067,0.05,2.749494,0.05,0.05,0.05,0.05
7,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
8,0.05,0.05,0.05,0.05,0.05,1.272337,37.314753,4.130265,3.241024,1.404629,0.05,0.05,2.05,0.102153,0.05,6.841409,0.05,5.108672,0.05,0.05
9,0.05,0.05,0.05,0.05,0.05,6.634516,57.789303,0.05,0.05,0.05,0.05,0.05,1.05,4.142433,0.05,1.436607,0.05,0.05,0.05,0.05


In [None]:
# Weights of the sliced terms for the first topic (row 0)
tmp_df.iloc[0,:]