In [1]:
import pandas as pd
import numpy as np
import pickle
import nltk
import re
import string
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import spacy
from collections import Counter
import scipy.sparse as ss
from sklearn import datasets
from corextopic import corextopic as ct
from corextopic import vis_topic as vt
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the raw transcripts from disk.
# NOTE: unpickling can execute arbitrary code; only open files you trust.
with open('transcripts.pickle', 'rb') as transcript_file:
    transcripts = pickle.load(transcript_file)

## Create a DataFrame with 5 minutes of text per row

In [None]:
def bin_5min_blocks(df, key, bin_seconds=300):
    """Group consecutive transcript rows into fixed-length time bins.

    A bin opens at the ``start`` value of the first row that has not been
    consumed yet and absorbs every following row whose ``start`` is at most
    ``bin_seconds`` later. The next bin then opens at the first row that did
    not fit.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in playback order with a numeric ``start`` column and a string
        ``text`` column. An empty frame (even one without those columns) is
        accepted.
    key : object
        Episode identifier, copied into every output row.
    bin_seconds : number, default 300
        Width of a bin, in the same unit as ``start`` (300 gives five-minute
        bins when ``start`` is in seconds).

    Returns
    -------
    pandas.DataFrame
        One row per bin with columns ``episode``, ``time`` (the ``start`` of
        the row that opened the bin) and ``text`` (each collected text
        prefixed with a single space, concatenated).
    """
    n_rows = len(df)
    if n_rows:
        # Extract the columns once; per-row .iloc scalar lookups are slow.
        starts = df['start'].tolist()
        texts = df['text'].tolist()

    bin_starts = []
    bin_texts = []
    i = 0
    while i < n_rows:
        bin_start = starts[i]
        cutoff = bin_start + bin_seconds
        # The opening row always belongs to its own bin. Consuming it
        # unconditionally guarantees progress even when the comparison below
        # would be False for it (e.g. a NaN start), which used to loop forever.
        pieces = [texts[i]]
        i += 1
        while i < n_rows and starts[i] <= cutoff:
            pieces.append(texts[i])
            i += 1
        bin_starts.append(bin_start)
        # Keep the historical format: every piece is preceded by one space.
        bin_texts.append(''.join(' ' + piece for piece in pieces))

    return pd.DataFrame(
        {
            'episode': [key] * len(bin_starts),
            'time': bin_starts,
            'text': bin_texts,
        },
        columns=['episode', 'time', 'text'],
    )

In [None]:
# Bin every episode's transcript and stack the results into one frame.
# The per-episode frames are collected first and concatenated once:
# DataFrame.append was removed in pandas 2.0, and calling it in a loop
# re-copied the growing frame on every iteration.
episode_frames = [
    bin_5min_blocks(pd.DataFrame(rows), key)
    for key, rows in transcripts.items()
]
if episode_frames:
    # Like append(), concat keeps each episode's own 0..k index.
    binned_df = pd.concat(episode_frames)
else:
    # No transcripts at all: fall back to an empty frame with the same columns.
    binned_df = pd.DataFrame(columns=['episode', 'time', 'text'])

In [None]:
# Quick size check: (rows, columns) of the combined frame
binned_df.shape

In [None]:
# Save the 5-minute-bin frame to disk
with open('binned_df.pickle', 'wb') as out_file:
    pickle.dump(binned_df, out_file)

# spaCy

In [None]:
# Load the 5-minute-bin frame from disk
with open('binned_df.pickle', 'rb') as in_file:
    binned_df = pickle.load(in_file)

# Throw away the stored index and renumber the rows 0..n-1
binned_df = binned_df.reset_index(drop=True)

In [None]:
# Load spaCy's small English pipeline
# (spacy.load expects the 'en_core_web_sm' package to be installed already)

nlp = spacy.load('en_core_web_sm')

### Tokenize

In [None]:
# Tokenization (takes 5-10 min to run)
# nlp.pipe processes the whole text column; the results are materialised into
# a list and stored in a new 'spacy_doc' column next to their source text.

binned_df['spacy_doc'] = list(nlp.pipe(binned_df.text))

# To process a single string instead:
# doc = nlp(text)

In [None]:
# Save the frame together with its spaCy docs (large file: 1.59 GB)
with open('token_5min_df.pickle', 'wb') as out_file:
    pickle.dump(binned_df, out_file)

In [3]:
# Load the tokenized 5-minute-bin frame from disk
with open('token_5min_df.pickle', 'rb') as in_file:
    binned_df = pickle.load(in_file)

### Put Nouns and Adjectives in a string for each doc

In [4]:
def key_words(row):
    """Return the lemmas of a row's non-stop-word nouns and adjectives.

    The tokens in ``row['spacy_doc']`` are scanned in order. A token is kept
    when it is not a stop word and its ``pos_`` is ``'NOUN'`` or ``'ADJ'``.
    The kept lemmas are returned as one string in which every lemma is
    followed by a single space, so a non-empty result ends with a space.
    """
    wanted_pos = ('NOUN', 'ADJ')
    lemmas = [
        tok.lemma_
        for tok in row['spacy_doc']
        if not tok.is_stop and tok.pos_ in wanted_pos
    ]
    return ''.join(lemma + ' ' for lemma in lemmas)

In [5]:
# Build the noun/adjective string for every row (axis=1 passes whole rows)
binned_df['key_words'] = binned_df.apply(key_words ,axis=1)

# Vectorize

In [6]:
# Word counts over the key-word strings.
# Tokens are runs of three or more ASCII letters; max_df=0.3 drops terms that
# occur in more than 30% of the documents.
vectorizer = CountVectorizer(
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_df=0.3,
)
dtm = vectorizer.fit_transform(binned_df.key_words)

# Fit LDA Model

In [7]:
# Fit a 16-topic LDA model on the document-term matrix; random_state is
# pinned for reproducibility.
lda = LatentDirichletAllocation(n_components=16, random_state=0)
# doc_topic: per-document topic weights (later wrapped in a 16-column frame)
doc_topic = lda.fit_transform(dtm)

### Pickle Model, dtm and vectorizer

In [8]:
# Bundle the fitted model with its inputs and outputs, then save the bundle.
# The order matters: the bundle is read back by position.
mmv = [lda, dtm, vectorizer, doc_topic]

with open('mmv_5min.pickle', 'wb') as out_file:
    pickle.dump(mmv, out_file)

In [9]:
# Read in the LDA model bundle
with open('mmv_5min.pickle', 'rb') as in_file:
    mmv = pickle.load(in_file)

# Positions: model, document-term matrix, vectorizer, document-topic weights
lda, dtm, vectorizer, doc_topic = mmv

### pyLDAvis

In [10]:
import pyLDAvis
import pyLDAvis.sklearn
# NOTE(review): newer pyLDAvis releases appear to expose this adapter as
# pyLDAvis.lda_model instead of pyLDAvis.sklearn; confirm against the
# installed version before upgrading.

# Switch pyLDAvis to notebook display mode
pyLDAvis.enable_notebook()
# Last expression of the cell, so the prepared visualisation is what the cell
# displays. mds='mmds' picks the layout method for the topic map (presumably
# metric MDS; confirm in the pyLDAvis docs).
pyLDAvis.sklearn.prepare(lda, dtm, vectorizer, mds='mmds')

In [11]:
# Print the top words for each topic
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted words of every topic in ``model``.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_`` (one weight row per topic).
    feature_names : sequence of str
        Vocabulary, indexed by the column positions of ``components_``.
    no_top_words : int
        Number of words to print per topic.
    topic_names : sequence of str, optional
        Labels to print instead of the topic number. A missing sequence or an
        empty label falls back to the number.
    """
    for ix, weights in enumerate(model.components_):
        if topic_names and topic_names[ix]:
            print("\nTopic: '", topic_names[ix], "'")
        else:
            print("\nTopic ", ix)
        # argsort is ascending: reverse it, then keep the leading entries.
        top_indices = weights.argsort()[::-1][:no_top_words]
        top_words = [feature_names[i] for i in top_indices]
        print(", ".join(top_words))

  and should_run_async(code)


In [12]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Use get_feature_names_out() when the installed version has
# it and fall back to the old method otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Show the 16 highest-weighted words of each topic
display_topics(lda, feature_names, 16)


Topic  0
weight, strength, muscle, pound, fat, leg, fiber, strong, endurance, calorie, cyclist, type, watt, mass, gym, force

Topic  1
coach, thank, cycling, awesome, trainerroad, group, episode, live, trainer, video, cyclist, team, fun, new, forum, pro

Topic  2
plan, volume, recovery, stress, base, fitness, season, event, intensity, sweet, life, important, phase, month, rest, build

Topic  3
food, bottle, calorie, gram, carbohydrate, water, nutrition, fat, gel, station, carb, caffeine, protein, morning, aid, meal

Topic  4
mountain, climb, mile, cross, section, flat, trail, skill, track, technical, effort, group, speed, rider, stage, fun

Topic  5
test, ramp, collagen, breath, breathing, air, cold, allergy, mouth, nose, asthma, nitrate, interesting, meat, result, number

Topic  6
team, rider, group, field, line, lap, wheel, sprint, turn, breakaway, corner, crit, wind, speed, sprinter, attack

Topic  7
muscle, study, blood, cell, effect, performance, exercise, fat, endurance, brain, 

  and should_run_async(code)


In [15]:
# Give each topic a readable label, then attach the per-document topic weights
# to the episode/time of the bin they belong to.
topic_list = [
    'strength', 'tr_chat', 'training_plan_structure', 'nutrition',
    'mtb_racing', 'testing_medical', 'road_racing', 'physiology',
    'interuptions', 'workout_structure', 'bike_mechanics', 'trainer_technical',
    'riding_technical', 'heat_sweat', 'special_physical_cases', 'goals',
]
doc_topic_df = pd.concat(
    [
        binned_df[['episode', 'time']],
        pd.DataFrame(doc_topic, columns=topic_list),
    ],
    axis=1,
)

  and should_run_async(code)
