In [1]:
import os 
import pandas as pd

# for comprehension of language
import spacy 
from spacy import displacy

# for topics modeling
import gensim 
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel


## Loading your text and making it a corpus

#### First we need to load the text

In [3]:
tiktok_influencer =  pd.read_csv("../data/transcripts_demure.csv")

print(len(tiktok_influencer))
tiktok_influencer.head()

7


Unnamed: 0,file_name,transcript
0,../data/@joolieannie_7404929915893681451.mp4,"Ladies, let's be mindful when we use our phon..."
1,../data/@joolieannie_1724362477829.mp4,give me one that's like the size of like a fi...
2,../data/@joolieannie_1723610748575.mp4,You see how I do my makeup for work? Very dem...
3,../data/@joolieannie_1724324953244.mp4,Divas I'm in Los Angeles and Zillow needs my ...
4,../data/@joolieannie_1724362572281.mp4,"Hi, Tvaz. Okay, so I've been going to the sam..."


In [5]:
transcript_list = tiktok_influencer["transcript"].tolist()

text = ' '.join(str(x) for x in transcript_list)

In [6]:
len(text)

4104

#### Then we can load spaCy's English language trained pipeline

`A training pipeline typically reads training data from a feature store, performs model-dependent transformations, trains the model, and evaluates the model before the model is saved to a model registry.`

In [7]:
nlp = spacy.load('es_core_news_sm')

#### Stop words

A lot of languages also contain 'stop words', words that are used very frequently and may not be useful when we're evaluating how often certain words may be used. spaCy has niftyfunctions that allow us to designate stop words for our analysis. 

For this purpose, we got stopwords [here](https://github.com/stopwords-iso/stopwords-es).

In [9]:
with open("../data/stopwords.txt", "r") as file:
    stop_words = file.read().split("\n")
len(stop_words)

430

In [10]:
for stopword in stop_words:
    lexeme = nlp.vocab[stopword]
    lexeme.is_stop = True

    
doc = nlp(text)

In [11]:
len(doc)

945

In [14]:
sent = nlp("so I've been going to the same nail salon for like forever, but I just think it turns like mix it up I want to get my nails done tomorrow morning Friday morning, that's tomorrow morning I'm looking for a nail tech who has availability in Chicago.")
## 

## Computational Linguistics

#### POS-Tagging — (Part Of Speech)
spaCy has a a nifty way to look into how each word is used in a sentence, often also referred to as Part Of Speech (POS). There are eight main parts of speech — nouns, pronouns, adjectives, verbs, adverbs, prepositions, conjunctions, and interjections. 

In [15]:
for token in sent:
    print(token.text, token.pos_, token.tag_)

so ADV ADV
I've NOUN NOUN
been VERB VERB
going PROPN PROPN
to PROPN PROPN
the PROPN PROPN
same PROPN PROPN
nail PROPN PROPN
salon PROPN PROPN
for PROPN PROPN
like PROPN PROPN
forever PROPN PROPN
, PUNCT PUNCT
but NOUN NOUN
I NOUN NOUN
just VERB VERB
think PROPN PROPN
it AUX AUX
turns NOUN NOUN
like NOUN NOUN
mix ADV ADV
it VERB VERB
up PROPN PROPN
I CCONJ CCONJ
want PROPN PROPN
to PROPN PROPN
get PROPN PROPN
my PRON PRON
nails ADJ ADJ
done VERB VERB
tomorrow PROPN PROPN
morning NOUN NOUN
Friday PROPN PROPN
morning PROPN PROPN
, PUNCT PUNCT
that's PROPN PROPN
tomorrow VERB VERB
morning PROPN PROPN
I'm ADJ ADJ
looking PROPN PROPN
for PROPN PROPN
a ADP ADP
nail PROPN PROPN
tech PROPN PROPN
who PROPN PROPN
has AUX AUX
availability VERB VERB
in ADP ADP
Chicago PROPN PROPN
. PUNCT PUNCT


#### NER-Tagging — (Named Entity Recognition)
Named-entity recognition (NER) is a subtask of information extraction that seeks to locate and classify named entities mentioned in unstructured text into pre-defined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc.



In [16]:
for token in sent:
    print(token.text, token.ent_type_)

so 
I've 
been MISC
going MISC
to MISC
the MISC
same MISC
nail MISC
salon MISC
for MISC
like MISC
forever MISC
, 
but MISC
I MISC
just 
think 
it 
turns 
like MISC
mix MISC
it MISC
up MISC
I MISC
want MISC
to MISC
get MISC
my MISC
nails MISC
done MISC
tomorrow MISC
morning MISC
Friday MISC
morning MISC
, 
that's PER
tomorrow PER
morning PER
I'm 
looking 
for 
a 
nail 
tech 
who 
has 
availability MISC
in MISC
Chicago MISC
. 


Let's run a slightly different version of this code to see what role these things play:

In [17]:
for ent in sent.ents:
    print(ent.text, ent.label_)

been going to the same nail salon for like forever MISC
but I MISC
like mix it up I want to get my nails done tomorrow morning Friday morning MISC
that's tomorrow morning PER
availability in Chicago MISC


You can render these roles visually, too:

In [18]:
displacy.render(sent, style='ent', jupyter=True)

#### Dependency Parsing
The term Dependency Parsing (DP) refers to the process of examining the dependencies between the phrases of a sentence in order to determine its grammatical structure.

In [19]:

for chunk in sent.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

I've I've nsubj been
going going obj been
to the same nail to nsubj been
for for nsubj salon
, but I but obl salon
think think obj just
like like obj turns
up I want to get my nails up nsubj turns
tomorrow morning Friday morning tomorrow obj done
that's tomorrow morning I'm looking for a nail tech who that's nsubj availability
Chicago Chicago obj availability


In [20]:
for token in sent:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
         [child for child in token.children])

so advmod been VERB []
I've nsubj been VERB []
been ccomp salon PROPN [so, I've, going, to]
going obj been VERB []
to nsubj been VERB [the, same, nail]
the flat to PROPN []
same flat to PROPN []
nail flat to PROPN []
salon ROOT salon PROPN [been, for, like, forever, but, just, turns, done, ,, availability, .]
for nsubj salon PROPN []
like flat salon PROPN []
forever flat salon PROPN []
, punct but NOUN []
but obl salon PROPN [,, I]
I flat but NOUN []
just xcomp salon PROPN [think]
think obj just VERB []
it aux turns NOUN []
turns advcl salon PROPN [it, like, mix, it, up]
like obj turns NOUN []
mix advmod turns NOUN []
it advcl turns NOUN []
up nsubj turns NOUN [I, want, nails]
I flat up PROPN []
want flat up PROPN [to, get, my]
to flat want PROPN []
get flat want PROPN []
my flat want PROPN []
nails amod up PROPN []
done advcl salon PROPN [tomorrow]
tomorrow obj done VERB [morning, Friday]
morning flat tomorrow PROPN []
Friday appos tomorrow PROPN [morning]
morning flat Friday PROPN []

In [21]:
displacy.render(sent, style='dep', jupyter=True, options={'distance':90})


## Data cleaning
The next few lines get rid of stopwords and punctuation markers, and add lemmatized words.

In [17]:
for word in doc:
    print(word.lemma_)
    break

 


In [22]:
# We add some words to the stop word list
texts = []
article = []

for word in doc:
    if word.text != '\n' and not word.is_stop and not word.is_punct\
                         and not word.like_num and word.text != 'I':
        article.append(word.lemma_)
        
    # if word.text == '\n':
        texts.append(article)
        article = []
        
print(texts[0], len(texts))

[' '] 359


In [23]:
texts

[[' '],
 ['Ladies'],
 ["let's"],
 ['mindful'],
 ['phón'],
 ['You'],
 ['cutesy'],
 ['demure'],
 ['reply'],
 ['text'],
 ['emails'],
 ['picky'],
 ['picky'],
 ['flicky'],
 ['flicky'],
 ["flicky's"],
 ['And'],
 ["that's"],
 ["i'm"],
 ['Verizon'],
 ['Verizon'],
 ['tradar'],
 ['musty'],
 ['diva'],
 ['demure'],
 ['diva'],
 ['Verizon'],
 ['trade'],
 ['crusty'],
 ['phonir'],
 ['We'],
 ["don't"],
 ['crunchy'],
 ['phonar'],
 ["don't"],
 ['crack'],
 ['screar'],
 ["i'm"],
 ['typing'],
 ['phonir'],
 ['getting'],
 ['cuts'],
 ['finger'],
 ["i'm"],
 ['charging'],
 ['phonar'],
 ["it'"],
 ['crazy'],
 ['Thank'],
 ['You'],
 ["we'd"],
 ['bag'],
 ['bag'],
 ["i'm"],
 ['cute'],
 ["I'm"],
 ['respectful'],
 ['staff'],
 ["don't"],
 ['crazy'],
 ['walk'],
 ['nizar'],
 ['phonar'],
 ['And'],
 ["that's"],
 ['partner'],
 ['Verizon'],
 ['She'],
 ['elegant'],
 ['cutesy'],
 ['classy'],
 ['She'],
 ['red'],
 ["doesn't"],
 ['hot'],
 ['pink'],
 ["she's"],
 ['crazy'],
 ["she's"],
 ['demure'],
 [' '],
 ["that's"],
 ['size'],
 ['

Sometimes Topic Modeling makes more sense when New and York are treated as New York - we can do this by creating a bigram model and modifying our corpus accordingly.

In [24]:
bigram = gensim.models.phrases.Phrases(texts)
texts = [bigram[line] for line in texts]
texts = [bigram[line] for line in texts]
print(texts[0])

[' ']


In [25]:
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus[1])

[(1, 1)]


## Different Kinds of Topic Modeling 
Topic Modeling refers to the probabilistic modeling of text documents as topics. Gensim remains the most popular library to perform such modeling, and we will be using it to perform our Topic Modeling.

#### LSI — Latent Semantic Indexing
LSI stands for Latent Semantic Indexing — It is a popular information retrieval (IR) method that works by decomposing the original matrix of words to maintain key topics.

In [26]:
lsi_model = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)
lsi_model.show_topics(num_topics=15)

[(0,
  '1.000*"demure" + 0.001*"Zillow" + -0.001*"balcony" + 0.001*"availability" + -0.001*"check" + 0.001*"bush" + 0.001*"crack" + 0.001*"makeup" + -0.001*"Hollywood" + -0.001*"giving"'),
 (1,
  '1.000*"i\'m" + -0.001*"fan" + 0.001*"houses" + -0.001*"Tvaz" + 0.001*"watching" + 0.001*"murid" + -0.001*"tradar" + 0.001*"Hi" + -0.001*"makeup" + 0.001*"giving"'),
 (2,
  '0.810*"You" + 0.587*"mindful" + -0.001*"modest" + 0.001*"charging" + 0.001*"They\'re" + -0.001*"red" + -0.001*"hit" + -0.001*"crusty" + -0.001*"watching" + -0.001*"But"'),
 (3,
  '0.810*"mindful" + -0.587*"You" + 0.002*"guidance" + 0.001*"emails" + 0.001*"clown" + 0.001*"trade" + 0.001*"salon" + -0.001*"i\'ve" + -0.001*"sweet" + -0.001*"til"'),
 (4,
  '-0.890*"don\'t" + 0.456*" " + 0.002*"lot" + -0.002*"cho él" + -0.002*"houses" + 0.002*"queir" + 0.002*"makeup" + -0.002*"coming" + 0.001*"help" + 0.001*"cuts"'),
 (5,
  '0.890*" " + 0.456*"don\'t" + -0.003*"finger" + -0.002*"text" + -0.002*"red" + 0.002*"hygienic" + -0.002*"

#### HDP — Hierarchical Dirichlet Process
HDP, the Hierarchical Dirichlet Process is an unsupervised Topic Model which figures out the number of topics on its own.

In [27]:
hdp_model = HdpModel(corpus=corpus, id2word=dictionary)
hdp_model.show_topics()[:5]

[(0,
  '0.028*emails + 0.028*Marge + 0.024*world + 0.022*crunchy + 0.020*Hi + 0.019*charging + 0.017*cheaty + 0.017*bag + 0.014*look + 0.014*queir + 0.014*cho él + 0.013*phonir + 0.012*giving + 0.012*house + 0.012*color + 0.012*You + 0.012*finding + 0.012*red + 0.011*The + 0.011*namir'),
 (1,
  '0.024*guidance + 0.022*simple + 0.020*Hollywood + 0.020*bed + 0.019*obviously + 0.019*five + 0.018*coming + 0.017*Not + 0.017*cut + 0.016*beddy + 0.016*modest + 0.016*fan + 0.015*flicky + 0.014*noise + 0.014*phonar + 0.014*Selma + 0.014*lot + 0.014*screar + 0.013*drive + 0.013*gonna'),
 (2,
  "0.040*fan + 0.032*reply + 0.030*balcony + 0.025*be + 0.023*quality + 0.017*tradar + 0.016*i'll + 0.016*liv + 0.015*lot + 0.015*Tvaz + 0.015*coming + 0.015*parties + 0.014*little + 0.014*  + 0.014*crease + 0.013*nizar + 0.013*ice-water + 0.012*hid + 0.012*OMG + 0.012*hoús"),
 (3,
  '0.027*hit + 0.023*  + 0.020*typing + 0.019*classy + 0.017*noise + 0.016*crack + 0.016*diva + 0.015*simple + 0.015*demure + 0.

#### LDA — Latent Dirichlet Allocation
LDA or Latent Dirichlet Allocation is arguably the most famous Topic Modeling algorithm out there. Out here we create a simple Topic Model with 10 topics.

In [28]:
lda_model = LdaModel(corpus=corpus, num_topics=5, id2word=dictionary)
lda_model.show_topics()

[(0,
  '0.078*"i\'m" + 0.040*"diva" + 0.032*"elegant" + 0.032*"nail" + 0.032*"looking" + 0.025*" " + 0.025*"She" + 0.025*"Verizon" + 0.025*"that\'s" + 0.017*"it\'"'),
 (1,
  '0.087*"You" + 0.061*"don\'t" + 0.036*"I\'m" + 0.035*"real" + 0.019*"glibadi" + 0.019*"morning" + 0.019*"nizar" + 0.019*"phonir" + 0.019*"Mina" + 0.019*"tag"'),
 (2,
  '0.079*"demure" + 0.070*"mindful" + 0.044*"cutesy" + 0.036*"let\'s" + 0.028*" " + 0.027*"cozy" + 0.019*"gonna" + 0.019*"nails" + 0.019*"bag" + 0.019*"walk"'),
 (3,
  '0.052*"watch" + 0.032*"And" + 0.032*"she\'s" + 0.032*"phonar" + 0.023*"mindful" + 0.022*"So" + 0.022*"tomorrow" + 0.022*"house" + 0.022*"interview" + 0.012*"Verizon"'),
 (4,
  '0.047*"crazy" + 0.038*"ugly" + 0.038*"beddy" + 0.038*"girls" + 0.029*"look" + 0.029*"cute" + 0.021*"demure" + 0.020*"tech" + 0.020*"invested" + 0.020*"Not"')]

## Visualizing Topics with pyLDAvis
pyLDAvis is designed to help users interpret the topics in a topic model that has been fit to a corpus of text data. The package extracts information from a fitted LDA topic model to inform an interactive web-based visualization.

The visualization is intended to be used within an IPython notebook but can also be saved to a stand-alone HTML file for easy sharing.

In [29]:
#for visualizations
import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()


In [30]:
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)


In [37]:
pyLDAvis.save_html(vis, "../output/topics_modeling_demure.html")

In [39]:
# vis

## Word count

In [40]:
words_influencer = pd.DataFrame(texts)
print(len(words_influencer))
words_influencer.columns = ["word"]
words_influencer.head()


359


Unnamed: 0,word
0,
1,Ladies
2,let's
3,mindful
4,phón


In [41]:
word_tally = words_influencer["word"].value_counts()
word_tally.head()

word
demure     12
i'm        11
mindful    10
You        10
            7
Name: count, dtype: int64

In [42]:
word_tally.to_csv("../output/word_tally_demure.csv")