In [1]:
import pandas as pd

In [2]:
# The three transcript dumps are read with identical options: tab-separated,
# latin-1 encoded, with componentText forced to str.
read_opts = dict(sep="\t", encoding='latin-1', dtype={'componentText': str})
data1 = pd.read_csv('data/2017firsthalf.txt', **read_opts)
data2 = pd.read_csv('data/2017secondhalf.txt', **read_opts)
data3 = pd.read_csv('data/2018.txt', **read_opts)

In [3]:
# Only the first-half-2017 file is used for now. To train on all three files,
# enable the concat below instead (ignore_index=True keeps row labels unique,
# which the label-based [0] lookups further down rely on).
#transcript_train = pd.concat([data1, data2, data3], ignore_index=True)
# Take a copy: later cells add and convert columns in place, and a plain
# assignment would make those changes show up in data1 as well.
transcript_train = data1.copy()

In [4]:
# Summary statistics for the numeric columns of the loaded frame.
transcript_train.describe()

Unnamed: 0,objectId
count,146175.0
mean,3430898.0
std,16104790.0
min,18711.0
25%,91031.0
50%,180871.0
75%,305304.0
max,141885700.0


In [5]:
# Column dtypes and non-null counts as loaded, before any type conversion.
transcript_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146175 entries, 0 to 146174
Data columns (total 3 columns):
objectId                     146175 non-null int64
transcriptcreationdateUTC    146175 non-null object
componentText                146175 non-null object
dtypes: int64(1), object(2)
memory usage: 3.3+ MB


In [6]:
# First few rows, to eyeball the raw columns.
transcript_train.head()

Unnamed: 0,objectId,transcriptcreationdateUTC,componentText
0,106335,2017-01-04 15:40:41.000,Good morning. My name is Kalia and I will be y...
1,106335,2017-01-04 15:40:41.000,"Thank you, Kalia, and welcome, everyone, to Fo..."
2,106335,2017-01-04 15:40:41.000,"Thank you, Erich, and Happy New Year to everyo..."
3,106335,2017-01-04 15:40:41.000,"Great. Thanks, Mark. And welcome, everyone, to..."
4,106335,2017-01-04 15:40:41.000,"Thank you, Emily. And I'm just going to take a..."


In [7]:
# Convert the creation timestamps from their loaded string form to datetimes.
date_col = 'transcriptcreationdateUTC'
transcript_train[date_col] = pd.to_datetime(transcript_train[date_col])

In [8]:
# Re-check the dtypes after the pd.to_datetime conversion.
transcript_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146175 entries, 0 to 146174
Data columns (total 3 columns):
objectId                     146175 non-null int64
transcriptcreationdateUTC    146175 non-null datetime64[ns]
componentText                146175 non-null object
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 3.3+ MB


In [9]:
import spacy
from spacy import displacy

In [10]:
# pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz

In [11]:
import en_core_web_sm

In [12]:
# Load the en_core_web_sm spaCy pipeline; nlp is used below to parse every text.
nlp = en_core_web_sm.load()

In [13]:
#transcript_train['componentText'] = transcript_train['componentText'].to_string()

In [14]:
%%time
# Run the spaCy pipeline over every transcript component and store the
# resulting parsed documents in a new column. The source column is addressed
# by name rather than by position (iloc[:, 2]) so the cell does not silently
# parse a different column if the column order ever changes.
transcript_train['parsed_componentText'] = transcript_train['componentText'].apply(lambda text: nlp(text))

CPU times: user 3h 23min 17s, sys: 1min 22s, total: 3h 24min 40s
Wall time: 43min 31s


In [15]:
# Peek at the parsed document stored for the first row.
transcript_train.loc[0, 'parsed_componentText']

Good morning. My name is Kalia and I will be your conference operator today. At this time, I would like to welcome everyone to the Ford Monthly Sales Conference Call. [Operator Instructions] Thank you.   I would now like to turn the call over to our host, Erich Merkle, U.S. sales analyst. Please go ahead.

In [16]:
# Lemmatization and stop-word removal: for each parsed document keep the lemma
# of every token that is not a stop word and join them with single spaces.
# Punctuation tokens are kept at this stage. The parsed column is addressed by
# name rather than by position (iloc[:, 3]) so the cell does not depend on the
# column order.
transcript_train['parsed_componentText_lemma'] = transcript_train['parsed_componentText'].apply(
    lambda doc: " ".join(token.lemma_ for token in doc if not token.is_stop)
)

In [17]:
# Peek at the lemmatized, stop-word-free text of the first row.
transcript_train.loc[0, 'parsed_componentText_lemma']

'good morning . Kalia conference operator today . time , like welcome Ford Monthly Sales conference . [ Operator Instructions ] thank .    like turn host , Erich Merkle , U.S. sale analyst . ahead .'

In [18]:
# Adding customized stop words
# # New stop words list 
# customize_stop_words = [
#     'attach'
# ]

# # Mark them as stop words
# for w in customize_stop_words:
#     nlp.vocab[w].is_stop = True

In [19]:
# Calendar month of each transcript, taken with the vectorised .dt accessor
# instead of calling a Python lambda once per row.
# NOTE(review): the year is dropped, so the same month of two different years
# would share a value if multi-year data is loaded — confirm this is intended.
transcript_train['month'] = transcript_train['transcriptcreationdateUTC'].dt.month

In [20]:
# Collapse all rows sharing an objectId and month into a single document by
# joining their lemmatized texts with spaces.
lemma_groups = transcript_train.groupby(['objectId','month'])['parsed_componentText_lemma']
transcript_train1 = lemma_groups.apply(' '.join).reset_index()

In [21]:
# Inspect the aggregated frame: one row per (objectId, month) pair.
transcript_train1

Unnamed: 0,objectId,month,parsed_componentText_lemma
0,18711,2,"good day , lady gentleman , welcome Allstate F..."
1,18711,5,"good day , lady gentleman , welcome Allstate Q..."
2,18749,2,"thank stand . good day , , welcome Amazon.com ..."
3,18749,4,"thank stand . good day , , welcome Amazon.com ..."
4,19049,1,"good day , , welcome Bank America Earnings Ann..."
...,...,...,...
418,141885706,1,"good morning , everybody . Chri Schott , pharm..."
419,141885706,3,"okay . good morning , welcome second day Barcl..."
420,141885706,4,"good morning , thank stand . welcome AbbVie Qu..."
421,141885706,5,"good morning , . welcome . go ahead start . Gr..."


In [22]:
# test = list(transcript_train1.iloc[:1,2].apply(lambda x: x.noun_chunks))
# Modeling bigrams and trigrams

In [23]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

In [24]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [25]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [26]:
# Tokenization is delegated to Gensim's simple_preprocess().
def sent_to_words(sentences):
    """Lazily tokenize each item of `sentences` into a list of words.

    Every item is converted to str and passed to gensim's simple_preprocess
    with deacc=True, which strips accent marks from the tokens (the
    tokenization itself is what drops punctuation). One token list is yielded
    per input item, in input order.
    """
    for raw_text in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens

#data_words = list(sent_to_words(data))

In [27]:
# Tokenize every aggregated document; the generator is materialised with
# list() so the column receives one token list per row.
transcript_train1['list_of_words'] = list(sent_to_words(transcript_train1['parsed_componentText_lemma']))

In [28]:
# Check the new list_of_words column next to the text it was built from.
transcript_train1.head()

Unnamed: 0,objectId,month,parsed_componentText_lemma,list_of_words
0,18711,2,"good day , lady gentleman , welcome Allstate F...","[good, day, lady, gentleman, welcome, allstate..."
1,18711,5,"good day , lady gentleman , welcome Allstate Q...","[good, day, lady, gentleman, welcome, allstate..."
2,18749,2,"thank stand . good day , , welcome Amazon.com ...","[thank, stand, good, day, welcome, amazon, com..."
3,18749,4,"thank stand . good day , , welcome Amazon.com ...","[thank, stand, good, day, welcome, amazon, com..."
4,19049,1,"good day , , welcome Bank America Earnings Ann...","[good, day, welcome, bank, america, earnings, ..."


In [29]:
# Build the bigram and trigram models
transcript_train1['bigram'] = gensim.models.Phrases(transcript_train1['list_of_words'], min_count=5, threshold=10) # higher threshold fewer phrases.

# trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

In [None]:
%%time
# # Faster way to get a sentence clubbed as a trigram/bigram
transcript_train1['bigram_mod'] = transcript_train1['bigram'].apply(lambda x: gensim.models.phrases.Phraser(x))
# trigram_mod = gensim.models.phrases.Phraser(trigram)

In [None]:
transcript_train1['bigram_text'] = transcript_train1['bigram_mod'].apply(lambda x: x[transcript_train1['list_of_words']])

In [None]:
# Create Dictionary
# transcript_train1['id2word'] = transcript_train1['bigram_text'].apply(lambda x: corpora.Dictionary(x))


In [None]:
%%time
id2word = corpora.Dictionary(transcript_train1['bigram_text'][0])

In [None]:
corpus = [id2word.doc2bow(x) for x in transcript_train1['bigram_text'][0]]

In [None]:
texts = []
for i in range(len(transcript_train1['bigram_text'][0])):
    texts.append(transcript_train1['bigram_text'][0][i])
    
    

In [None]:
%%time
# Build LDA model
# Hyper-parameters are collected in one place; random_state is fixed so that
# repeated runs give the same topics.
lda_settings = dict(
    num_topics=20,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=25,
    alpha='auto',
    per_word_topics=True,
)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, **lda_settings)

In [None]:
# Print the keywords of the topics (the model is built with num_topics=20, not 10)
pprint(lda_model.print_topics())
# Apply the trained model to the corpus; doc_lda is not used again in the cells shown here.
doc_lda = lda_model[corpus]

In [None]:
# Compute Perplexity
# log_perplexity() returns the per-word likelihood bound (higher is better),
# not the perplexity itself. Perplexity is 2 ** (-bound), and for that number
# lower is better.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()  # have pyLDAvis render its output inside the notebook
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis  # last expression of the cell, so the prepared visualisation is displayed

So how do we interpret pyLDAvis’s output?

Each bubble on the left-hand side plot represents a topic. The larger the bubble, the more prevalent is that topic.

A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.

A model with too many topics will typically have many overlapping, small bubbles clustered in one region of the chart.

Alright, if you move the cursor over one of the bubbles, the words and bars on the right-hand side will update. These words are the salient keywords that form the selected topic.

We have successfully built a good-looking topic model.

Given our prior knowledge of the number of natural topics in the document, finding the best model was fairly straightforward.

Up next, we will improve upon this model by using Mallet’s version of the LDA algorithm, and then we will focus on how to arrive at the optimal number of topics for any large corpus of text.

In [None]:
# How to find the optimal number of topics for LDA?
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Train one LDA model per candidate topic count and score it with c_v coherence.

    Candidate topic counts are range(start, limit, step), so `limit` itself is
    never tried.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound (exclusive) on the number of topics
    start : First number of topics to try
    step : Increment between consecutive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models, one per candidate topic count
    coherence_values : c_v coherence of each model, in the same order as model_list
    """
    # Settings shared by every candidate model; only num_topics varies.
    shared_settings = dict(
        corpus=corpus,
        id2word=dictionary,
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )

    model_list, coherence_values = [], []
    for topic_count in range(start, limit, step):
        candidate = gensim.models.ldamodel.LdaModel(num_topics=topic_count, **shared_settings)
        model_list.append(candidate)

        scorer = CoherenceModel(model=candidate, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(scorer.get_coherence())

    return model_list, coherence_values

In [None]:
%%time
# Sweep topic counts 2, 4, ..., 28 (the function iterates range(start, limit, step), so 30 is excluded).
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=texts, start=2, limit=30, step=2)

In [None]:
# Show graph
# start/limit/step must be the same values that were passed to
# compute_coherence_values (start=2, limit=30, step=2); otherwise x and
# coherence_values have different lengths and the plot fails.
limit=30; start=2; step=2;
x = range(start, limit, step)
assert len(x) == len(coherence_values), "topic-count axis does not match coherence_values"
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# ("coherence_values") is just a string, not a tuple; pass a list so the
# legend gets the whole label rather than its individual characters.
plt.legend(["coherence_values"], loc='best')
plt.show()

In [None]:
#random cut forest to filter out outlier
# PCA
# classification/regression

#visualization
# word cloud, feature importance, S&P global leaflet, 