# Load Data

In [1]:
import os
import pandas as pd
import re
import string
import warnings
# Silence library warnings and pandas' chained-assignment warning for the whole notebook.
warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None
# Directory holding the input CSV files. Set the COURSERA_NLP_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the original
# local path is used, so existing behaviour is unchanged.
root_path = os.environ.get(
    'COURSERA_NLP_DATA_DIR',
    'C:\\Users\\luoyan011\\Desktop\\PersonalLearning\\GitHub\\private_task\\coursera_nlp\\data',
)

In [2]:
# Read the course-overview export from the configured data directory.
overview_csv = os.path.join(root_path, "Course_Courseoverviews.csv")
raw_data = pd.read_csv(overview_csv)

In [3]:
# Keep only the courses that have a short description. The explicit copy makes
# `data` independent of `raw_data`, so the column assignments below write to a
# real frame instead of a slice of the raw one.
data = raw_data[raw_data.short_description.notna()].copy()
# One text document per course: "<display_name>. <short_description>".
data['doc'] = data[['display_name', 'short_description']].agg('. '.join, axis=1)
# course_id is split on '+' and its second field is kept as the base course id.
data['base_course_id'] = data['course_id'].apply(lambda x: x.split('+')[1])
data = data.drop(columns = ['course_id', 'last_modified'])
data = data.drop_duplicates()
# Keep the row label available as a regular column.
data['index'] = data.index

In [4]:
data.head(5)

Unnamed: 0,display_name,short_description,doc,base_course_id,index
0,Mathematical Optimization for Business Problems,This course provides the necessary fundamental...,Mathematical Optimization for Business Problem...,CP0101EN,0
1,Accelerating Deep Learning with GPU,Majority of data in the world are unlabeled an...,Accelerating Deep Learning with GPU. Majority ...,ML0122ENv1,1
2,Controlling Hadoop Jobs Using Oozie,This short description is not used. The descri...,Controlling Hadoop Jobs Using Oozie. This shor...,BD0133EN,2
3,"Robots Are Coming! Build IoT Apps with Watson,...",Have fun with IoT and learn along the way. If ...,"Robots Are Coming! Build IoT Apps with Watson,...",ML0201EN,3
4,Deep Learning with TensorFlow - Beta,Majority of data in the world are unlabeled an...,Deep Learning with TensorFlow - Beta. Majority...,ML0121EN,4


# Generate tokens manually

```python

```

In [29]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams
from nltk import pos_tag
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
porter = PorterStemmer()

def find_tokens(example, ngram, stem, tag_drop): #lemmatization to be done
    """Turn one document into a list of lower-cased, stop-word-free n-gram tokens.

    Parameters
    ----------
    example : str
        Raw text of the document.
    ngram : int
        Size of the n-grams to emit. 1 returns the individual words; any other
        value returns space-joined runs of `ngram` consecutive kept words.
    stem : bool
        When truthy, each kept word is Porter-stemmed before n-grams are built.
    tag_drop : collection of str
        First letters of the POS tags (as returned by `pos_tag`) whose words are
        discarded: J--adjective, N--noun, V--verb, R--adverb.

    Returns
    -------
    list of str
        The tokens in document order.
    """
    example = example.lower()
    example = example.translate(str.maketrans('', '', string.punctuation))
    stop_words = set(stopwords.words('english'))
    # Tag the full token sequence first, then drop words by the first letter of their tag.
    word_tokens = [word for word, tag in pos_tag(word_tokenize(example)) if tag[0] not in tag_drop]
    kept_words = [w for w in word_tokens if w not in stop_words]
    # Truthiness test (rather than `== True` / `== False`) so that any truthy or
    # falsy flag takes one of the two paths instead of silently returning [].
    if stem:
        kept_words = [porter.stem(w) for w in kept_words]
    if ngram == 1:
        return kept_words
    return [" ".join(grams) for grams in ngrams(kept_words, ngram)]


def find_multiple_gram_tokens(examples, multi_gram, stem, tag_drop):
    """Tokenise `examples` once per n-gram size and concatenate the results.

    `multi_gram` is an iterable of n-gram sizes (e.g. [1, 2]); the remaining
    arguments are passed unchanged to `find_tokens`. The tokens of each size
    appear in the order the sizes are listed.
    """
    return [
        token
        for gram_size in multi_gram
        for token in find_tokens(examples, gram_size, stem, tag_drop)
    ]

def get_vocab(tokens):
    """Return the distinct tokens across all documents, in first-seen order.

    Parameters
    ----------
    tokens : iterable of iterable of str
        One token list per document.

    Returns
    -------
    list of str
        Every token that occurs at least once, without duplicates.
    """
    seen = set()  # O(1) membership test; `vocab` keeps the insertion order
    vocab = []
    for doc_tokens in tokens:
        for token in doc_tokens:
            if token not in seen:
                seen.add(token)
                vocab.append(token)
    return vocab



In [41]:
# Unigram tokens of each document: no stemming, words whose POS tag starts with 'J' dropped.
# NOTE(review): the same assignment is repeated in the following cell (In [5]).
data['doc_tokens'] = data['doc'].apply(lambda x: find_tokens(x.lower(), 1, stem = False, tag_drop=['J']))
# Show the tokens of the row labelled 0 as a spot check.
data.doc_tokens[0]
#get_vocab(data.doc_tokens)

['optimization',
 'business',
 'problems',
 'course',
 'provides',
 'fundamentals',
 'programming']

In [5]:
# Unigram tokens (no stemming, 'J'-tagged words dropped) for the combined
# document, the title and the description, each stored in its own column.
for text_col, token_col in (
    ('doc', 'doc_tokens'),
    ('display_name', 'title_tokens'),
    ('short_description', 'description_tokens'),
):
    data[token_col] = data[text_col].apply(lambda x: find_tokens(x.lower(), 1, stem = False, tag_drop=['J']))
# Distinct tokens over all combined documents.
testtoken = get_vocab(data.doc_tokens.to_list())

In [6]:
# Bigram tokens of each document (ngram=2), no stemming, words whose POS tag starts with 'J' dropped.
data['tokens'] = data['doc'].apply(lambda x: find_tokens(x.lower(),2, stem = False, tag_drop=['J']))

In [7]:
data['tokens'][0]

['optimization business',
 'business problems',
 'problems course',
 'course provides',
 'provides fundamentals',
 'fundamentals programming']

# Generate tokens using function from sklearn

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
def generate_bw_data(data, ngram_range):
    """Build a dense document-term count table with scikit-learn's CountVectorizer.

    Parameters
    ----------
    data : iterable of str
        The raw documents, e.g. data.doc.
    ngram_range : tuple of (int, int)
        Minimum and maximum n-gram size, passed straight to CountVectorizer.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary term; the values
        are the term counts. English stop words are removed and the text is
        lower-cased by the vectorizer.
    """
    cv = CountVectorizer(stop_words = 'english', lowercase = True, ngram_range = ngram_range)
    word_count = cv.fit_transform(data)
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older releases.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()
    return pd.DataFrame(word_count.toarray(), columns = feature_names)

In [9]:
# Document-term count tables over data.doc: unigrams only, then bigrams only.
bw1_df = generate_bw_data(data.doc, (1,1))
bw2_df = generate_bw_data(data.doc, (2,2))

# Fit Model using gensim

In [46]:
# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import TfidfModel
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from functools import partial

### LDA

In [11]:
# Tokenise every document into unigrams + bigrams (no stemming, 'J'-tagged words dropped).
tokenize_doc = partial(find_multiple_gram_tokens, multi_gram = [1,2], stem = False, tag_drop=['J'])
processed_docs = data['doc'].map(tokenize_doc)
dictionary = gensim.corpora.Dictionary(processed_docs)
# Drop tokens that appear in fewer than 3 documents or in more than half of
# them, then keep at most the 100000 most frequent of the rest.
dictionary.filter_extremes(no_below=3, no_above=0.5, keep_n=100000)
bow_corpus = [dictionary.doc2bow(tokens) for tokens in processed_docs]
# Three-topic LDA with a fixed seed.
lda_settings = dict(num_topics = 3, id2word = dictionary, passes = 2, workers = 2, random_state = 100)
lda_model = gensim.models.LdaMulticore(corpus = bow_corpus, **lda_settings)

In [12]:
def print_bow_example(text, dictionary):
    """Print one line per bag-of-words entry of a single document.

    `text` is a sequence of (token_id, count) pairs and `dictionary` maps a
    token id to its token string; each line shows the id, the token and the count.
    """
    for entry in text:
        token_id, count = entry[0], entry[1]
        print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], count))
print_bow_example(bow_corpus[0], dictionary)

Word 0 ("business") appears 1 time.
Word 1 ("business problems") appears 1 time.
Word 2 ("fundamentals") appears 1 time.
Word 3 ("problems") appears 1 time.
Word 4 ("programming") appears 1 time.
Word 5 ("provides") appears 1 time.


In [13]:
# Print each topic of lda_model with its index and its weighted-term string.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.019*"learn" + 0.016*"presents" + 0.016*"answers" + 0.016*"approach" + 0.016*"presents approach" + 0.016*"course presents" + 0.016*"answers questions" + 0.016*"questions" + 0.015*"use" + 0.015*"matters"
Topic: 1 
Words: 0.033*"learning" + 0.030*"analytics" + 0.026*"learn" + 0.024*"machine" + 0.022*"machine learning" + 0.021*"using" + 0.016*"watson" + 0.012*"text" + 0.012*"apache" + 0.011*"analysis"
Topic: 2 
Words: 0.025*"used" + 0.022*"description" + 0.020*"science" + 0.018*"data science" + 0.015*"learning" + 0.013*"taken" + 0.012*"spark" + 0.012*"provide" + 0.011*"one" + 0.011*"course image"


In [14]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# Switch pyLDAvis to notebook display mode.
pyLDAvis.enable_notebook()
# Prepare the visualisation from the three-topic model and the corpus/dictionary it was trained on.
vis = gensimvis.prepare(lda_model, bow_corpus, dictionary)


In [15]:
# Re-apply the global warning filter, then display the prepared visualisation.
warnings.filterwarnings('ignore')
vis

  and should_run_async(code)


### TFIDF

In [44]:
def get_tfidf_dataframe(data, multi_gram, stem, tag_drop):
    """Compute tf-idf weights for every (document, token) pair, in long format.

    Parameters
    ----------
    data : the document column to score, e.g. data['doc']
    multi_gram : list of n-gram sizes, e.g. [1, 2]
    stem : whether the words are stemmed (True or False)
    tag_drop : POS-tag first letters to drop, passed through to the tokeniser

    Returns
    -------
    pandas.DataFrame with the columns
        doc_id       position of the document within `data`
        bow          the token text
        tfidf_value  the tf-idf weight of that token in that document
    """
    tokenizer = partial(find_multiple_gram_tokens, multi_gram = multi_gram, stem = stem, tag_drop = tag_drop)
    processed_docs = data.map(tokenizer)
    dictionary = gensim.corpora.Dictionary(processed_docs)
    # Drop tokens that appear in fewer than 3 documents or in more than half of
    # them, then keep at most the 100000 most frequent of the rest.
    dictionary.filter_extremes(no_below=3, no_above=0.5, keep_n=100000)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in processed_docs]
    weighted_corpus = TfidfModel(bow_corpus)[bow_corpus]
    # Flatten the per-document vectors into (position, token, weight) triples.
    rows = [
        (position, dictionary.get(token_id), weight)
        for position, doc_vector in enumerate(weighted_corpus)
        for token_id, weight in doc_vector
    ]
    return pd.DataFrame({
        "doc_id": [row[0] for row in rows],
        "bow": [row[1] for row in rows],
        "tfidf_value": [row[2] for row in rows],
    })
def get_top_n_tfidf_bow(data, multi_gram, stem, tag_drop, n):
    """Return the `n` tokens whose best tf-idf weight in any document is highest.

    Parameters
    ----------
    data : the document column to score, e.g. data['doc']
    multi_gram : list of n-gram sizes, e.g. [1, 2]
    stem : whether the words are stemmed (True or False)
    tag_drop : POS-tag first letters to drop, passed through to the tokeniser
    n : number of tokens wanted, e.g. 30

    Returns
    -------
    list of str, ordered by descending maximum tf-idf weight.
    """
    tfidf_long = get_tfidf_dataframe(data, multi_gram, stem, tag_drop)
    # Largest weight each token reaches in any single document.
    best_per_token = tfidf_long.groupby('bow')['tfidf_value'].nlargest(1)
    ranked = best_per_token.reset_index(drop = False).sort_values(by = 'tfidf_value', ascending = False)
    return ranked.head(n).bow.tolist()

In [17]:
# Long-format tf-idf table, followed by the top-30 and top-50 tokens by maximum tf-idf weight.
# NOTE(review): get_top_n_tfidf_bow calls get_tfidf_dataframe internally, so the
# tokenisation and tf-idf computation run three times in this cell.
tfidf_value_data = get_tfidf_dataframe(data = data['doc'], multi_gram = [1,2], stem = False, tag_drop = ['J'])
to30_tfidf_bow = get_top_n_tfidf_bow(data = data['doc'], multi_gram = [1,2], stem = False, tag_drop = ['J'], n = 30)
to50_tfidf_bow = get_top_n_tfidf_bow(data = data['doc'], multi_gram = [1,2], stem = False, tag_drop = ['J'], n = 50)

In [18]:
to30_tfidf_bow

['sql',
 'statistics',
 'computing',
 'scala',
 'welcome',
 'r',
 'hadoop',
 'microservices',
 'python',
 'cloud',
 'containers',
 'pipelines',
 'blockchain',
 'kubernetes',
 'projects',
 'spark',
 'search',
 'analytics 101',
 'tools',
 'server',
 'swift',
 'ai',
 'analytics',
 'functions',
 'end',
 'science bootcamp',
 'bootcamp',
 'web',
 'apache',
 'science']

### Customized LDA

In [103]:
def keep_specific_tokens(examples, multi_gram, stem, tag_drop, selected_tokens):
    """Tokenise `examples` and keep only the tokens found in `selected_tokens`.

    The first four arguments are passed unchanged to `find_multiple_gram_tokens`;
    the surviving tokens keep their original order and multiplicity.
    """
    candidates = find_multiple_gram_tokens(examples, multi_gram, stem, tag_drop)
    return list(filter(lambda token: token in selected_tokens, candidates))

def fit_lda(data, multi_gram, stem, tag_drop, top_n_tokens, num_topics, passes = 2, workers = 2, random_state = 100):
    """Fit a gensim LDA model on a document column, optionally restricted to top tf-idf tokens.

    Parameters
    ----------
    data : the document column to model, e.g. data['doc']
    multi_gram : list of n-gram sizes, e.g. [1, 2]
    stem : whether the words are stemmed (True or False)
    tag_drop : POS-tag first letters to drop, passed through to the tokeniser
    top_n_tokens : int, '' or None
        Number of top tf-idf tokens to keep in each document before fitting.
        '' (the original sentinel) or None means "use every token".
    num_topics : number of LDA topics
    passes, workers, random_state : forwarded to gensim.models.LdaMulticore;
        the defaults reproduce the previously hard-coded values.

    Returns
    -------
    The fitted gensim.models.LdaMulticore model.
    """
    if top_n_tokens is None or top_n_tokens == '':
        tokenizer = partial(find_multiple_gram_tokens, multi_gram = multi_gram, stem = stem, tag_drop = tag_drop)
    else:
        selected_tokens = get_top_n_tfidf_bow(data = data, multi_gram = multi_gram, stem = stem, tag_drop = tag_drop, n = top_n_tokens)
        tokenizer = partial(keep_specific_tokens, multi_gram = multi_gram, stem = stem, tag_drop = tag_drop, selected_tokens = selected_tokens)
    processed_docs = data.map(tokenizer)
    dictionary = gensim.corpora.Dictionary(processed_docs)
    # Drop tokens that appear in fewer than 3 documents or in more than half of
    # them, then keep at most the 100000 most frequent of the rest.
    dictionary.filter_extremes(no_below=3, no_above=0.5, keep_n=100000)
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    lda_model = gensim.models.LdaMulticore(corpus = bow_corpus,
                                           num_topics = num_topics,
                                           id2word = dictionary,
                                           passes = passes,
                                           workers = workers,
                                           random_state = random_state)
    return lda_model

In [47]:
# Five-topic models on all tokens, on the top-30 and on the top-50 tf-idf tokens.
lda_allbow = fit_lda(data['doc'], [1,2], False, ['J'], '', 5)
lda_top30bow = fit_lda(data['doc'], [1,2], False, ['J'], 30, 5)
# The top-50 model previously passed 30 here, which made it a duplicate of lda_top30bow.
lda_top50bow = fit_lda(data['doc'], [1,2], False, ['J'], 50, 5)

In [48]:
# Print each topic of the top-30-token model with its index and its weighted-term string.
for idx, topic in lda_top30bow.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.290*"analytics" + 0.147*"cloud" + 0.112*"blockchain" + 0.068*"business" + 0.063*"analytics 101" + 0.051*"computing" + 0.051*"ai" + 0.050*"microservices" + 0.043*"functions" + 0.018*"statistics"
Topic: 1 
Words: 0.208*"python" + 0.195*"hadoop" + 0.145*"statistics" + 0.088*"bootcamp" + 0.088*"science bootcamp" + 0.087*"containers" + 0.025*"kubernetes" + 0.025*"projects" + 0.018*"welcome" + 0.018*"computing"
Topic: 2 
Words: 0.219*"kubernetes" + 0.170*"cloud" + 0.108*"swift" + 0.101*"web" + 0.073*"microservices" + 0.073*"ai" + 0.055*"projects" + 0.041*"server" + 0.037*"containers" + 0.021*"apache"
Topic: 3 
Words: 0.290*"sql" + 0.144*"tools" + 0.098*"scala" + 0.096*"end" + 0.080*"computing" + 0.042*"hadoop" + 0.040*"welcome" + 0.022*"server" + 0.018*"r" + 0.015*"business"
Topic: 4 
Words: 0.232*"spark" + 0.214*"r" + 0.151*"apache" + 0.087*"search" + 0.060*"analytics" + 0.039*"pipelines" + 0.035*"hadoop" + 0.026*"scala" + 0.023*"tools" + 0.019*"statistics"


In [102]:
# Tabulate the top terms of every topic of lda_top30bow: one row per topic, one
# column per term (var1, var2, ...), with a 1-based 'Topics' column in front.
topics = [topic for idx, topic in lda_top30bow.print_topics(-1)]
# Each topic string looks like '0.290*"analytics" + 0.147*"cloud" + ...';
# split it on '+' and keep only the quoted term of each part.
quoted_term = re.compile('"(.*)"')
term_rows = [
    [quoted_term.search(part).group(1) for part in topic.split("+")]
    for topic in topics
]
# Derive the column names from the number of terms instead of hard-coding ten.
n_terms = max((len(row) for row in term_rows), default = 0)
toptopic = pd.DataFrame(term_rows, columns = ['var{}'.format(i) for i in range(1, n_terms + 1)])
toptopic.insert(0, 'Topics', toptopic.index+1)
toptopic


Unnamed: 0,Topics,var1,var2,var3,var4,var5,var6,var7,var8,var9,var10
0,1,analytics,cloud,blockchain,business,analytics 101,computing,ai,microservices,functions,statistics
1,2,python,hadoop,statistics,bootcamp,science bootcamp,containers,kubernetes,projects,welcome,computing
2,3,kubernetes,cloud,swift,web,microservices,ai,projects,server,containers,apache
3,4,sql,tools,scala,end,computing,hadoop,welcome,server,r,business
4,5,spark,r,apache,search,analytics,pipelines,hadoop,scala,tools,statistics


In [22]:
# Print each topic of the top-50-token model with its index and its weighted-term string.
# NOTE(review): the saved output below lists three topics, while lda_top50bow is
# fitted with num_topics=5 in the fitting cell above; the output is presumably
# from an earlier run (execution count 22) — re-run to refresh.
for idx, topic in lda_top50bow.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.165*"analytics" + 0.163*"r" + 0.132*"cloud" + 0.063*"computing" + 0.058*"blockchain" + 0.056*"kubernetes" + 0.054*"ai" + 0.050*"microservices" + 0.048*"containers" + 0.046*"swift"
Topic: 1 
Words: 0.244*"science" + 0.150*"spark" + 0.111*"python" + 0.086*"statistics" + 0.080*"tools" + 0.060*"scala" + 0.054*"science bootcamp" + 0.053*"bootcamp" + 0.033*"analytics" + 0.023*"apache"
Topic: 2 
Words: 0.168*"hadoop" + 0.148*"apache" + 0.137*"sql" + 0.088*"search" + 0.057*"spark" + 0.056*"kubernetes" + 0.052*"web" + 0.045*"projects" + 0.037*"server" + 0.036*"end"


In [23]:
# Print each topic of the all-token model with its index and its weighted-term string.
# NOTE(review): the saved output below lists three topics, while lda_allbow is
# fitted with num_topics=5 in the fitting cell above; the output is presumably
# from an earlier run (execution count 23) — re-run to refresh.
for idx, topic in lda_allbow.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.019*"learn" + 0.016*"presents" + 0.016*"answers" + 0.016*"approach" + 0.016*"presents approach" + 0.016*"course presents" + 0.016*"answers questions" + 0.016*"questions" + 0.015*"use" + 0.015*"matters"
Topic: 1 
Words: 0.033*"learning" + 0.030*"analytics" + 0.026*"learn" + 0.024*"machine" + 0.022*"machine learning" + 0.021*"using" + 0.016*"watson" + 0.012*"text" + 0.012*"apache" + 0.011*"analysis"
Topic: 2 
Words: 0.025*"used" + 0.022*"description" + 0.020*"science" + 0.018*"data science" + 0.015*"learning" + 0.013*"taken" + 0.012*"spark" + 0.012*"provide" + 0.011*"one" + 0.011*"course image"


In [24]:
# Switch pyLDAvis to notebook display mode and visualise the all-token model.
pyLDAvis.enable_notebook()
# NOTE(review): lda_allbow was trained inside fit_lda on a corpus and dictionary
# local to that call; the bow_corpus and dictionary passed here are the
# notebook-level ones from the first LDA cell. Both are built with the same
# tokeniser settings and the same filter_extremes call, so they presumably
# line up — confirm, or have fit_lda return its corpus and dictionary.
vis = gensimvis.prepare(lda_allbow, bow_corpus, dictionary)
vis