In [1]:
import os
import pandas as pd
import re
import nltk
from string import punctuation

In [2]:
import gensim
from gensim.utils import simple_preprocess

In [3]:
from nltk.stem import WordNetLemmatizer, SnowballStemmer

In [4]:
import spacy

In [5]:
nlp = spacy.load('en_core_web_md')

In [6]:
import markdown
from bs4 import BeautifulSoup

In [9]:
description_file ='../training_corpus/extended_description.csv'

In [10]:
description_df = pd.read_csv(description_file)

In [11]:
description_df.columns

Index(['Unnamed: 0', 'URL', 'contributor', 'excerpt'], dtype='object')

In [12]:
description_df.drop(columns='Unnamed: 0',inplace=True,)

In [13]:
description_df.head()

Unnamed: 0,URL,contributor,excerpt
0,https://github.com/GoogleChrome/puppeteer,Allen Mao,Puppeteer is a Node library which provides a h...
1,https://github.com/JimmySuen/integral-human-pose,Allen Mao,The major contributors of this repository incl...
2,https://github.com/JimmySuen/integral-human-pose,Allen Mao,Integral Regression is initially described in ...
3,https://github.com/JimmySuen/integral-human-pose,Allen Mao,We build a 3D pose estimation system based mai...
4,https://github.com/JimmySuen/integral-human-pose,Allen Mao,The Integral Regression is also known as soft-...


In [14]:
description_df.head(20)

Unnamed: 0,URL,contributor,excerpt
0,https://github.com/GoogleChrome/puppeteer,Allen Mao,Puppeteer is a Node library which provides a h...
1,https://github.com/JimmySuen/integral-human-pose,Allen Mao,The major contributors of this repository incl...
2,https://github.com/JimmySuen/integral-human-pose,Allen Mao,Integral Regression is initially described in ...
3,https://github.com/JimmySuen/integral-human-pose,Allen Mao,We build a 3D pose estimation system based mai...
4,https://github.com/JimmySuen/integral-human-pose,Allen Mao,The Integral Regression is also known as soft-...
5,https://github.com/JimmySuen/integral-human-pose,Allen Mao,This is an official implementation for Integra...
6,https://github.com/JimmySuen/integral-human-pose,Allen Mao,The original implementation is based on our in...
7,https://github.com/JuliaGeo/LibGEOS.jl,Allen Mao,LibGEOS is a LGPL-licensed package for manipul...
8,https://github.com/JuliaGeo/LibGEOS.jl,Allen Mao,"Among other things, it allows you to parse Wel..."
9,https://github.com/LMescheder/GAN_stability,Allen Mao,This repository contains the experiments in th...


In [15]:
df =description_df.groupby('URL').agg(''.join).reset_index()

In [16]:
df.head()

Unnamed: 0,URL,contributor,excerpt
0,https://github.com/CMU-Perceptual-Computing-La...,Ling Li,OpenPose represents the first real-time multi-...
1,https://github.com/Codecademy/EventHub,Yi XieYi XieYi XieYi Xie,EventHub enables companies to do cross device ...
2,https://github.com/GoogleChrome/puppeteer,Allen Mao,Puppeteer is a Node library which provides a h...
3,https://github.com/HumbleSoftware/envisionjs,Yi Xie,Fast interactive HTML5 charts.
4,https://github.com/JaidedAI/EasyOCR,Ling Li,Ready-to-use OCR with 70+ languages supported ...


#  preprocessing 

In [17]:
len(df)

105

In [18]:
df['excerpt'] = df['excerpt'].map(lambda x: re.sub(r'[\.\?\!,=]','',x))
# remove digits                                      
regex = re.compile(r'\d*')
df['excerpt'] = df['excerpt'].map(lambda x: re.sub(regex,'',x))
# convert to lower case
df['excerpt'] = df['excerpt'].map(lambda x: x.lower())                                

In [19]:
df.head()

Unnamed: 0,URL,contributor,excerpt
0,https://github.com/CMU-Perceptual-Computing-La...,Ling Li,openpose represents the first real-time multi-...
1,https://github.com/Codecademy/EventHub,Yi XieYi XieYi XieYi Xie,eventhub enables companies to do cross device ...
2,https://github.com/GoogleChrome/puppeteer,Allen Mao,puppeteer is a node library which provides a h...
3,https://github.com/HumbleSoftware/envisionjs,Yi Xie,fast interactive html charts
4,https://github.com/JaidedAI/EasyOCR,Ling Li,ready-to-use ocr with + languages supported in...


In [20]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(WordNetLemmatizer().lemmatize(plural)) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,dy
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [21]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [22]:
stop_words_new =['code','file','text','use','http','https','install','using','import','python','instal','example','documentation','rain'
                'contains','contain','data','chinese','korean','thai','japanese']

In [23]:
stop_words.extend(stop_words_new)

In [24]:
# from nltk.stem import WordNetLemmatizer
# wordnet_lemmatizer = WordNetLemmatizer()
   
    
def preprocess(text):
    result = []
    for token in simple_preprocess(text):
        if token not in stop_words and len(token) > 3:
            
            
            result.append(token)
                
    return result
            

In [25]:
processed_docs =df['excerpt'].map(preprocess)

In [26]:
processed_docs.head()

0    [openpose, represents, first, real, time, mult...
1    [eventhub, enables, companies, cross, device, ...
2    [puppeteer, node, library, provides, high, lev...
3                    [fast, interactive, html, charts]
4             [ready, languages, supported, including]
Name: excerpt, dtype: object

In [27]:
import gensim.corpora as corpora

In [28]:
 

# Create Dictionary
dictionary = corpora.Dictionary(processed_docs)
#dictionary.filter_extremes(no_below=2, no_above=0.8)

# Create Corpus
texts = processed_docs

# 
bow_corpus = [dictionary.doc2bow(text) for text in texts]



In [29]:
from gensim import corpora,models

In [30]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [31]:
len(corpus_tfidf)

105

In [32]:
lda_model_1 = gensim.models.ldamodel.LdaModel(corpus=bow_corpus,
                                           id2word=dictionary,
                                           num_topics=6, 
                                           random_state=100,
                                          
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True,minimum_probability=0.001)


In [33]:
lda_model_2 = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf,
                                           id2word=dictionary,
                                           num_topics=6, 
                                           random_state=100,
                                          
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True,minimum_probability=0.001)


In [37]:
lda_model_2.print_topics(num_words=30)

[(0,
  '0.002*"average" + 0.002*"resolution" + 0.001*"presented" + 0.001*"residual" + 0.001*"objects" + 0.001*"coordinate" + 0.001*"shapely" + 0.001*"latent" + 0.001*"training" + 0.001*"paper" + 0.001*"implementation" + 0.001*"repository" + 0.001*"geos" + 0.001*"bolt" + 0.001*"neoj" + 0.001*"tinkerpop" + 0.001*"project" + 0.001*"magenta" + 0.001*"fully" + 0.001*"models" + 0.001*"tested" + 0.001*"simplifies" + 0.001*"algorithms" + 0.001*"nextflow" + 0.001*"fastaifastai" + 0.001*"practices" + 0.001*"welcome" + 0.001*"pose" + 0.001*"eccv" + 0.001*"packages"'),
 (1,
  '0.003*"scikit" + 0.002*"image" + 0.002*"ready" + 0.002*"html" + 0.002*"interactive" + 0.002*"charts" + 0.002*"languages" + 0.002*"supported" + 0.002*"matrix" + 0.001*"network" + 0.001*"numpy" + 0.001*"tensor" + 0.001*"learning" + 0.001*"domain" + 0.001*"processing" + 0.001*"robot" + 0.001*"cable" + 0.001*"storage" + 0.001*"vega" + 0.001*"blog" + 0.001*"help" + 0.001*"wrapper" + 0.001*"fast" + 0.001*"tensorflow" + 0.001*"debu

In [53]:
lda_model_2.save('lda.model')

In [40]:
lda_model = gensim.models.ldamodel.LdaModel.load('lda.model')

In [48]:
dict(lda_model.id2word)

{0: 'body',
 1: 'detect',
 2: 'facial',
 3: 'first',
 4: 'foot',
 5: 'hand',
 6: 'human',
 7: 'images',
 8: 'jointly',
 9: 'keypoints',
 10: 'multi',
 11: 'openpose',
 12: 'person',
 13: 'real',
 14: 'represents',
 15: 'single',
 16: 'system',
 17: 'time',
 18: 'total',
 19: 'answer',
 20: 'associated',
 21: 'built',
 22: 'business',
 23: 'cohorted',
 24: 'common',
 25: 'companies',
 26: 'conversion',
 27: 'cross',
 28: 'dashboard',
 29: 'device',
 30: 'enables',
 31: 'event',
 32: 'eventhub',
 33: 'events',
 34: 'following',
 35: 'funnel',
 36: 'higher',
 37: 'joined',
 38: 'questionswhat',
 39: 'rate',
 40: 'ratewhat',
 41: 'retentionwhich',
 42: 'test',
 43: 'tracking',
 44: 'user',
 45: 'variant',
 46: 'visualized',
 47: 'chrome',
 48: 'chromium',
 49: 'configured',
 50: 'control',
 51: 'default',
 52: 'devtools',
 53: 'full',
 54: 'headless',
 55: 'high',
 56: 'level',
 57: 'library',
 58: 'node',
 59: 'protocol',
 60: 'provides',
 61: 'puppeteer',
 62: 'runs',
 63: 'charts',
 64:

In [38]:
print('nPerplexity Score: ', lda_model_1.log_perplexity(bow_corpus))
coherence_model_lda = gensim.models.CoherenceModel(model=lda_model_1, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('nCoherence Score: ', coherence_lda)



nPerplexity Score:  -7.705747381699175
nCoherence Score:  0.3929166470668761


In [32]:
print('nPerplexity Score: ', lda_model_2.log_perplexity(corpus_tfidf))
coherence_model_lda = gensim.models.CoherenceModel(model=lda_model_2, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('nCoherence Score: ', coherence_lda)



nPerplexity Score:  -10.687409430931293
nCoherence Score:  0.57964140653107


In [33]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()

LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model_1, bow_corpus, dictionary,sort_topics=False)
LDAvis_prepared

NameError: name 'lda_model_1' is not defined

In [435]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()

LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model_2, corpus_tfidf, dictionary,sort_topics=False)
LDAvis_prepared

In [437]:
for a,b in bow_corpus[0]:
    print(dictionary[a])

body
detect
facial
first
foot
hand
human
images
jointly
keypoints
multi
openpose
person
real
represents
single
system
time
total


In [451]:
text = 'this is a visualization software used to create charts'

In [452]:
preprocess(text)

['visualization', 'software', 'used', 'create', 'charts']

In [454]:

bow_vector = dictionary.doc2bow(preprocess(text))
for index, score in sorted(lda_model_2[bow_vector][0], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {},{}".format(score,index+1, lda_model_2.print_topic(index,30)))

Score: 0.9469016194343567	 Topic: 6,0.002*"network" + 0.002*"pytorch" + 0.002*"tile" + 0.002*"html" + 0.002*"interactive" + 0.002*"charts" + 0.002*"pose" + 0.002*"scikit" + 0.002*"tensorflow" + 0.001*"eccv" + 0.001*"method" + 0.001*"ruby" + 0.001*"wrapper" + 0.001*"tetgen" + 0.001*"parameter" + 0.001*"neural" + 0.001*"tensor" + 0.001*"wells" + 0.001*"lithology" + 0.001*"give" + 0.001*"pressure" + 0.001*"vega" + 0.001*"module" + 0.001*"regression" + 0.001*"numpy" + 0.001*"integral" + 0.001*"presented" + 0.001*"renderer" + 0.001*"fast" + 0.001*"able"
Score: 0.0122231999412179	 Topic: 2,0.002*"learning" + 0.002*"detection" + 0.002*"languages" + 0.002*"object" + 0.002*"jupyter" + 0.001*"framework" + 0.001*"powerful" + 0.001*"ready" + 0.001*"including" + 0.001*"segyio" + 0.001*"library" + 0.001*"research" + 0.001*"intuitive" + 0.001*"feature" + 0.001*"reinforcement" + 0.001*"accurate" + 0.001*"modern" + 0.001*"features" + 0.001*"biomedical" + 0.001*"mapshaper" + 0.001*"neural" + 0.001*"sonn

In [351]:
lda_model_1[bow_vector][0]

[(0, 0.02954516),
 (1, 0.027289057),
 (2, 0.8594328),
 (3, 0.023520524),
 (4, 0.024428165),
 (5, 0.0357843)]

In [334]:
for a,b in bow_vector:
    print(dictionary[a])

software
learning
machine


In [456]:
lda_model_1.get_document_topics(corpus_tfidif[0])

[(0, 0.0087373005),
 (1, 0.008933453),
 (2, 0.0071617523),
 (3, 0.9599697),
 (4, 0.0068779592),
 (5, 0.008319867)]

In [459]:
for a,b in corpus_tfidif[0]:
    print(dictionary[a])

body
detect
facial
first
foot
hand
human
images
jointly
keypoints
multi
openpose
person
real
represents
single
system
time
total


In [460]:
df['excerpt'][0]

'openpose represents the first real-time multi-person system to jointly detect human body hand facial and foot keypoints (in total  keypoints) on single images'

In [34]:
for t in topics:
    print(t)

NameError: name 'topics' is not defined

In [35]:
topics =lda_model_2.print_topics(-1,30)

In [38]:
for t in topics:
    print(t[0],t[1])

0 0.002*"average" + 0.002*"resolution" + 0.001*"presented" + 0.001*"residual" + 0.001*"objects" + 0.001*"coordinate" + 0.001*"shapely" + 0.001*"latent" + 0.001*"training" + 0.001*"paper" + 0.001*"implementation" + 0.001*"repository" + 0.001*"geos" + 0.001*"bolt" + 0.001*"neoj" + 0.001*"tinkerpop" + 0.001*"project" + 0.001*"magenta" + 0.001*"fully" + 0.001*"models" + 0.001*"tested" + 0.001*"simplifies" + 0.001*"algorithms" + 0.001*"nextflow" + 0.001*"fastaifastai" + 0.001*"practices" + 0.001*"welcome" + 0.001*"pose" + 0.001*"eccv" + 0.001*"packages"
1 0.003*"scikit" + 0.002*"image" + 0.002*"ready" + 0.002*"html" + 0.002*"interactive" + 0.002*"charts" + 0.002*"languages" + 0.002*"supported" + 0.002*"matrix" + 0.001*"network" + 0.001*"numpy" + 0.001*"tensor" + 0.001*"learning" + 0.001*"domain" + 0.001*"processing" + 0.001*"robot" + 0.001*"cable" + 0.001*"storage" + 0.001*"vega" + 0.001*"blog" + 0.001*"help" + 0.001*"wrapper" + 0.001*"fast" + 0.001*"tensorflow" + 0.001*"debugging" + 0.001*