In [75]:
import os
import pandas as pd
import re
import nltk
from string import punctuation

In [71]:
import gensim
from gensim.utils import simple_preprocess

In [72]:
from nltk.stem import WordNetLemmatizer, SnowballStemmer

In [73]:
import spacy

In [5]:
nlp = spacy.load('en_core_web_md')

In [6]:
import markdown
from bs4 import BeautifulSoup

In [8]:
description_file ='./training_corpus/extended_description.csv'

In [9]:
description_df = pd.read_csv(description_file)

In [13]:
description_df.columns

Index(['Unnamed: 0', 'URL', 'contributor', 'excerpt'], dtype='object')

In [16]:
description_df.drop(columns='Unnamed: 0',inplace=True,)

In [17]:
description_df.head()

Unnamed: 0,URL,contributor,excerpt
0,https://github.com/GoogleChrome/puppeteer,Allen Mao,Puppeteer is a Node library which provides a h...
1,https://github.com/JimmySuen/integral-human-pose,Allen Mao,The major contributors of this repository incl...
2,https://github.com/JimmySuen/integral-human-pose,Allen Mao,Integral Regression is initially described in ...
3,https://github.com/JimmySuen/integral-human-pose,Allen Mao,We build a 3D pose estimation system based mai...
4,https://github.com/JimmySuen/integral-human-pose,Allen Mao,The Integral Regression is also known as soft-...


In [34]:
description_df.head(20)

Unnamed: 0,URL,excerpt
0,https://github.com/GoogleChrome/puppeteer,Puppeteer is a Node library which provides a h...
1,https://github.com/JimmySuen/integral-human-pose,The major contributors of this repository incl...
2,https://github.com/JimmySuen/integral-human-pose,Integral Regression is initially described in ...
3,https://github.com/JimmySuen/integral-human-pose,We build a 3D pose estimation system based mai...
4,https://github.com/JimmySuen/integral-human-pose,The Integral Regression is also known as soft-...
5,https://github.com/JimmySuen/integral-human-pose,This is an official implementation for Integra...
6,https://github.com/JimmySuen/integral-human-pose,The original implementation is based on our in...
7,https://github.com/JuliaGeo/LibGEOS.jl,LibGEOS is a LGPL-licensed package for manipul...
8,https://github.com/JuliaGeo/LibGEOS.jl,"Among other things, it allows you to parse Wel..."
9,https://github.com/LMescheder/GAN_stability,This repository contains the experiments in th...


In [53]:
df =description_df.groupby('URL').agg(''.join).reset_index()

In [151]:
df.hea

Unnamed: 0,URL,excerpt
0,https://github.com/CMU-Perceptual-Computing-La...,openpose represents the first real-time multi-...
1,https://github.com/Codecademy/EventHub,eventhub enables companies to do cross device ...
2,https://github.com/GoogleChrome/puppeteer,puppeteer is a node library which provides a h...
3,https://github.com/HumbleSoftware/envisionjs,fast interactive html charts
4,https://github.com/JaidedAI/EasyOCR,ready-to-use ocr with + languages supported in...


#  preprocessing 

In [150]:
len(df)

105

In [59]:
df['excerpt'] = df['excerpt'].map(lambda x: re.sub(r'[\.\?\!,=]','',x))
# remove digits                                      
regex = re.compile(r'\d*')
df['excerpt'] = df['excerpt'].map(lambda x: re.sub(regex,'',x))
# convert to lower case
df['excerpt'] = df['excerpt'].map(lambda x: x.lower())                                

In [232]:
df.head()

Unnamed: 0,URL,excerpt
0,https://github.com/CMU-Perceptual-Computing-La...,openpose represents the first real-time multi-...
1,https://github.com/Codecademy/EventHub,eventhub enables companies to do cross device ...
2,https://github.com/GoogleChrome/puppeteer,puppeteer is a node library which provides a h...
3,https://github.com/HumbleSoftware/envisionjs,fast interactive html charts
4,https://github.com/JaidedAI/EasyOCR,ready-to-use ocr with + languages supported in...


In [623]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(WordNetLemmatizer().lemmatize(plural)) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,dy
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [67]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [198]:
stop_words_new =['code','file','text','use','http','https','install','using','import','python','instal','example','documentation','rain']

In [199]:
stop_words.extend(stop_words_new)

In [233]:
# from nltk.stem import WordNetLemmatizer
# wordnet_lemmatizer = WordNetLemmatizer()
   
    
def preprocess(text):
    result = []
    for token in simple_preprocess(text):
        if token not in stop_words and len(token) > 3:
            
            
            result.append(token)
                
    return result
            

In [234]:
processed_docs =df['excerpt'].map(preprocess)

In [235]:
processed_docs.head()

0    [openpose, represents, first, real, time, mult...
1    [eventhub, enables, companies, cross, device, ...
2    [puppeteer, node, library, provides, high, lev...
3                    [fast, interactive, html, charts]
4    [ready, languages, supported, including, chine...
Name: excerpt, dtype: object

In [236]:
import gensim.corpora as corpora

In [296]:
 

# Create Dictionary
dictionary = corpora.Dictionary(processed_docs)
#dictionary.filter_extremes(no_below=2, no_above=0.8)

# Create Corpus
texts = processed_docs

# 
bow_corpus = [dictionary.doc2bow(text) for text in texts]



In [278]:
from gensim import corpora,models

In [297]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [298]:
len(corpus_tfidf)

105

In [307]:
lda_model_1 = gensim.models.ldamodel.LdaModel(corpus=bow_corpus,
                                           id2word=dictionary,
                                           num_topics=5, 
                                           random_state=100,
                                          
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True,minimum_probability=0.001)


In [312]:
lda_model_2 = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf,
                                           id2word=dictionary,
                                           num_topics=4, 
                                           random_state=100,
                                          
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True,minimum_probability=0.001)


In [308]:
lda_model_1.print_topics(num_words=30)

[(0,
  '0.014*"data" + 0.007*"model" + 0.007*"geological" + 0.006*"files" + 0.005*"based" + 0.005*"support" + 0.005*"also" + 0.004*"analysis" + 0.004*"models" + 0.004*"gdal" + 0.004*"real" + 0.004*"docker" + 0.004*"gempy" + 0.004*"software" + 0.004*"read" + 0.004*"allows" + 0.004*"image" + 0.004*"visualization" + 0.004*"surface" + 0.004*"results" + 0.004*"time" + 0.003*"average" + 0.003*"running" + 0.003*"library" + 0.003*"information" + 0.003*"binary" + 0.003*"used" + 0.003*"generate" + 0.003*"versions" + 0.003*"queries"'),
 (1,
  '0.017*"data" + 0.008*"network" + 0.006*"based" + 0.006*"features" + 0.006*"spatial" + 0.006*"library" + 0.005*"also" + 0.004*"information" + 0.004*"tile" + 0.004*"method" + 0.004*"segyio" + 0.004*"react" + 0.004*"state" + 0.004*"different" + 0.004*"propose" + 0.004*"supports" + 0.003*"level" + 0.003*"make" + 0.003*"density" + 0.003*"layers" + 0.003*"proposed" + 0.003*"convolutional" + 0.003*"files" + 0.003*"deep" + 0.003*"well" + 0.003*"image" + 0.003*"neur

In [302]:
lda_model_2.print_topics(num_words=30)

[(0,
  '0.002*"geos" + 0.002*"languages" + 0.002*"gitbucket" + 0.001*"data" + 0.001*"gdal" + 0.001*"files" + 0.001*"segyio" + 0.001*"keypoints" + 0.001*"manipulation" + 0.001*"classical" + 0.001*"modern" + 0.001*"biomedical" + 0.001*"vega" + 0.001*"language" + 0.001*"root" + 0.001*"licensed" + 0.001*"engine" + 0.001*"postgis" + 0.001*"ported" + 0.001*"korean" + 0.001*"chinese" + 0.001*"thai" + 0.001*"japanese" + 0.001*"branch" + 0.001*"accurate" + 0.001*"fully" + 0.001*"planar" + 0.001*"geometric" + 0.001*"welcome" + 0.001*"practices"'),
 (1,
  '0.002*"react" + 0.002*"data" + 0.002*"tile" + 0.002*"spatial" + 0.001*"framework" + 0.001*"charts" + 0.001*"parameter" + 0.001*"also" + 0.001*"visualization" + 0.001*"hyvr" + 0.001*"reduxperformant" + 0.001*"interactive" + 0.001*"ground" + 0.001*"user" + 0.001*"level" + 0.001*"html" + 0.001*"designed" + 0.001*"geological" + 0.001*"fast" + 0.001*"easier" + 0.001*"hardware" + 0.001*"radar" + 0.001*"bindings" + 0.001*"pressure" + 0.001*"human" + 0

In [303]:
print('nPerplexity Score: ', lda_model_1.log_perplexity(bow_corpus))
coherence_model_lda = gensim.models.CoherenceModel(model=lda_model_1, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('nCoherence Score: ', coherence_lda)



nPerplexity Score:  -7.6637430817938705
nCoherence Score:  0.38003359517988994


In [313]:
print('nPerplexity Score: ', lda_model_2.log_perplexity(corpus_tfidf))
coherence_model_lda = gensim.models.CoherenceModel(model=lda_model_2, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('nCoherence Score: ', coherence_lda)



nPerplexity Score:  -9.79229996545798
nCoherence Score:  0.5674436619043413


In [309]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()

LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model_1, bow_corpus, dictionary,sort_topics=False)
LDAvis_prepared

In [314]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()

LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model_2, corpus_tfidf, dictionary,sort_topics=False)
LDAvis_prepared

In [134]:
df['excerpt'][2]

'puppeteer is a node library which provides a high-level api to control chrome or chromium over the devtools protocol puppeteer runs headless by default but can be configured to run full (non-headless) chrome or chromium'

In [766]:
for a,b in bow_corpus[0]:
    print(dictionary[a])

advantage
afterward
alabama
also
alternative
anaconda
archive
automatically
case
change
changes
click
clone
comands
command
commands
computer
condaio
could
desktop
developed
directly
directory
download
downloaded
earlier
easily
either
email
enabled
enter
example
extract
field
files
finished
first
fixed
folder
follow
following
forget
free
generated
getting
github
githubcom
gprpy
gprpygit
gprpysoftware
ground
hear
help
initialize
installation
installed
instead
instructions
interface
issue
january
june
latter
like
linux
marcus
marks
master
masterzip
minicondahtml
mode
multiples
myscriptnamepy
name
news
nsgeophysics
onto
open
origin
pacheco
penetrating
period
picking
please
pointing
points
post
press
processing
profile
programs
prompt
pull
quotation
radar
recent
regular
replace
running
save
scmcom
script
scripts
search
select
send
several
shown
simplemost
simply
small
software
somewhere
source
start
switch
system
take
terminal
thanks
time
trouble
troubles
tweets
twitter
type
uninstall
unin

In [292]:
text = 'this is a visualization software'

In [293]:
preprocess(new_doc)

['visualization', 'software']

In [294]:

bow_vector = dictionary.doc2bow(preprocess(text))
for index, score in sorted(lda_model_1[bow_vector][0], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, index))

Score: 0.9562720656394958	 Topic: 2
Score: 0.022907469421625137	 Topic: 0
Score: 0.02082052081823349	 Topic: 1


In [117]:
lda_model_1[bow_vector][0]

[(0, 0.020285627), (1, 0.025061116), (2, 0.9546533)]

In [127]:
bow_corpus[43]

[(10, 1),
 (79, 1),
 (97, 1),
 (124, 1),
 (164, 1),
 (184, 1),
 (188, 1),
 (221, 2),
 (243, 1),
 (247, 1),
 (356, 1),
 (383, 1),
 (421, 2),
 (444, 3),
 (447, 1),
 (461, 1),
 (475, 2),
 (477, 1),
 (494, 1),
 (507, 1),
 (565, 1),
 (637, 1),
 (650, 1),
 (887, 1),
 (1292, 3),
 (1298, 1),
 (1299, 1),
 (1300, 1),
 (1301, 1),
 (1302, 2),
 (1303, 1),
 (1304, 1),
 (1305, 1),
 (1306, 1),
 (1307, 1),
 (1308, 1),
 (1309, 1),
 (1310, 1),
 (1311, 1),
 (1312, 1),
 (1313, 1),
 (1314, 1),
 (1315, 1),
 (1316, 1),
 (1317, 1),
 (1318, 1),
 (1319, 1),
 (1320, 1),
 (1321, 3),
 (1322, 1)]