In [1]:
import scholarly

In [2]:
results = scholarly.search_pubs_custom_url('/scholar?as_ylo=2019&q="supply+chain"&hl=en&as_sdt=1,1&as_vis=1')

In [3]:
for result in results:
    print(result)
    break

{'_filled': False,
 'bib': {'abstract': 'We opened our 2010 paper in the Journal of Business '
                     'Logistics with a 6th century quote by Heraclitus–“The '
                     'only constant is change.” This immutable law certainly '
                     "holds in today's volatile business world, especially for "
                     'supply chain management, and has been the driving …',
         'author': 'TJ Pettit and KL Croxton and J Fiksel',
         'eprint': 'https://onlinelibrary.wiley.com/doi/pdf/10.1111/jbl.12202',
         'title': 'The Evolution of Resilience in Supply Chain Management: A '
                  'Retrospective on Ensuring Supply Chain Resilience',
         'url': 'https://onlinelibrary.wiley.com/doi/abs/10.1111/jbl.12202'},
 'citedby': 1,
 'id_scholarcitedby': '16408947525918857329',
 'source': 'scholar',
 'url_scholarbib': 'https://scholar.googleusercontent.com/scholar.bib?q=info:cRhrJNZKuOMJ:scholar.google.com/&output=citation&scisdr=CgUD3q0

In [4]:
num_scraped = 0
title_list = []

for result in results:
    num_scraped += 1
    
    title_list.append(result.bib['title'])
    
    if num_scraped == 5000:
        break

In [5]:
title_list[2]

'An empirical analysis of supply chain finance adoption'

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from gensim.models.phrases import Phrases
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords

In [7]:
example = title_list[0]

In [8]:
example

'A behavioral investigation of supply chain contracts for a newsvendor problem in a developing economy'

In [9]:
simple_preprocess(example)

['behavioral',
 'investigation',
 'of',
 'supply',
 'chain',
 'contracts',
 'for',
 'newsvendor',
 'problem',
 'in',
 'developing',
 'economy']

In [10]:
corpus = pd.DataFrame(columns=['raw_text', 'processed_text'])

In [11]:
for i, title in enumerate(title_list):
    
    corpus.loc[i, 'raw_text'] = title
    
    no_stop_title = remove_stopwords(title)
    
    processed = simple_preprocess(no_stop_title)
    
    corpus.loc[i, 'processed_text'] = processed

In [12]:
corpus.loc[0, 'processed_text']

['behavioral',
 'investigation',
 'supply',
 'chain',
 'contracts',
 'newsvendor',
 'problem',
 'developing',
 'economy']

In [13]:
bigrammer = Phrases(corpus['processed_text'], threshold=5)

In [14]:
bigrammer[corpus.loc[7, 'processed_text']]

['blockchain_technology',
 'relationships',
 'sustainable',
 'supply_chain',
 'management']

In [15]:
corpus['bigram_proc_text'] = [[0]]*len(corpus)

for i in range(len(corpus)):
    
    corpus.loc[i, 'bigram_proc_text'] = bigrammer[corpus.loc[i, 'processed_text']]

In [16]:
teststr = ' hi my name is jake'

In [17]:
teststr[1:]

'hi my name is jake'

In [18]:
new_corpus = []

for doc in corpus['bigram_proc_text']:
    
    build_str = ''
    
    for xstr in doc:
        
        build_str = build_str + ' ' + xstr
        
    build_str = build_str[1:]
    
    new_corpus.append(build_str)

In [19]:
vectorizer = TfidfVectorizer(lowercase=False)

In [20]:
vectorized = vectorizer.fit_transform(new_corpus)

In [21]:
from sklearn.cluster import KMeans

In [22]:
vectorized[0]

<1x2036 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [69]:
kmeans = KMeans(n_clusters=50).fit(vectorized)

In [70]:
kmeans.cluster_centers_

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [71]:
from sklearn.metrics import pairwise_distances_argmin_min

In [72]:
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, vectorized)

In [73]:
closest

array([581, 607, 627, 466, 235, 123, 587, 311, 541, 831, 634, 746, 122,
       395, 263, 309, 740, 147, 719, 138, 631, 683, 407, 832, 432, 149,
       480, 530, 742, 756, 625, 226, 167,  76, 682, 844, 916, 290, 103,
       228, 729,  23, 305, 315, 860, 566, 361,  39, 154, 246])

In [74]:
corpus.loc[closest[0], 'raw_text']

'Identifying trade-offs between sustainability dimensions in the supply chain of biodiesel in Colombia'

In [75]:
corpus.loc[closest[1], 'raw_text']

"Supply chain coordination to optimize manufacturer's capacity procurement decisions through a new commitment-based model with penalty and revenue …"

In [76]:
corpus.loc[closest[2], 'raw_text']

'Architectures for Green-Field Supply Chain Integration-Supply Chain Integration Design'

In [77]:
corpus.loc[closest[3], 'raw_text']

'Optimal planning of municipal solid waste management systems in an integrated supply chain network'

In [78]:
corpus['kmeans_pred'] = kmeans.predict(vectorized)

In [79]:
corpus['kmeans_pred'].value_counts()

45    75
25    46
6     43
34    31
22    31
24    28
29    27
35    27
7     26
19    26
46    26
20    25
16    24
23    23
4     22
3     22
14    22
2     21
49    21
15    20
37    20
40    20
32    19
48    18
8     18
31    17
11    17
21    17
13    17
44    16
27    15
42    15
17    15
30    14
28    14
26    13
43    13
1     13
39    12
18    12
41    11
5     11
47    11
38    10
9      9
33     9
12     8
10     8
36     7
0      3
Name: kmeans_pred, dtype: int64

In [83]:
corpus[corpus['kmeans_pred']==6].head(10)

Unnamed: 0,raw_text,processed_text,bigram_proc_text,kmeans_pred
5,Supply chain management in industrial marketin...,"[supply, chain, management, industrial, market...","[supply_chain, management, industrial, marketi...",6
10,Resilience of medium-sized firms to supply cha...,"[resilience, medium, sized, firms, supply, cha...","[resilience, medium, sized, firms, supply_chai...",6
28,Competition policy and antitrust law: implicat...,"[competition, policy, antitrust, law, implicat...","[competition, policy, antitrust, law, implicat...",6
51,The Promise: Signaling Sustainability in Suppl...,"[the, promise, signaling, sustainability, supp...","[the, promise, signaling, sustainability, supp...",6
98,Reporting on supply chain sustainability: Meas...,"[reporting, supply, chain, sustainability, mea...","[reporting, supply_chain, sustainability, meas...",6
134,Supply Chain Linked Sustainability Assessment ...,"[supply, chain, linked, sustainability, assess...","[supply_chain, linked, sustainability, assessm...",6
143,Does social capital matter for supply chain re...,"[does, social, capital, matter, supply, chain,...","[does, social, capital, matter, supply_chain, ...",6
144,Supply chain sustainability risk and assessment,"[supply, chain, sustainability, risk, assessment]","[supply_chain, sustainability, risk_assessment]",6
152,Information sharing and the impact of shutdown...,"[information, sharing, impact, shutdown, polic...","[information_sharing, impact, shutdown, policy...",6
222,"Exploring the Social, Economic and Environment...","[exploring, social, economic, environmental, f...","[exploring, social, economic, environmental, f...",6


## Do LDA with LDA vis for a little more intuition

In [32]:
from sklearn.decomposition import LatentDirichletAllocation

In [57]:
lda = LatentDirichletAllocation(n_components=5)

In [58]:
lda.fit(vectorized)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=5, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [59]:
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [60]:
pyLDAvis.sklearn.prepare(lda, vectorized, vectorizer)