In [2]:
import os
import json
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [24]:
# Directory holding the JSON-encoded document lists read below.
# NOTE(review): absolute, machine-specific path — consider making it
# configurable (e.g. relative to the notebook) before sharing.
processedDircPath=r'C:\Users\admin\OneDrive\Documents\supervised project\processedFiles'

# Read each corpus inside a `with` block so the file handle is closed as soon
# as the JSON has been parsed (previously the handles were never closed).
with open(os.path.join(processedDircPath,'rawDataList.txt'), "r") as rawDataFile:
    rawDataList=json.load(rawDataFile)

with open(os.path.join(processedDircPath,'cleanedDataList.txt'), "r") as cleanedDataFile:
    cleanedDataList=json.load(cleanedDataFile)

# Corpus fed to the vectorizers; currently the raw (not the cleaned) documents.
dataList=rawDataList

In [8]:
# Tunable settings for the topic-modelling cells.
n_top_words = 15    # words printed per topic by print_top_words
n_components = 4    # number of topics fitted by both NMF and LDA
n_features = 50     # vocabulary cap passed as max_features to both vectorizers
n_samples = 2_000   # NOTE(review): not referenced by the visible cells — confirm it is still needed

In [9]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterating ``components_`` must yield one weight vector per topic,
        each supporting ``argsort()``.
    feature_names : sequence
        Term for each column index of the weight vectors.
    n_top_words : int
        How many terms to show per topic, strongest first.

    One line is printed per topic (``Topic #<idx>: w1 w2 ...``), followed by
    a single blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:n_top_words]
        top_terms = " ".join(feature_names[pos] for pos in strongest)
        print(f'Topic #{topic_idx}: ' + top_terms)
    print()

In [19]:
# Build the tf-idf document-term matrix that the NMF (Probabilistic Latent
# Semantic Indexing) model is fitted on.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',       # drop common English function words
    max_features=n_features,    # cap the vocabulary size
    max_df=0.95,                # ignore terms in more than 95% of documents
    min_df=2,                   # ignore terms in fewer than 2 documents
)
tfidfMatrix = tfidf_vectorizer.fit_transform(dataList)

In [21]:
# Build the raw term-count document-term matrix that the LDA model is
# fitted on.
tf_vectorizer = CountVectorizer(
    stop_words='english',       # drop common English function words
    max_features=n_features,    # cap the vocabulary size
    max_df=0.95,                # ignore terms in more than 95% of documents
    min_df=2,                   # ignore terms in fewer than 2 documents
)
tfMatrix = tf_vectorizer.fit_transform(dataList)

In [25]:
# Fit the NMF model (Probabilistic Latent Semantic Indexing) on the tf-idf
# matrix using the Kullback-Leibler loss and the multiplicative-update solver.
nmf_params = dict(n_components=n_components, random_state=1,
                  beta_loss='kullback-leibler', solver='mu', max_iter=1000,
                  l1_ratio=.5)
try:
    # Original call: a single `alpha` regularises both W and H.
    nmf = NMF(alpha=.1, **nmf_params)
except TypeError:
    # scikit-learn >= 1.2 removed `alpha` in favour of `alpha_W` / `alpha_H`,
    # whose penalty terms are additionally multiplied by n_features (for W)
    # and n_samples (for H). Dividing by those factors keeps the same
    # regularisation strength as the original alpha=.1.
    n_docs, n_terms = tfidfMatrix.shape
    nmf = NMF(alpha_W=.1 / n_terms, alpha_H=.1 / n_docs, **nmf_params)
nmf.fit(tfidfMatrix)

# `get_feature_names()` was removed in scikit-learn 1.2; prefer its
# replacement when available and fall back for older installations.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Topic #0: al et 2017 model technology 2016 study 2015 service 2014 big decision process 2011 use
Topic #1: big supply business chain analytics management 2012 decision operation capability 2014 use 2013 process customer
Topic #2: cid time 72 production manufacturing 87 76 81 68 operation process journal article 2016 al
Topic #3: manufacturing product chain production supply demand industry operation customer time consumer article management level approach



In [26]:
# Fit the LDA model on the raw term-count matrix.
# random_state is pinned (same seed the NMF cell uses) so that the discovered
# topics are reproducible across runs; without it every run yields different
# topics.
lda = LatentDirichletAllocation(n_components=n_components, random_state=1)
lda.fit(tfMatrix)

# `get_feature_names()` was removed in scikit-learn 1.2; prefer its
# replacement when available and fall back for older installations.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0: manufacturing product customer production big al chain model et consumer time demand supply operation service
Topic #1: analytics technology business adoption use big supply management chain study model factor capability process decision
Topic #2: cid 72 87 68 76 81 time process 2017 production journal 2014 operation model approach
Topic #3: al et big 2017 2016 2015 2012 2014 review study 2013 2011 decision value analysis

