# Import Library

In [5]:
import pandas as pd

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

#import tensorflow as tf

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hrd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Import Dataset

In [6]:
data_ekspor     = pd.read_csv('EKSPOR.csv', sep= '|') 
data_impor      = pd.read_csv('IMPOR.csv', sep = '|')
data_wajib_pajak= pd.read_csv('WAJIB_PAJAK.csv', sep = '|')
data_faktur     = pd.read_csv('FAKTUR.csv', sep = '|')
data_obyek      = pd.read_csv('OBYEK.csv', sep = '|')

# Topic Modelling for data ekspor

# Asumsi : Semua Barang yang di Export pada data ini adalah kayu dan turunannya

In [7]:
data_ekspor.head()

Unnamed: 0,TGL_PEB,NPWP_EKSPORTIR,RINCIAN_BARANG,NILAI_FOBR
0,2019-02-22,576764283960555,BAGIAN RACK DARI KAYU ALBASIA DAN MDF,766375.0
1,2019-12-21,860034902531555,"LG,MD,SM RD BANANA BSKT NATXL,L,M,S BANANA BAR...",8951283.0
2,2019-03-26,566816002068555,INDONESIAN HARDWOOD PLYWOOD8.5 MM X 910MM X 18...,35100184.2
3,2019-03-30,564355431087555,FURNITURE ROTAN FULL,60453255.0
4,2019-11-01,569771154678555,BENCH,69967070.0


In [8]:
data_ekspor = data_ekspor[['RINCIAN_BARANG']]
data_ekspor['index'] = data_ekspor.index
documents = data_ekspor

documents.head()

Unnamed: 0,RINCIAN_BARANG,index
0,BAGIAN RACK DARI KAYU ALBASIA DAN MDF,0
1,"LG,MD,SM RD BANANA BSKT NATXL,L,M,S BANANA BAR...",1
2,INDONESIAN HARDWOOD PLYWOOD8.5 MM X 910MM X 18...,2
3,FURNITURE ROTAN FULL,3
4,BENCH,4


# Word Processing

## 1.Tokenized
## 2.Lemmatized
## 3.Words < 3 char removed
## 4.Stemming

In [9]:
def lemmatize_stemming(text):
    stemmer = SnowballStemmer('english')
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

## Test jika word processing berhasil

In [10]:
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []

for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))


original document: 
['ALBASIA', 'BARECOREGRADE', 'A2', 'AND', 'GRADE', 'B13MMX', '1220MM', 'X', '2440MM']


 tokenized and lemmatized document: 
['albasia', 'barecoregrad', 'grade']


# Preprocess semua rincian barang stemming,lemmatized, tokenized

In [11]:
processed_docs = documents['RINCIAN_BARANG'].map(preprocess)
processed_docs[:10] #Take one sample

0                  [bagian, rack, dari, kayu, albasia]
1    [banana, bskt, natxl, banana, bark, round, ver...
2                      [indonesian, hardwood, plywood]
3                                    [furnitur, rotan]
4                                              [bench]
5                                           [distress]
6    [chusion, dynasti, dayb, ottoman, hdfsunproof,...
7    [live, din, furnitur, mahogani, componen, head...
8                                       [wood, pellet]
9                   [falcata, plywood, grade, gluetyp]
Name: RINCIAN_BARANG, dtype: object

# Buat dictionary berapa banyak kata muncul dalam training set

In [12]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break
        
dictionary

0 albasia
1 bagian
2 dari
3 kayu
4 rack
5 banana
6 bark
7 bskt
8 natxl
9 round
10 verticallink


<gensim.corpora.dictionary.Dictionary at 0x244c1d5d4c8>

## less than 15 documents (absolute number) or
## more than 0.5 documents (fraction of total corpus size, not absolute number).
## after the above two steps, keep only the first 100000 most frequent tokens.

In [13]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

## For each document we create a dictionary reporting how many
## words and how many times those words appear. Save this to ‘bow_corpus’, 
## then check our selected document earlier.

In [14]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(0, 1), (33, 1), (872, 1)]

In [15]:
#Test one sample
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 0 ("albasia") appears 1 time.
Word 33 ("grade") appears 1 time.
Word 872 ("barecoregrad") appears 1 time.


In [17]:
#Import LDA Model
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break
    
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=2, workers=4)

[(0, 0.40452709231288003),
 (1, 0.5833766055377695),
 (2, 0.4151036130362214),
 (3, 0.3783021406822678),
 (4, 0.42497770352761444)]


## Print Semua Topik hasil LDA Topic Modelling 
## Ditampilkan 10 keywords dengan skor tertinggi untuk tiap Topik yang dibuat

In [18]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.190*"door" + 0.077*"engin" + 0.050*"paint" + 0.046*"brush" + 0.030*"pinus" + 0.025*"white" + 0.021*"queen" + 0.018*"meranti" + 0.014*"grid" + 0.013*"accessori"
Topic: 1 
Words: 0.060*"wooden" + 0.055*"chair" + 0.031*"frame" + 0.027*"furnitur" + 0.027*"storag" + 0.025*"wood" + 0.023*"king" + 0.021*"merbau" + 0.021*"rattan" + 0.021*"medium"
Topic: 2 
Words: 0.207*"plywood" + 0.170*"indonesian" + 0.056*"piec" + 0.055*"mmtotal" + 0.030*"item" + 0.024*"hardwood" + 0.021*"meranti" + 0.019*"veneer" + 0.019*"decor" + 0.016*"plywooditem"
Topic: 3 
Words: 0.113*"board" + 0.087*"albasia" + 0.062*"rail" + 0.056*"kayu" + 0.054*"dari" + 0.042*"core" + 0.042*"bare" + 0.039*"particl" + 0.036*"rack" + 0.033*"dresser"
Topic: 4 
Words: 0.041*"tabl" + 0.038*"grade" + 0.032*"lamin" + 0.027*"joint" + 0.024*"teak" + 0.024*"finger" + 0.021*"kursi" + 0.020*"wood" + 0.020*"rotan" + 0.016*"blockboard"


In [17]:
#lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
#for idx, topic in lda_model_tfidf.print_topics(-1):
#    print('Topic: {} Word: {}'.format(idx, topic))

In [19]:
#Performance evaluation by classifying sample document using LDA Bag of Words model
processed_docs[100]

['sunbella', 'seat', 'cushion', 'cover']

In [21]:
#Take One sample to be tested
for index, score in sorted(lda_model[bow_corpus[1000]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.5999248623847961	 
Topic: 0.207*"plywood" + 0.170*"indonesian" + 0.056*"piec" + 0.055*"mmtotal" + 0.030*"item" + 0.024*"hardwood" + 0.021*"meranti" + 0.019*"veneer" + 0.019*"decor" + 0.016*"plywooditem"

Score: 0.10006904602050781	 
Topic: 0.060*"wooden" + 0.055*"chair" + 0.031*"frame" + 0.027*"furnitur" + 0.027*"storag" + 0.025*"wood" + 0.023*"king" + 0.021*"merbau" + 0.021*"rattan" + 0.021*"medium"

Score: 0.10000266134738922	 
Topic: 0.113*"board" + 0.087*"albasia" + 0.062*"rail" + 0.056*"kayu" + 0.054*"dari" + 0.042*"core" + 0.042*"bare" + 0.039*"particl" + 0.036*"rack" + 0.033*"dresser"

Score: 0.1000022441148758	 
Topic: 0.041*"tabl" + 0.038*"grade" + 0.032*"lamin" + 0.027*"joint" + 0.024*"teak" + 0.024*"finger" + 0.021*"kursi" + 0.020*"wood" + 0.020*"rotan" + 0.016*"blockboard"

Score: 0.10000120103359222	 
Topic: 0.190*"door" + 0.077*"engin" + 0.050*"paint" + 0.046*"brush" + 0.030*"pinus" + 0.025*"white" + 0.021*"queen" + 0.018*"meranti" + 0.014*"grid" + 0.013*"access

# Tes model untuk Unseen Text / Document

In [23]:
unseen_document = 'BAGIAN RACK DARI KAYU ALBASIA DAN MDF'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.8665812015533447	 Topic: 0.113*"board" + 0.087*"albasia" + 0.062*"rail" + 0.056*"kayu" + 0.054*"dari"
Score: 0.03337952867150307	 Topic: 0.207*"plywood" + 0.170*"indonesian" + 0.056*"piec" + 0.055*"mmtotal" + 0.030*"item"
Score: 0.03336125984787941	 Topic: 0.190*"door" + 0.077*"engin" + 0.050*"paint" + 0.046*"brush" + 0.030*"pinus"
Score: 0.03334445506334305	 Topic: 0.041*"tabl" + 0.038*"grade" + 0.032*"lamin" + 0.027*"joint" + 0.024*"teak"
Score: 0.033333539962768555	 Topic: 0.060*"wooden" + 0.055*"chair" + 0.031*"frame" + 0.027*"furnitur" + 0.027*"storag"


# Visualize LDA Model using PyLDAvis Library

In [27]:
import gensim
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
data = pyLDAvis.gensim_models.prepare(lda_model, corpus_tfidf, dictionary)
print(data)
pyLDAvis.save_html(data, 'lda-gensim_ver3.html')


  and should_run_async(code)


PreparedData(topic_coordinates=              x         y  topics  cluster       Freq
topic                                                
4     -0.077407  0.194040       1        1  22.574169
2     -0.055212 -0.249859       2        1  21.770430
1     -0.099047  0.269914       3        1  20.384041
0     -0.176231 -0.197368       4        1  19.824736
3      0.407897 -0.016728       5        1  15.446625, topic_info=            Term          Freq         Total Category  logprob  loglift
14       plywood  16052.000000  16052.000000  Default  30.0000  30.0000
47          door  13129.000000  13129.000000  Default  29.0000  29.0000
13    indonesian  13074.000000  13074.000000  Default  28.0000  28.0000
36         board   6296.000000   6296.000000  Default  27.0000  27.0000
48         engin   5334.000000   5334.000000  Default  26.0000  26.0000
...          ...           ...           ...      ...      ...      ...
146        panel   1333.684947   1921.580366   Topic5  -3.6973   1.5026
233

  by='saliency', ascending=False).head(R).drop('saliency', 1)


# Save top words to csv file

In [28]:
import pandas as pd
top_words_per_topic = []
for t in range(lda_model.num_topics):
    top_words_per_topic.extend([(t, ) + x for x in lda_model.show_topic(t, topn = 10)])
df = pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P'])
pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word','P']).to_csv("top_words2.csv")
df

  and should_run_async(code)


Unnamed: 0,Topic,Word,P
0,0,door,0.190099
1,0,engin,0.077227
2,0,paint,0.049931
3,0,brush,0.045978
4,0,pinus,0.02992
5,0,white,0.025133
6,0,queen,0.021341
7,0,meranti,0.018384
8,0,grid,0.013704
9,0,accessori,0.013393


# Model LDA hasil training menggunakan document/text pada data export akan ditest dengan document/text pada rincian barang tiap NPWP pada tiap masa pajak

# Load Data Barang tiap NPWP tiap masa pajak

In [22]:
data_barang = pd.read_csv('Data Barang tiap masa pajak tiap NPWP ready for LDA.csv')
data_barang

Unnamed: 0.1,Unnamed: 0,NPWP,NAMA_BARANG_1,NAMA_BARANG_2,NAMA_BARANG_3,NAMA_BARANG_4,NAMA_BARANG_5,NAMA_BARANG_6,NAMA_BARANG_7,NAMA_BARANG_8,NAMA_BARANG_9,NAMA_BARANG_10,NAMA_BARANG_11,NAMA_BARANG_12
0,0,505492060555,"LOG BULAT', 'Kayu Bulat Kel. Rimba Campur Grad...","Kayu Bulat Kel. Rimba Campur Grade A', 'Kayu B...",Bengkirai/Super/STMkering/1.9990 1.9 cm x 9 cm...,"Kayu Bulat Kel. Meranti (Grade C)', 'Kayu bula...","Kayu bulat Kel. Rimba Campur (Grade A)', 'Kayu...","Kayu Bulat Kel. Meranti Grade C', 'Kayu Bulat ...","Kayu Bulat Kel. Meranti Grade C', 'PENJUALAN S...","Kayu Bulat Kel. Rimba Campur Grade A', 'Kayu B...","Kayu Bulat Kel. Meranti (Grade C)', 'Kayu bula...","KAYU BULAT MERANTI', 'Bengkirai/Super/STMkerin...","KAYU BULAT RIMBA CAMPURAN', 'KAYU BULAT MERANT...","KAYU BULAT KEL.MERANTI 28,92 M3', 'KAYU GERGAJ..."
1,1,513262052555,"Veneer Jati 0,6 All Grade Uk : 0,6 x 9~25 x 14...",0,0,0,0,0,0,0,0,0,0,0
2,2,2100122108555,"ARTIS BRUSH 6', 'ARTIS BRUSH ...",0,"KUAS ARTIST BRUSH 12 IN', 'KUAS ARTIST BRUSH 0...","ETERNA 633 VERNISH 4 INCH', 'ETERNA 633 VERNIS...","ETERNA 633 VERNISH 1,5 INCH', 'ETERNA 633 VERN...","KUAS E.633 2,5""', 'KUAS E.633 3/4""',...",0,"ETERNA 633 VERNISH 2 INCH', 'KUAS ARTIST BRUSH...",0,0,"KUAS ARTIST BRUSH 03 IN', 'ETERNA 633 VERNISH ...","ETERNA 633 VERNISH 3 INCH', 'ETERNA 633 VERNIS..."
3,3,2130386761555,0,0,0,0,0,0,"FULL SET KS FL SMILE KIDS 120,BOLSTER FLORENCE...","FULL SET KS SR M16 PERFECTION (ET) 200,PILLOW ...",0,0,0,0
4,4,2825102452555,0,0,0,0,0,0,0,"Portable Massage bed', 'Portable Massage Bed",0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26385,26385,991830669966556,"Busa Kulit Tipis', 'Busa BS/Super 200x100x30 C...","Busa Kulit Tipis', 'Busa Kulit Tipis', 'Busa S...","Busa Kulit Tipis', 'Busa Kulit Tipis', 'Busa K...","Busa Kulit Tipis', 'Busa Kulit Tipis', 'Busa K...","Busa Kulit Tipis', 'Busa Kulit Tipis', 'Busa K...","Busa Kulit Tipis', 'Busa Ancuran', 'Busa Kulit...","Busa Kulit Tipis', 'Busa Kulit Tipis', 'Busa K...","Busa Kulit Tipis', 'Busa Kulit Tipis', 'Busa K...",Busa Kulit Tipis,"Busa Kulit Tipis', 'Busa Kulit Tipis', 'Busa K...","Busa Kulit Tipis', 'Busa Kulit Tipis', 'Busa K...",Busa Kulit Tipis
26386,26386,992307921068555,LOG ALBASIA 16-30 UP (M3),LOG ALBASIA 16-30 UP (M3),LOG ALBASIA 16-30 UP (M3),LOG ALBASIA 16-30 UP (M3),LOG ALBASIA 16-30 UP (M3),LOG ALBASIA 16-30 UP (M3),LOG ALBASIA 16-30 UP (M3),LOG ALBASIA 16-30 UP (M3),LOG ALBASIA 16-30 UP (M3),LOG ALBASIA 16-30 UP (M3),LOG ALBASIA 16-30 UP (M3),LOG ALBASIA 16-30 UP (M3)
26387,26387,994306253068555,0,0,"Usuk Kayu Keras Uk. 4 cm x 4 cm x 3 m', 'Palet...",Palet Tutup Kaki Tiga Besar Uk. 120 x 110 x 14...,"Kayu Persegi Uk. 5 x 5x 3 m', 'Pallet Kaki Tig...",0,Pallet Kaki Tiga Besar Tanpa Tutup Uk. 120 x 1...,Palet Tutup Kaki Tiga Besar Uk. 120 x 110 x 14...,0,Palet Tutup Kaki Tiga Besar Uk. 120 x 110 x 14...,Palet Tutup Kaki Tiga Besar Uk. 120 x 110 x 14...,Palet Tutup Kaki Tiga Besar Uk. 120 x 110 x 14...
26388,26388,998947805062555,0,0,0,0,BIAYA BELANJA BARANG (PERLENGKAPAN) DALAM RANG...,0,0,0,0,0,0,Biaya belanja barang (perlengkapan) dalam rang...


In [59]:
data_barang.loc[data_barang['NPWP']==2130386761555]

Unnamed: 0.1,Unnamed: 0,NPWP,NAMA_BARANG_1,NAMA_BARANG_2,NAMA_BARANG_3,NAMA_BARANG_4,NAMA_BARANG_5,NAMA_BARANG_6,NAMA_BARANG_7,NAMA_BARANG_8,NAMA_BARANG_9,NAMA_BARANG_10,NAMA_BARANG_11,NAMA_BARANG_12
3,3,2130386761555,0,0,0,0,0,0,"FULL SET KS FL SMILE KIDS 120,BOLSTER FLORENCE...","FULL SET KS SR M16 PERFECTION (ET) 200,PILLOW ...",0,0,0,0


In [49]:
nilai_topik = np.empty((data_barang.shape[0], 12))
nilai_topik[0,0]=1

nilai_topik

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Ambil nilai score topik 1-5 untuk tiap rincian barang data ekspor

In [53]:
from tqdm import tqdm 

NPWP=[]
lenx=[]
nilai_topik_1 = []
nilai_topik_2 = []
nilai_topik_3 = []
nilai_topik_4 = []
nilai_topik_5 = []
nilai_topik_6 = []
nilai_topik_7 = []
nilai_topik_8 = []
nilai_topik_9 = []
nilai_topik_10 = []
nilai_topik_11 = []
nilai_topik_12 = []

for i in tqdm(range(data_barang.shape[0])):
    
    NPWP.append(data_barang.NPWP.iloc[i])
    
    if data_barang['NAMA_BARANG_1'].iloc[i] == '0':
        nilai_topik_1.append(0)
    else:
        bow_vector1 = dictionary.doc2bow(preprocess(data_barang['NAMA_BARANG_1'].iloc[i]))
        hasil_lda1 = lda_model[bow_vector1]
        nilai_topik_1.append(hasil_lda1)
    
    if data_barang['NAMA_BARANG_2'].iloc[i] == '0':
        nilai_topik_2.append(0)
    else:
        bow_vector2 = dictionary.doc2bow(preprocess(data_barang['NAMA_BARANG_2'].iloc[i]))
        hasil_lda2 = lda_model[bow_vector2]
        nilai_topik_2.append(hasil_lda2)
        
    if data_barang['NAMA_BARANG_3'].iloc[i] == '0':
        nilai_topik_3.append(0)
    else:
        bow_vector3 = dictionary.doc2bow(preprocess(data_barang['NAMA_BARANG_3'].iloc[i]))
        hasil_lda3 = lda_model[bow_vector3]
        nilai_topik_3.append(hasil_lda3)
    
    if data_barang['NAMA_BARANG_4'].iloc[i] == '0':
        nilai_topik_4.append(0)
    else:
        bow_vector4 = dictionary.doc2bow(preprocess(data_barang['NAMA_BARANG_4'].iloc[i]))
        hasil_lda4 = lda_model[bow_vector4]
        nilai_topik_4.append(hasil_lda4)
    
    if data_barang['NAMA_BARANG_5'].iloc[i] == '0':
        nilai_topik_5.append(0)
    else:
        bow_vector5 = dictionary.doc2bow(preprocess(data_barang['NAMA_BARANG_5'].iloc[i]))
        hasil_lda5 = lda_model[bow_vector5]
        nilai_topik_5.append(hasil_lda5)
    
    if data_barang['NAMA_BARANG_6'].iloc[i] == '0':
        nilai_topik_6.append(0)
    else:
        bow_vector6 = dictionary.doc2bow(preprocess(data_barang['NAMA_BARANG_6'].iloc[i]))
        hasil_lda6 = lda_model[bow_vector6]
        nilai_topik_6.append(hasil_lda6)
    
    if data_barang['NAMA_BARANG_7'].iloc[i] == '0':
        nilai_topik_7.append(0)
    else:
        bow_vector7 = dictionary.doc2bow(preprocess(data_barang['NAMA_BARANG_7'].iloc[i]))
        hasil_lda7 = lda_model[bow_vector7]
        nilai_topik_7.append(hasil_lda7)
    
    if data_barang['NAMA_BARANG_8'].iloc[i] == '0':
        nilai_topik_8.append(0)
    else:
        bow_vector8 = dictionary.doc2bow(preprocess(data_barang['NAMA_BARANG_8'].iloc[i]))
        hasil_lda8 = lda_model[bow_vector8]
        nilai_topik_8.append(hasil_lda8)
    
    if data_barang['NAMA_BARANG_9'].iloc[i] == '0':
        nilai_topik_9.append(0)
    else:
        bow_vector9 = dictionary.doc2bow(preprocess(data_barang['NAMA_BARANG_9'].iloc[i]))
        hasil_lda9 = lda_model[bow_vector9]
        nilai_topik_9.append(hasil_lda9)
    
    if data_barang['NAMA_BARANG_10'].iloc[i] == '0':
        nilai_topik_10.append(0)
    else:
        bow_vector10 = dictionary.doc2bow(preprocess(data_barang['NAMA_BARANG_10'].iloc[i]))
        hasil_lda10 = lda_model[bow_vector10]
        nilai_topik_10.append(hasil_lda10)
    
    if data_barang['NAMA_BARANG_11'].iloc[i] == '0':
        nilai_topik_11.append(0)
    else:
        bow_vector11 = dictionary.doc2bow(preprocess(data_barang['NAMA_BARANG_11'].iloc[i]))
        hasil_lda11 = lda_model[bow_vector11]
        nilai_topik_11.append(hasil_lda11)
    
    if data_barang['NAMA_BARANG_12'].iloc[i] == '0':
        nilai_topik_12.append(0)
    else:
        bow_vector12 = dictionary.doc2bow(preprocess(data_barang['NAMA_BARANG_12'].iloc[i]))
        hasil_lda12 = lda_model[bow_vector12]
        nilai_topik_12.append(hasil_lda12)
        
pd_cust_topic_model = pd.DataFrame()
pd_cust_topic_model['NPWP'] = NPWP

pd_cust_topic_model['score_1'] = nilai_topik_1
pd_cust_topic_model['score_2'] = nilai_topik_2
pd_cust_topic_model['score_3'] = nilai_topik_3
pd_cust_topic_model['score_4'] = nilai_topik_4
pd_cust_topic_model['score_5'] = nilai_topik_5
pd_cust_topic_model['score_6'] = nilai_topik_6
pd_cust_topic_model['score_7'] = nilai_topik_7
pd_cust_topic_model['score_8'] = nilai_topik_8
pd_cust_topic_model['score_9'] = nilai_topik_9
pd_cust_topic_model['score_10'] = nilai_topik_10
pd_cust_topic_model['score_11'] = nilai_topik_11
pd_cust_topic_model['score_12'] = nilai_topik_12
    
pd_cust_topic_model    
    
    
    
    
    
'''    
    data_text.append(unseen_document.NAMA_BARANG_1.iloc[i])
    lenx.append(len(hasil_lda))
    nilai_topik.append(hasil_lda)
    
    
    data_text.append(unseen_document.RINCIAN_BARANG.iloc[0])
    
    topik1.append(hasil_lda[0])
    topik2.append(hasil_lda[1])
    topik3.append(hasil_lda[2])
    topik4.append(hasil_lda[3])
    topik5.append(hasil_lda[4])
    
    
pd_topik_eks = pd.DataFrame()

pd_topik_eks['RINCIAN_BARANG'] = data_text
pd_topik_eks['len'] = lenx
pd_topik_eks['score topik 1-5'] = nilai_topik


pd_topik_eks['topik1_val']  = topik1
pd_topik_eks['topik2_val']  = topik2
pd_topik_eks['topik3_val']  = topik3
pd_topik_eks['topik4_val']  = topik4
pd_topik_eks['topik5_val']  = topik5


pd_topik_eks
'''

100%|████████████████████████████████████████████████████████████████████████████| 26390/26390 [05:34<00:00, 78.85it/s]


"    \n    data_text.append(unseen_document.NAMA_BARANG_1.iloc[i])\n    lenx.append(len(hasil_lda))\n    nilai_topik.append(hasil_lda)\n    \n    \n    data_text.append(unseen_document.RINCIAN_BARANG.iloc[0])\n    \n    topik1.append(hasil_lda[0])\n    topik2.append(hasil_lda[1])\n    topik3.append(hasil_lda[2])\n    topik4.append(hasil_lda[3])\n    topik5.append(hasil_lda[4])\n    \n    \npd_topik_eks = pd.DataFrame()\n\npd_topik_eks['RINCIAN_BARANG'] = data_text\npd_topik_eks['len'] = lenx\npd_topik_eks['score topik 1-5'] = nilai_topik\n\n\npd_topik_eks['topik1_val']  = topik1\npd_topik_eks['topik2_val']  = topik2\npd_topik_eks['topik3_val']  = topik3\npd_topik_eks['topik4_val']  = topik4\npd_topik_eks['topik5_val']  = topik5\n\n\npd_topik_eks\n"

In [54]:
pd_cust_topic_model

Unnamed: 0,NPWP,score_1,score_2,score_3,score_4,score_5,score_6,score_7,score_8,score_9,score_10,score_11,score_12
0,505492060555,"[(2, 0.23800582), (3, 0.2030941), (4, 0.547371...","[(2, 0.1978361), (3, 0.24275233), (4, 0.5478422)]","[(0, 0.19549634), (1, 0.015388406), (2, 0.0157...","[(0, 0.56485844), (3, 0.089505784), (4, 0.3273...","[(0, 0.3331184), (3, 0.22964992), (4, 0.418798...","[(2, 0.18520235), (3, 0.2464701), (4, 0.560456)]","[(0, 0.23167455), (1, 0.010113497), (2, 0.0103...","[(0, 0.017179599), (1, 0.016669711), (2, 0.123...","[(0, 0.012814342), (1, 0.012502375), (2, 0.268...","[(0, 0.7937479), (1, 0.050003543), (2, 0.05146...","[(0, 0.31879142), (1, 0.01349862), (2, 0.01363...","[(0, 0.53792334), (1, 0.029123433), (2, 0.0293..."
1,513262052555,"[(0, 0.05053121), (1, 0.050025042), (2, 0.3072...",0,0,0,0,0,0,0,0,0,0,0
2,2100122108555,"[(0, 0.9103849), (1, 0.022224957), (2, 0.02222...",0,"[(0, 0.9264416), (1, 0.018184796), (2, 0.01818...","[(0, 0.73311096), (1, 0.066722155), (2, 0.0667...","[(0, 0.60632), (1, 0.028615221), (2, 0.1710708...","[(0, 0.05058447), (1, 0.05000592), (2, 0.05000...",0,"[(0, 0.86605716), (1, 0.03335216), (2, 0.03335...",0,0,"[(0, 0.8654074), (1, 0.033343222), (2, 0.03334...","[(0, 0.8848638), (1, 0.02858248), (2, 0.028582..."
3,2130386761555,0,0,0,0,0,0,"[(0, 0.0334574), (1, 0.033455368), (2, 0.53547...","[(0, 0.033362344), (1, 0.033361588), (2, 0.033...",0,0,0,0
4,2825102452555,0,0,0,0,0,0,0,"[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0...",0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26385,991830669966556,"[(0, 0.34656003), (1, 0.013338484), (2, 0.0133...","[(0, 0.17116061), (1, 0.028583374), (2, 0.0285...","[(0, 0.03334054), (1, 0.033340555), (2, 0.0333...","[(0, 0.050010767), (1, 0.050010793), (2, 0.050...","[(0, 0.040008627), (1, 0.04000865), (2, 0.0400...","[(0, 0.025005428), (1, 0.025005441), (2, 0.025...","[(0, 0.050010763), (1, 0.05001079), (2, 0.0500...","[(0, 0.040008627), (1, 0.04000865), (2, 0.0400...","[(0, 0.10002269), (1, 0.10002275), (2, 0.10002...","[(0, 0.03334054), (1, 0.033340555), (2, 0.0333...","[(0, 0.03334054), (1, 0.033340555), (2, 0.0333...","[(0, 0.100022696), (1, 0.100022756), (2, 0.100..."
26386,992307921068555,"[(0, 0.100001), (1, 0.10000023), (2, 0.1000003...","[(0, 0.100001), (1, 0.10000023), (2, 0.1000003...","[(0, 0.10000099), (1, 0.100000225), (2, 0.1000...","[(0, 0.100001), (1, 0.10000023), (2, 0.1000003...","[(0, 0.10000099), (1, 0.100000225), (2, 0.1000...","[(0, 0.10000101), (1, 0.10000024), (2, 0.10000...","[(0, 0.100001), (1, 0.10000023), (2, 0.1000003...","[(0, 0.100001), (1, 0.10000023), (2, 0.1000003...","[(0, 0.10000099), (1, 0.100000225), (2, 0.1000...","[(0, 0.100001), (1, 0.10000023), (2, 0.1000003...","[(0, 0.100001), (1, 0.10000023), (2, 0.1000003...","[(0, 0.10000101), (1, 0.10000024), (2, 0.10000..."
26387,994306253068555,0,0,"[(0, 0.068301365), (1, 0.06647), (2, 0.8428334...","[(0, 0.018189808), (1, 0.018189829), (2, 0.927...","[(0, 0.23039722), (1, 0.21841776), (2, 0.53896...",0,"[(0, 0.12336411), (1, 0.12356312), (2, 0.74126...","[(0, 0.020010427), (1, 0.119699344), (2, 0.820...",0,"[(0, 0.010005147), (1, 0.109804004), (2, 0.860...","[(0, 0.02223196), (1, 0.022231985), (2, 0.9110...","[(0, 0.018191306), (1, 0.1087994), (2, 0.83648..."
26388,998947805062555,0,0,0,0,"[(0, 0.0153882215), (1, 0.015388674), (2, 0.32...",0,0,0,0,0,0,"[(1, 0.17508514), (2, 0.23592676), (3, 0.11196..."


# Save Data Hasil Scoring

In [57]:
pd_cust_topic_model.to_csv('Data Hasil Scoring Nama Barang semua NPWP 1-5 per masa pajak.csv')