In [1]:
# improt modules
import os
import re
import numpy as np
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

In [2]:
# set directory
dir = os.getcwd()
folder = 'data'
path = os.path.join(dir, folder)

# read each document with .txt extension in folder
files = None
try:
    files = os.listdir(path)
    files = [file for file in files if file.endswith('.txt')]
except:
    print('Folder not found')

# set pattern to filter text to lowercase
pattern = '[^a-zA-Z0-9]'

# create a corpus list of dictionary with key 'id' and 'text'
documents = []
for index, file in enumerate(files):
    with open(os.path.join(path, file), 'r') as f:
        # check if extension is .txt
        if file.endswith('.txt'):
            # read file
            text = f.read()
            # filter text to lowercase
            text = re.sub(pattern, ' ', text)
            text = text.lower()
            # append to corpus
            documents.append({'id': index, 'text': text})

In [3]:
# combine all text into a single string
combine = ''.join([document['text'] for document in documents])

# split get the unique words in corpus using set
words = set(combine.split())

# create a dictionary with key 'word' and value 'index'
corpus = dict([(word, index) for index, word in enumerate(words)])

In [4]:
# display codpus
corpus

{'dhien': 0,
 'pelopor': 1,
 'sebenarnya': 2,
 'maria': 3,
 'walanda': 4,
 'kebangkitan': 5,
 'pribumi': 6,
 'wanita': 7,
 'keadaan': 8,
 'lamnga': 9,
 'cut': 10,
 'di': 11,
 'ayu': 12,
 'masa': 13,
 'yang': 14,
 'sebagai': 15,
 'melawan': 16,
 'menyeret': 17,
 'aceh': 18,
 'karena': 19,
 'tokoh': 20,
 'tepat': 21,
 'perempuan': 22,
 'gle': 23,
 'kemudian': 24,
 'terhadap': 25,
 'nyak': 26,
 'mengembangkan': 27,
 'nasional': 28,
 'tewasnya': 29,
 'adalah': 30,
 'pada': 31,
 'atau': 32,
 'jawa': 33,
 'raden': 34,
 'dari': 35,
 'dalam': 36,
 'usahanya': 37,
 'permulaan': 38,
 'ke': 39,
 'dikenal': 40,
 'abad': 41,
 'ibrahim': 42,
 'berjuang': 43,
 '20': 44,
 'belanda': 45,
 'jauh': 46,
 'lebih': 47,
 'josephine': 48,
 'perang': 49,
 'kartini': 50,
 'nusantara': 51,
 'catherine': 52,
 'dan': 53,
 'indonesia': 54,
 'disebut': 55,
 'perlawanannya': 56,
 'seorang': 57,
 'adjeng': 58,
 'pahlawan': 59,
 'tarum': 60,
 'untuk': 61,
 'maramis': 62}

In [5]:
# create a vector with occurance of each word in each document
for document in documents:
    vector = np.zeros(len(corpus))
    for word in document['text'].split():
        vector[corpus[word]] += 1
    document['vector'] = vector

In [6]:
# create a dataframe of vector for each document, with index as document id, and column as word in corpus
data = [document['vector'] for document in documents]
index = [document['id'] for document in documents]
columns = [word for word in corpus]

df = pd.DataFrame(data, index=index, columns=columns)
df

Unnamed: 0,dhien,pelopor,sebenarnya,maria,walanda,kebangkitan,pribumi,wanita,keadaan,lamnga,...,dan,indonesia,disebut,perlawanannya,seorang,adjeng,pahlawan,tarum,untuk,maramis
0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
1,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
2,0.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,2.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,2.0


In [7]:
# export dataframe to csv file
filename = 'vector.csv'
filepath = os.path.join(dir, filename)
df.to_csv(filepath)