In [196]:
# improt modules
import os
import re
import numpy as np
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

In [197]:
# set directory
dir = os.getcwd()
folder = 'data'
path = os.path.join(dir, folder)

# read each document with .txt extension in folder
files = None
try:
    files = os.listdir(path)
    files = [file for file in files if file.endswith('.txt')]
except:
    print('Folder not found')

In [198]:
# set pattern to filter text to lowercase
pattern = '[^a-zA-Z0-9]'

# create a corpus list of dictionary with key 'id' and 'text'
documents = []

# loop through each file
for index, file in enumerate(files):

    # read file
    with open(os.path.join(path, file), 'r') as f:
        
        # check if extension is .txt
        if file.endswith('.txt'):

            # read file
            text = f.read()
            
            # filter text to lowercase
            text = re.sub(pattern, ' ', text)
            text = text.lower()
            
            # append to corpus
            documents.append({'id': index, 'text': text})

In [199]:
def generate_corpus(documents: list) -> dict:
    '''
    Function to create a corpus from a list of documents
    
    Parameters
    ---
    documents: list
        list of documents
    
    Returns
    ---
    corpus: dict
        dictionary of corpus
    '''
    # create a set for the corpus
    corpus = set()

    # loop through each document, keep adding the words to the corpus
    for document in documents:
        corpus.update(document['text'].split())

    # create a dictionary with key 'word' and value 'index'
    corpus = dict([(word, index) for index, word in enumerate(corpus)])
    
    return corpus

In [200]:
# create and display corpus
corpus = generate_corpus(documents)
corpus

{'indonesia': 0,
 'kebangkitan': 1,
 'ayu': 2,
 'perang': 3,
 'raden': 4,
 'josephine': 5,
 'walanda': 6,
 'berjuang': 7,
 'dikenal': 8,
 'pelopor': 9,
 'maria': 10,
 'perempuan': 11,
 'kartini': 12,
 'nusantara': 13,
 'disebut': 14,
 'jawa': 15,
 'terhadap': 16,
 'dhien': 17,
 'perlawanannya': 18,
 'tarum': 19,
 'maramis': 20,
 'pribumi': 21,
 'cut': 22,
 'gle': 23,
 'ibrahim': 24,
 'lebih': 25,
 'usahanya': 26,
 'wanita': 27,
 'melawan': 28,
 'belanda': 29,
 'tokoh': 30,
 'yang': 31,
 'tewasnya': 32,
 'seorang': 33,
 'dan': 34,
 'dari': 35,
 'permulaan': 36,
 'kemudian': 37,
 'ke': 38,
 'adalah': 39,
 'aceh': 40,
 'menyeret': 41,
 'atau': 42,
 'catherine': 43,
 '20': 44,
 'pada': 45,
 'mengembangkan': 46,
 'adjeng': 47,
 'jauh': 48,
 'lamnga': 49,
 'abad': 50,
 'nasional': 51,
 'di': 52,
 'pahlawan': 53,
 'karena': 54,
 'nyak': 55,
 'keadaan': 56,
 'masa': 57,
 'untuk': 58,
 'sebenarnya': 59,
 'dalam': 60,
 'sebagai': 61,
 'tepat': 62}

In [201]:
def generate_vector(documents: list, corpus: dict) -> pd.DataFrame:
    '''
    Function to create vector for each document in corpus
    
    Parameters
    ---
    documents: dict
        Dictionary of documents with key 'id' and 'text'

    corpus: dict
        Dictionary of corpus with key 'word' and value 'index'
    '''

    # loop for each document in documents
    for document in documents:

        # generate list of the occurance for each word in corpus
        vector = [document['text'].split().count(word) for word in corpus.keys()]

        # set data to dictionary
        document['vector'] = np.array(vector).astype(float)
    
    # get data, index and columns for dataframe
    data = [document['vector'] for document in documents]
    index = [document['id'] for document in documents]
    columns = [word for word in corpus]

    # create dataframe
    df = pd.DataFrame(data=data, index=index, columns=columns)
    
    # return dataframe
    return df

In [202]:
# create a dataframe of vector for each document, with index as document id, and column as word in corpus
df = generate_vector(documents, corpus)
df

Unnamed: 0,indonesia,kebangkitan,ayu,perang,raden,josephine,walanda,berjuang,dikenal,pelopor,...,pahlawan,karena,nyak,keadaan,masa,untuk,sebenarnya,dalam,sebagai,tepat
0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [203]:
# describe dataframe
df.describe()

Unnamed: 0,indonesia,kebangkitan,ayu,perang,raden,josephine,walanda,berjuang,dikenal,pelopor,...,pahlawan,karena,nyak,keadaan,masa,untuk,sebenarnya,dalam,sebagai,tepat
count,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
mean,1.333333,0.333333,0.333333,0.333333,0.666667,0.333333,0.333333,0.333333,0.666667,0.333333,...,1.0,0.333333,0.666667,0.333333,0.333333,0.333333,0.333333,0.333333,0.666667,0.333333
std,0.57735,0.57735,0.57735,0.57735,1.154701,0.57735,0.57735,0.57735,0.57735,0.57735,...,0.0,0.57735,1.154701,0.57735,0.57735,0.57735,0.57735,0.57735,0.57735,0.57735
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,1.5,0.5,0.5,0.5,1.0,0.5,0.5,0.5,1.0,0.5,...,1.0,0.5,1.0,0.5,0.5,0.5,0.5,0.5,1.0,0.5
max,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [204]:
def generate_tfidf(matrix: pd.DataFrame, corpus: dict) -> pd.DataFrame:
    '''
    Function to calculate tf-idf for each word in each document

    tfidf = tf . log N / df

    - tf = term frequency
    - df = document frequency
    - N = number of documents

    Parameters
    ---
    matrix: pd.DataFrame
        matrix of vector for each document, with index as document id, and column as occurance of word in corpus

    corpus: dict
        Dictionary of corpus with key 'word' and value 'index'

    Returns
    ---
    tfidf: pd.DataFrame

    '''
    # get number of documents
    N = len(matrix)

    # get document frequency for each word in corpus
    df = matrix.astype(bool).sum(axis=0)

    # calculate tf-idf for each word in each document
    tfidf = matrix.apply(lambda x: x * np.log(N / df), axis=1)

    # return tf-idf dataframe
    return tfidf


In [205]:
tfidf = generate_tfidf(df, corpus)
tfidf

Unnamed: 0,indonesia,kebangkitan,ayu,perang,raden,josephine,walanda,berjuang,dikenal,pelopor,...,pahlawan,karena,nyak,keadaan,masa,untuk,sebenarnya,dalam,sebagai,tepat
0,0.0,1.098612,1.098612,0.0,2.197225,0.0,0.0,0.0,0.405465,1.098612,...,0.0,0.0,0.0,0.0,0.0,0.0,1.098612,0.0,0.405465,1.098612
1,0.0,0.0,0.0,1.098612,0.0,0.0,0.0,1.098612,0.0,0.0,...,0.0,0.0,2.197225,0.0,1.098612,0.0,0.0,1.098612,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.098612,1.098612,0.0,0.405465,0.0,...,0.0,1.098612,0.0,1.098612,0.0,1.098612,0.0,0.0,0.405465,0.0


In [206]:
# export tfidf and vector to csv
tfidf.to_csv('tfidf.csv')
df.to_csv('vector.csv')