In [258]:
# improt modules
import os
import re
import numpy as np
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

In [259]:
# set directory
dir = os.getcwd()
folder = 'data'
path = os.path.join(dir, folder)

# read each document with .txt extension in folder
files = list()

try:
    files = os.listdir(path)
    files = [file for file in files if file.endswith('.txt')]
except:
    print('Folder not found')

In [260]:
# set pattern to filter text to lowercase
pattern = '[^a-zA-Z0-9]'

# create a corpus list of dictionary with key 'id' and 'text'
documents = list()

# loop through each file
for index, file in enumerate(files):

    # read file
    with open(os.path.join(path, file), 'r') as f:
        
        # check if extension is .txt
        if file.endswith('.txt'):

            # read file
            text = f.read()
            
            # filter text to lowercase
            text = re.sub(pattern, ' ', text)
            text = text.lower()
            
            # append to corpus
            documents.append({'id': index, 'text': text})

In [261]:
def generate_corpus(documents: list) -> dict:
    '''
    Function to create a corpus from a list of documents
    
    Parameters
    ---
    documents: list
        list of documents
    
    Returns
    ---
    corpus: dict
        dictionary of corpus
    '''
    # create a set for the corpus
    corpus = set()

    # loop through each document, keep adding the words to the corpus
    for document in documents:
        corpus.update(document['text'].split())

    # create a dictionary with key 'word' and value 'index'
    corpus = dict([(word, index + 1) for index, word in enumerate(corpus)])
    
    return corpus

In [262]:
# create and display corpus
corpus = generate_corpus(documents)
corpus

{'pada': 1,
 'keadaan': 2,
 'aceh': 3,
 'terhadap': 4,
 'abad': 5,
 'dari': 6,
 'adjeng': 7,
 'kemudian': 8,
 'lebih': 9,
 'yang': 10,
 'jawa': 11,
 'walanda': 12,
 'belanda': 13,
 'karena': 14,
 'perlawanannya': 15,
 'indonesia': 16,
 'permulaan': 17,
 'ke': 18,
 'di': 19,
 'ayu': 20,
 'ibrahim': 21,
 'catherine': 22,
 'masa': 23,
 'nusantara': 24,
 '20': 25,
 'sebenarnya': 26,
 'pelopor': 27,
 'maramis': 28,
 'usahanya': 29,
 'dhien': 30,
 'pribumi': 31,
 'melawan': 32,
 'wanita': 33,
 'tarum': 34,
 'raden': 35,
 'tewasnya': 36,
 'seorang': 37,
 'maria': 38,
 'tokoh': 39,
 'sebagai': 40,
 'josephine': 41,
 'disebut': 42,
 'pahlawan': 43,
 'berjuang': 44,
 'menyeret': 45,
 'jauh': 46,
 'mengembangkan': 47,
 'dalam': 48,
 'untuk': 49,
 'cut': 50,
 'lamnga': 51,
 'kebangkitan': 52,
 'gle': 53,
 'nasional': 54,
 'nyak': 55,
 'kartini': 56,
 'tepat': 57,
 'perang': 58,
 'atau': 59,
 'dan': 60,
 'dikenal': 61,
 'adalah': 62,
 'perempuan': 63}

In [263]:
def generate_vector(documents: list, corpus: dict) -> pd.DataFrame:
    '''
    Function to create vector for each document in corpus
    
    Parameters
    ---
    documents: dict
        Dictionary of documents with key 'id' and 'text'

    corpus: dict
        Dictionary of corpus with key 'word' and value 'index'
    '''

    # loop for each document in documents
    for document in documents:

        # generate list of the occurance for each word in corpus
        vector = [document['text'].split().count(word) for word in corpus.keys()]

        # set data to dictionary
        document['vector'] = np.array(vector).astype(float)
    
    # get data, index and columns for dataframe
    data = [document['vector'] for document in documents]
    index = [document['id'] for document in documents]
    columns = [word for word in corpus.keys()]

    # create dataframe
    df = pd.DataFrame(data=data, index=index, columns=columns)
    
    # return dataframe
    return df

In [264]:
# create a dataframe of vector for each document, with index as document id, and column as word in corpus
df = generate_vector(documents, corpus)

# print first 10 columns
df.iloc[:, :9]

Unnamed: 0,pada,keadaan,aceh,terhadap,abad,dari,adjeng,kemudian,lebih
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1.0,0.0,2.0,1.0,0.0,1.0,0.0,1.0,1.0
2,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [265]:
def generate_tfidf(matrix: pd.DataFrame, corpus: dict) :
    '''
    Function to calculate tf-idf for each word in each document

    tfidf = tf . log N / df
    tf = log 1 + tf

    - tf = term frequency
    - df = document frequency
    - N = number of documents

    Parameters
    ---
    matrix: pd.DataFrame
        matrix of vector for each document, with index as document id, and column as occurance of word in corpus

    corpus: dict
        Dictionary of corpus with key 'word' and value 'index'

    Returns
    ---
    tfidf: pd.DataFrame

    '''
    # get number of documents
    N = len(matrix)

    # get document frequency for each word in corpus
    df = matrix.astype(bool).sum(axis=0)

    # get term frequency for each word in each document
    tf = matrix / matrix.sum(axis=1).values.reshape(-1, 1)

    # calculate tf-idf
    tfidf = tf * np.log(N / df)

    # return tf-idf dataframe
    return tfidf


In [266]:
tfidf = generate_tfidf(df, corpus)
tfidf.iloc[:, :9]

Unnamed: 0,pada,keadaan,aceh,terhadap,abad,dari,adjeng,kemudian,lebih
0,0.0,0.0,0.0,0.0,0.0,0.0,0.040689,0.0,0.0
1,0.011585,0.0,0.062778,0.031389,0.0,0.031389,0.0,0.031389,0.0
2,0.013516,0.03662,0.0,0.0,0.03662,0.0,0.0,0.0,0.0


In [267]:
# export tfidf and vector to csv
tfidf.to_csv('tfidf.csv')
df.to_csv('vector.csv')