In [1]:
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pathlib
import string
from collections import Counter
from natsort import natsorted
import pandas as pd

In [2]:
# Notebook-wide list of the document paths, in the order the documents were read.
# read_data() fills it; later cells use it to label their per-document output.
# (A `global` statement is only meaningful inside a function, so none is needed here.)
data_dir = []

In [3]:
def read_data(dir_path):
    """Read every file in a directory and return their contents.

    The file names are put in natural order (a9.txt before a10.txt) and the
    module-level ``data_dir`` list is refreshed with the resulting paths so
    that other cells can label their output per document.

    Args:
        dir_path: path of the directory that holds the documents.

    Returns:
        list[str]: the text of each file, in the same order as ``data_dir``.

    Side effects:
        Replaces the contents of the global ``data_dir`` and prints each path
        as it is read.
    """
    source_dir = pathlib.Path(dir_path)

    # Only regular files are documents; sub-directories (for example a
    # checkpoint folder) cannot be opened as text and are skipped.
    file_names = natsorted(
        [entry.name for entry in source_dir.iterdir() if entry.is_file()]
    )

    # Reset in place: appending to the previous contents would duplicate every
    # path (and re-read every file) each time this function is called again.
    data_dir.clear()
    data_dir.extend(str(dir_path) + '/' + str(name) for name in file_names)

    data = []
    for path in data_dir:
        print(path)
        # The context manager closes the handle even if read() raises.
        with open(path, "r", encoding="utf-8") as handle:
            data.append(handle.read())

    return data

In [4]:
def grp_tokenization(dataset):
    """Turn one text string into a list of word tokens.

    The text is lower-cased, every character listed in ``string.punctuation``
    is deleted, and the remainder is split with ``nltk.word_tokenize``.

    Args:
        dataset: the text to tokenize (a single string).

    Returns:
        list: the tokens produced by ``nltk.word_tokenize``.
    """
    # Translation table that maps nothing and deletes punctuation characters.
    strip_table = str.maketrans('', '', string.punctuation)
    cleaned = dataset.lower().translate(strip_table)
    return nltk.word_tokenize(cleaned)

In [5]:
def tokenization(txtFile):
    """Tokenize each document of a collection separately.

    Args:
        txtFile: iterable of document strings.

    Returns:
        list: one token list per document, in input order, each produced by
        ``grp_tokenization``.
    """
    return [grp_tokenization(document) for document in txtFile]

In [6]:
def form_GrpCounter(tokens):
    """Count how many times each token occurs.

    Args:
        tokens: iterable of tokens.

    Returns:
        collections.Counter: mapping of token -> number of occurrences.
    """
    frequencies = Counter()
    frequencies.update(tokens)
    return frequencies

In [7]:
def form_counter(tokens):
    """Build one token-frequency counter per document.

    Args:
        tokens: iterable of token lists, one per document.

    Returns:
        list: the result of ``form_GrpCounter`` for each document, in input order.
    """
    return [form_GrpCounter(doc_tokens) for doc_tokens in tokens]

In [8]:
def merger_dataset(dataset, sep=""):
    """Merge all documents of the dataset into a single string.

    Args:
        dataset: iterable of document strings.
        sep: text placed between consecutive documents. The default (empty
            string) is plain concatenation. Pass a newline or a space when the
            documents may not end with whitespace, otherwise the last word of
            one document and the first word of the next fuse into one token.

    Returns:
        str: the merged text; an empty string for an empty dataset.
    """
    # join builds the result in one pass instead of re-copying the growing
    # string on every iteration.
    return sep.join(dataset)


In [9]:
# Directory that holds the text documents to analyse.
dataset_path = "./Data"
# Read every document; read_data also prints each path and fills data_dir.
dataset = read_data(dataset_path)
# One token list per document.
tokens = tokenization(dataset)
# One token-frequency Counter per document.
count = form_counter(tokens)

./Data/a1.txt
./Data/a2.txt
./Data/a3.txt
./Data/a4.txt
./Data/a5.txt
./Data/a6.txt
./Data/a7.txt
./Data/a8.txt
./Data/a9.txt
./Data/a10.txt


In [10]:
# for item in count:
#     for key, val in item.items():
#         print(key, ", ", val)

In [11]:
# Report the number of tokens per document. zip pairs each path with its
# token list, so no manual index has to be kept in notebook state.
for doc_path, doc_tokens in zip(data_dir, tokens):
    print(doc_path, " has: ", len(doc_tokens), " of tokens")

./Data/a1.txt  has:  1672  of tokens
./Data/a2.txt  has:  4223  of tokens
./Data/a3.txt  has:  1905  of tokens
./Data/a4.txt  has:  2885  of tokens
./Data/a5.txt  has:  2998  of tokens
./Data/a6.txt  has:  1542  of tokens
./Data/a7.txt  has:  1421  of tokens
./Data/a8.txt  has:  1591  of tokens
./Data/a9.txt  has:  1401  of tokens
./Data/a10.txt  has:  1247  of tokens


#1 ans: above is the number of tokens in each doc

In [12]:
# Report the number of distinct tokens per document (the size of each Counter).
# zip pairs each path with its Counter, so no manual index is needed.
for doc_path, doc_counter in zip(data_dir, count):
    print(doc_path, " has: ", len(doc_counter), " of unique tokens")

./Data/a1.txt  has:  612  of unique tokens
./Data/a2.txt  has:  1249  of unique tokens
./Data/a3.txt  has:  706  of unique tokens
./Data/a4.txt  has:  969  of unique tokens
./Data/a5.txt  has:  1050  of unique tokens
./Data/a6.txt  has:  615  of unique tokens
./Data/a7.txt  has:  591  of unique tokens
./Data/a8.txt  has:  665  of unique tokens
./Data/a9.txt  has:  568  of unique tokens
./Data/a10.txt  has:  536  of unique tokens


#1 ans: above is number of unique tokens in each doc

In [13]:
# Treat the whole collection as one corpus: join all documents into a single
# string, tokenize it, and count the token frequencies.
merged_dataset = merger_dataset(dataset)
merged_token = grp_tokenization(merged_dataset)
merged_count = form_GrpCounter(merged_token)

In [14]:
# Total number of tokens in the merged corpus (repeated tokens are counted).
len(merged_token)

20885

#1 ans: There are total 20885 tokens in the entire corpus

In [15]:
# Number of distinct tokens in the merged corpus (one Counter key per token).
len(merged_count)

3792

#1 ans: There are total 3792 unique tokens in the entire corpus

////////////////////////////////////////////////// End #1 //////////////////////////////////////////////////

In [16]:
from nltk.corpus import stopwords

In [17]:
def stopWrd_trimmer(dataset):
    """Remove English stop words from every document of a tokenized collection.

    Args:
        dataset: iterable of token lists, one per document.

    Returns:
        list: one list per document holding the tokens that are not in NLTK's
        English stop-word list, in their original order.
    """
    # Load the stop-word list once. Calling stopwords.words() inside the
    # comprehension would rebuild it for every single token; a set also gives
    # constant-time membership tests.
    english_stopwords = set(stopwords.words('english'))

    return [
        [wrd for wrd in file if wrd not in english_stopwords]
        for file in dataset
    ]

In [18]:
def grp_stopWrd_trimmer(dataset):
    """Remove English stop words from a single token list.

    Args:
        dataset: iterable of tokens (for example the merged corpus).

    Returns:
        list: the tokens that are not in NLTK's English stop-word list, in
        their original order.
    """
    # Load the stop-word list once instead of once per token, and use a set
    # for constant-time membership tests.
    english_stopwords = set(stopwords.words('english'))
    return [wrd for wrd in dataset if wrd not in english_stopwords]

In [19]:
# Per-document token lists with the English stop words removed.
stopWrd = stopWrd_trimmer(tokens)
# Token list of the merged corpus with the English stop words removed.
stopWrd_grp = grp_stopWrd_trimmer(merged_token)

In [20]:
# Report how many tokens remain in each document after stop-word removal.
# zip pairs each path with its filtered token list, so no manual index is needed.
for doc_path, doc_tokens in zip(data_dir, stopWrd):
    print("Stop-words-removed for", doc_path, " has: ", len(doc_tokens), " of tokens")

Stop-words-removed for ./Data/a1.txt  has:  972  of tokens
Stop-words-removed for ./Data/a2.txt  has:  2417  of tokens
Stop-words-removed for ./Data/a3.txt  has:  1096  of tokens
Stop-words-removed for ./Data/a4.txt  has:  1715  of tokens
Stop-words-removed for ./Data/a5.txt  has:  1787  of tokens
Stop-words-removed for ./Data/a6.txt  has:  910  of tokens
Stop-words-removed for ./Data/a7.txt  has:  842  of tokens
Stop-words-removed for ./Data/a8.txt  has:  960  of tokens
Stop-words-removed for ./Data/a9.txt  has:  827  of tokens
Stop-words-removed for ./Data/a10.txt  has:  727  of tokens


#2 ans: above is the total tokens in each stop-word doc

In [21]:
# One token-frequency Counter per stop-word-filtered document.
stopWrd_count = form_counter(stopWrd)
# A single token-frequency Counter for the stop-word-filtered merged corpus.
grp_stopWrd_count = form_GrpCounter(stopWrd_grp)

In [22]:
# Report how many distinct tokens remain in each document after stop-word
# removal. zip pairs each path with its Counter, so no manual index is needed.
for doc_path, doc_counter in zip(data_dir, stopWrd_count):
    print("Stop-words-removed for ", doc_path, " has: ", len(doc_counter), " of unique tokens")

Stop-words-removed for  ./Data/a1.txt  has:  535  of unique tokens
Stop-words-removed for  ./Data/a2.txt  has:  1145  of unique tokens
Stop-words-removed for  ./Data/a3.txt  has:  636  of unique tokens
Stop-words-removed for  ./Data/a4.txt  has:  881  of unique tokens
Stop-words-removed for  ./Data/a5.txt  has:  965  of unique tokens
Stop-words-removed for  ./Data/a6.txt  has:  540  of unique tokens
Stop-words-removed for  ./Data/a7.txt  has:  524  of unique tokens
Stop-words-removed for  ./Data/a8.txt  has:  589  of unique tokens
Stop-words-removed for  ./Data/a9.txt  has:  491  of unique tokens
Stop-words-removed for  ./Data/a10.txt  has:  464  of unique tokens


#2 ans: above is the total unique tokens in each stop-word doc

In [23]:
# Number of tokens left in the merged corpus after stop-word removal.
len(stopWrd_grp)

12253

#2 ans: there are a total of 12253 tokens in the entire corpus after stop-word removal

In [24]:
# Number of distinct tokens left in the merged corpus after stop-word removal.
len(grp_stopWrd_count)

3678

#2 ans: there are total 3678 unique tokens in the entire stop-words corpus

////////////////////////////////////////////////// End #2 //////////////////////////////////////////////////

In [25]:
# Single Porter stemmer instance, passed to the stemming helpers and read as a
# global by tokenize().
stemmer = PorterStemmer()

In [26]:
def stemming(tokens, stemmer):
    """Stem the tokens of every document in a collection.

    Args:
        tokens: iterable of token lists, one per document.
        stemmer: object with a ``stem(word)`` method.

    Returns:
        list: one list of stemmed tokens per document, in input order, each
        produced by ``grp_stemming``.
    """
    return [grp_stemming(doc_tokens, stemmer) for doc_tokens in tokens]

In [27]:
def grp_stemming(tokens, stemmer):
    """Stem every token of a single token list.

    Args:
        tokens: iterable of tokens.
        stemmer: object with a ``stem(word)`` method.

    Returns:
        list: ``stemmer.stem(token)`` for each token, in input order.
    """
    return [stemmer.stem(word) for word in tokens]

In [28]:
# Stem each stop-word-filtered document separately (one list per document).
stem = stemming(stopWrd, stemmer)

# Stem the stop-word-filtered merged corpus (one flat list).
grp_stem = grp_stemming(stopWrd_grp, stemmer)

In [29]:
# Report the number of stemmed terms per document. zip pairs each path with
# its stemmed token list, so no manual index is needed.
for doc_path, doc_terms in zip(data_dir, stem):
    print("Stemming for ", doc_path, " has: ", len(doc_terms), " of terms")

Stemming for  ./Data/a1.txt  has:  972  of terms
Stemming for  ./Data/a2.txt  has:  2417  of terms
Stemming for  ./Data/a3.txt  has:  1096  of terms
Stemming for  ./Data/a4.txt  has:  1715  of terms
Stemming for  ./Data/a5.txt  has:  1787  of terms
Stemming for  ./Data/a6.txt  has:  910  of terms
Stemming for  ./Data/a7.txt  has:  842  of terms
Stemming for  ./Data/a8.txt  has:  960  of terms
Stemming for  ./Data/a9.txt  has:  827  of terms
Stemming for  ./Data/a10.txt  has:  727  of terms


In [30]:
# Number of stemmed terms in the merged corpus. Every occurrence is counted,
# so this is not the number of distinct stems (that would be len(set(grp_stem))).
len(grp_stem)

12253

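#3 ans: after stemming, the entire corpus contains a total of 12253 terms (this is len(grp_stem), which counts every occurrence; the vocabulary size, i.e. the number of distinct stems, would be len(set(grp_stem)))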

////////////////////////////////////////////////// End #3 //////////////////////////////////////////////////

In [31]:
def stem_tokens(tokens, stemmer):
    """Return the stem of each token.

    Args:
        tokens: iterable of tokens.
        stemmer: object with a ``stem(word)`` method.

    Returns:
        list: ``stemmer.stem(token)`` for each token, in input order.
    """
    return [stemmer.stem(word) for word in tokens]

def tokenize(text):
    """Full preprocessing of one document, used as the TF-IDF tokenizer.

    Steps: lower-case the text, delete the characters in
    ``string.punctuation``, split with ``nltk.word_tokenize``, drop English
    stop words and stem what is left with the module-level ``stemmer``.

    Args:
        text: the document as a single string.

    Returns:
        list: the stemmed, stop-word-free tokens of the document.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    words = nltk.word_tokenize(text.lower().translate(strip_table))
    content_words = grp_stopWrd_trimmer(words)
    return stem_tokens(content_words, stemmer)

In [32]:
# TF-IDF vectorizer that splits documents with the notebook's tokenize()
# function; all other TfidfVectorizer options are left at their defaults.
TfidfVec = TfidfVectorizer(tokenizer=tokenize)

In [33]:
def cos_similarity(textlist):
    """Compute the pairwise similarity matrix of a list of documents.

    The module-level ``TfidfVec`` is (re)fitted on ``textlist`` and the
    resulting TF-IDF matrix is multiplied by its own transpose.

    Args:
        textlist: iterable of document strings.

    Returns:
        numpy.ndarray: dense square array whose entry [i, j] is the dot
        product of the TF-IDF vectors of documents i and j.
        NOTE(review): this equals the cosine similarity only while TfidfVec
        keeps row normalisation enabled (believed to be the default) - confirm.
    """
    tfidf = TfidfVec.fit_transform(textlist)
    # `@` always means matrix multiplication. `*` does too for scipy sparse
    # matrices but is element-wise for sparse arrays, so `@` is the safe choice.
    return (tfidf @ tfidf.T).toarray()

In [34]:
cossim = cos_similarity(dataset)

In [35]:
# Similarity table labelled with the document paths on both axes. It is built
# in one constructor call instead of growing an empty frame column by column
# and re-indexing afterwards. Entry [row, col] is the similarity between
# document `row` and document `col`.
df = pd.DataFrame(cossim, index=data_dir, columns=data_dir)

In [36]:
# Display the pairwise similarity table (rows and columns are document paths).
df

Unnamed: 0,./Data/a1.txt,./Data/a2.txt,./Data/a3.txt,./Data/a4.txt,./Data/a5.txt,./Data/a6.txt,./Data/a7.txt,./Data/a8.txt,./Data/a9.txt,./Data/a10.txt
./Data/a1.txt,1.0,0.191569,0.138107,0.183352,0.111067,0.145551,0.144206,0.110951,0.10933,0.137202
./Data/a2.txt,0.191569,1.0,0.210857,0.349566,0.19193,0.233374,0.212511,0.1663,0.154452,0.225532
./Data/a3.txt,0.138107,0.210857,1.0,0.178857,0.128805,0.158293,0.14241,0.125205,0.109086,0.143745
./Data/a4.txt,0.183352,0.349566,0.178857,1.0,0.182552,0.170691,0.180399,0.127223,0.119803,0.168167
./Data/a5.txt,0.111067,0.19193,0.128805,0.182552,1.0,0.146551,0.124117,0.102419,0.08469,0.119413
./Data/a6.txt,0.145551,0.233374,0.158293,0.170691,0.146551,1.0,0.147908,0.131506,0.106841,0.154747
./Data/a7.txt,0.144206,0.212511,0.14241,0.180399,0.124117,0.147908,1.0,0.118701,0.100469,0.139972
./Data/a8.txt,0.110951,0.1663,0.125205,0.127223,0.102419,0.131506,0.118701,1.0,0.087759,0.111443
./Data/a9.txt,0.10933,0.154452,0.109086,0.119803,0.08469,0.106841,0.100469,0.087759,1.0,0.102331
./Data/a10.txt,0.137202,0.225532,0.143745,0.168167,0.119413,0.154747,0.139972,0.111443,0.102331,1.0
