In [2]:
"""
Exit Code:
    0: Error opening file
"""

import os
from os import listdir
from os.path import isfile, join

import re
from collections import Counter

import pandas as pd
import numpy as np

from os import listdir
from os.path import isfile, join

In [3]:
"""
Define formatted display function
Used to replace regular print function
"""
# Get name of an object
def namestr(obj, namespace):
    return [name for name in namespace if namespace[name] is obj]

# Display with format
def display(items, func=None):
    print(namestr(items, globals()))
    for item in items:
        if func:
            item = func(item)
        print("     {0}".format(item))

# Test display
# test_dict = {"A": [1, 2, 3], "B": [4, 5, 6]}
# display(test_dict)

In [4]:
""" Define function to filter out directories Results_1, Results_2, Results_3
@param Assume Input does NOT start with "./" or "../"
"""
def valid_direct(direct_name):
    assert isinstance(direct_name, str)
    
    # Exculde if name starts with "."
    if direct_name.startswith("."):
        return False
    
    # Test if name starts with ngram
    if re.match("Results_\d", direct_name):
        return True
    
    # Otherwise
    return False

In [5]:
"""
Get filtering results
"""
datapath = "./"
directories = sorted([direct for direct in os.listdir(datapath) if valid_direct(direct)])
print(directories)

['Results_1', 'Results_2', 'Results_3']


In [6]:
"""
Function checking valid filenames.
-- Only accepts files starts with "journal-article"
"""
def valid_filename(filename):
    if re.match("^journal-article-.+-ngram1.txt$", filename):
        return True
    return False
# print(valid_filename("journal-article-10.2307_43488821-ngram1.txt"))

In [7]:
""" Define function getting article ID from a filename by parsing pattern
Pattern: journal-article-10.2307_977118-ngram1.txt
"""
def parse_id(filename):
    id_number_lst = re.findall("_(.+)-ngram1.txt", filename)
    if len(id_number_lst) == 1:
        return id_number_lst[0]
    print("Parse_ID Error: Filename does not match pattern. ")
    return None
# print(parse_id("journal-article-10.2307_977503-ngram1.txt"))

In [68]:
"""
All Columns features for N-Gram [1-3]
"""
ngram_columns = [
    ["word_ngram_1", "freq_ngram_1"],
    ["word1_ngram_2", "word2_ngram_2", "freq_ngram_2"],
    ["word1_ngram_3", "word2_ngram_3", "word3_ngram_3", "freq_ngram_3"]
]

In [69]:
""" Merge Results for cleaned N-Gram data
1. Collect results from Results_1 directory and gather all information into a DataFrame
2. Assume all files in the Results_N directory is valid in content
3. Filenames will be checked
@param n_number: N for Results_N; n_number can be int -> [1, 2, 3]
"""
def collect_data(n_number):
    
    assert type(n_number) == int

    # directpath = "./Results_1/" .etc
    direct_folder = join(datapath + "Results_" + str(n_number) + "/")
    
    """
    DATA
    """
    # Initiate freq_list -> {{(words0): freq0}, {(words1), freq1}}
    freq_dict = {}
    
    # Print Number of Files Found in Directory
    num_of_files = len(listdir(direct_folder))
    print("{0} files found in directory '{1}'".format(num_of_files, direct_folder))
    
    # Keep track of the progress
    count = 0
    progress = list(range(0, num_of_files, num_of_files // 99))
    
    data = []
    
    # Iterate through the files in the directory
    for filename in listdir(direct_folder):
        
        direct_file = join(direct_folder, filename)
        
        # Validate if is file and if valid filename
        if isfile(direct_file) and valid_filename(filename):
            # Open file
            try:
                file_open = open(direct_file, mode="r", encoding="utf-8")
            except Exception as e:
                print("Error opening file {0}".format(filename))
                print("Error message: <{0}>".format(e))
                exit(0)

            # Read by line
            # line -> "word1 word2 word3 5"
            for line in file_open:
                
                assert isinstance(line, str)
                
                # pair -> "["word1", "word2", "word3", "5"]
                pair = line.strip().split()
                assert len(pair) >= 2

                # Separate word/freq
                # Words in tuple form since it will be the key in file_dict
                # freq will be a int digit as value in file_dict 
                words, freq = pair[:-1], pair[-1]

                # Initiate dataline
                dataline = []
                
                # Save into dataline
                for word in words:
                    dataline.append(word)
                dataline.append(freq)
                
                # Save dataline into data
                data.append(dataline)

            # Close reading file
            file_open.close()

            # Track Progress
            count += 1
            if len(progress) > 0 and count >= progress[0]:
                print("Progress: {}%".format(100 - len(progress)))
                progress.pop(0)

    """
    Columns
    """
    # "-1" for matching correct index
    columns = ngram_columns[n_number - 1] 

    """
    Index
    """
    index = list(range(len(data)))

    """
    DataFrame
    """
    dataframe = pd.DataFrame(data, columns=columns, index=index)
    
    return dataframe


In [70]:
""" 
DataFrame Creation for N-Gram [1] 
"""
dataframe = collect_data(1)

print(dataframe.head())
print(dataframe.size)

13382 files found in directory './Results_1/'
Progress: 0%
Progress: 1%
Progress: 2%
Progress: 3%
Progress: 4%
Progress: 5%
Progress: 6%
Progress: 7%
Progress: 8%
Progress: 9%
Progress: 10%
Progress: 11%
Progress: 12%
Progress: 13%
Progress: 14%
Progress: 15%
Progress: 16%
Progress: 17%
Progress: 18%
Progress: 19%
Progress: 20%
Progress: 21%
Progress: 22%
Progress: 23%
Progress: 24%
Progress: 25%
Progress: 26%
Progress: 27%
Progress: 28%
Progress: 29%
Progress: 30%
Progress: 31%
Progress: 32%
Progress: 33%
Progress: 34%
Progress: 35%
Progress: 36%
Progress: 37%
Progress: 38%
Progress: 39%
Progress: 40%
Progress: 41%
Progress: 42%
Progress: 43%
Progress: 44%
Progress: 45%
Progress: 46%
Progress: 47%
Progress: 48%
Progress: 49%
Progress: 50%
Progress: 51%
Progress: 52%
Progress: 53%
Progress: 54%
Progress: 55%
Progress: 56%
Progress: 57%
Progress: 58%
Progress: 59%
Progress: 60%
Progress: 61%
Progress: 62%
Progress: 63%
Progress: 64%
Progress: 65%
Progress: 66%
Progress: 67%
Progress: 68

# Get a Test File (NGram 1)

In [71]:
import zipfile
mypath = "./"
zip_file_name = "test_files"
zip_file = zipfile.ZipFile(mypath + zip_file_name + ".zip")
file_name_list = zip_file.namelist()
print(len(file_name_list))

159935


# Filter filenames for Test Files 

In [72]:
N_number = 1
def filter_by_filename(files_list):
    """Function filtering files by filenames.
    Only accepts files starts with "journal-article"
    """
    filtered_list = []
    mask_match = "ngram" + str(N_number) + "/journal-article"
    for filename in files_list:
        # Check if the filename starts with "journal-article"
        assert isinstance(filename, str)
        # Check the first 20 characters of the file name
        if filename.startswith(mask_match, 0, len(mask_match)):
            filtered_list.append(filename)
    return filtered_list

# Run "filter_by_filename" for current directory
filtered_list = filter_by_filename(file_name_list)
print("{0} files remained after filename filtering. ".format(len(filtered_list)))

36267 files remained after filename filtering. 


# Check word validity for Test Files

In [73]:
# Functions checking word attributes (single-letter, starts/ends with numebr)
def is_single_letter(word):
    assert isinstance(word, str)
    return len(word) <= 1

def starts_with_number(word):
    assert isinstance(word, str)
    try:
        return word[0].isdigit()
    except:
        return False

def ends_with_number(word):
    assert isinstance(word, str)
    try:
        return word[len(word) - 1].isdigit()
    except:
        return False

# Summary of check functions
check_funcs = [
    is_single_letter, 
    starts_with_number, 
    ends_with_number,
]

# Create DataFrame for Test File

In [78]:
# Get a test file 
test_file_name = filtered_list[0]

"""
Data
"""
# Open file
try:
    file_open = zip_file.open(test_file_name, mode="r")
except IOError:
    print("Error opening file {0}".format(file))
    exit(0)

data_test = []    

# Read by line
for line in file_open:
    # Line decode using UTF-8
    line = line.decode("utf-8")
    assert isinstance(line, str)
    pair = line.strip().split()

    # Separate word/freq
    word, freq = pair
    assert isinstance(word, str)

    # Filter by word's attribute
    check_results = [check_func(word) for check_func in check_funcs]
    if any(check_results):
        continue

    # Update data for DataFrame
    data_test.append([word, freq])
    
"""
Columns
"""
columns = ["word", "freq"]

"""
Index
"""
index = list(range(len(data_test)))

"""
DataFrame
"""
dataframe_test = pd.DataFrame(data_test, columns=columns, index=index)

# Mapping Process

In [80]:
import time
match_count = 0

# Keep track of the progress
count = 0
progress = np.linspace(0, dataframe_test.size, num=100).tolist()

# Iterate through the test's dataframe
for index, row in dataframe_test.iterrows():
    
    word_test, freq_test = row["word"], row["freq"]
    
    print("\nword_test: {}".format(word_test))
    
    # Check if the word exist in the training set
    # Count time for reference
    start = time.time()
    if dataframe["word_ngram_1"].str.contains(word_test).any():
        match_count += 1
        print("==== Match! ====")
    else:
        print("==== Not Match ====")
    end = time.time()
    
    print("Time: {}".format(end - start))
    
    # Track Progress
    count += 1
    if len(progress) > 0 and count >= progress[0]:
        print("Progress: {}%".format(100 - len(progress)))
        progress.pop(0)

match_rate = match_count / dataframe_test.shape[0]

print(match_rate)


word_test: we
==== Match! ====
Time: 14.061488628387451
Progress: 0%

word_test: history
==== Match! ====
Time: 14.3613121509552

word_test: oral
==== Match! ====
Time: 14.325449466705322

word_test: all
==== Match! ====
Time: 14.232678651809692

word_test: advice
==== Match! ====
Time: 14.303768396377563
Progress: 1%

word_test: evidence
==== Match! ====
Time: 14.406672477722168

word_test: experiences
==== Match! ====
Time: 13.9123854637146

word_test: first
==== Match! ====
Time: 14.054575681686401

word_test: have
==== Match! ====
Time: 14.344773292541504
Progress: 2%

word_test: its
==== Match! ====
Time: 14.399255990982056

word_test: should
==== Match! ====
Time: 14.125980615615845

word_test: were
==== Match! ====
Time: 14.06918215751648

word_test: also
==== Match! ====
Time: 14.55257534980774

word_test: assessment
==== Match! ====
Time: 14.230801582336426
Progress: 3%

word_test: book
==== Match! ====
Time: 13.951858043670654

word_test: both
==== Match! ====
Time: 14.28754

==== Match! ====
Time: 14.058716773986816
Progress: 29%

word_test: make
==== Match! ====
Time: 14.367667198181152

word_test: man
==== Match! ====
Time: 14.180984020233154

word_test: map
==== Match! ====
Time: 13.97757363319397

word_test: memories
==== Match! ====
Time: 14.109720706939697

word_test: men
==== Match! ====
Time: 14.34794807434082
Progress: 30%

word_test: message
==== Match! ====
Time: 14.15237307548523

word_test: most
==== Match! ====
Time: 14.025007724761963

word_test: nation
==== Match! ====
Time: 14.474676609039307

word_test: national
==== Match! ====
Time: 14.246641159057617
Progress: 31%

word_test: newsworthy
==== Match! ====
Time: 13.873561382293701

word_test: nigel
==== Match! ====
Time: 14.318544626235962

word_test: north
==== Match! ====
Time: 14.478025674819946

word_test: nothing
==== Match! ====
Time: 14.378855228424072

word_test: organised
==== Match! ====
Time: 13.87440013885498
Progress: 32%

word_test: other
==== Match! ====
Time: 14.4128730297