In [3]:
"""
Exit Code:
    0: Error opening file
"""

import os
from os import listdir
from os.path import isfile, join

import re
from collections import Counter

import pandas as pd
import numpy as np

from os import listdir
from os.path import isfile, join

In [4]:
"""
Define formatted display function
Used to replace regular print function
"""
# Get name of an object
def namestr(obj, namespace):
    """Return every name in ``namespace`` that is bound to ``obj`` itself.

    Matching is by identity (``is``), not equality, so two equal but distinct
    objects are reported under their own names only.

    @param obj: the object to look up
    @param namespace: a mapping of name -> object, e.g. ``globals()``
    @return: list of matching names, in the mapping's iteration order
    """
    matches = []
    for name, value in namespace.items():
        if value is obj:
            matches.append(name)
    return matches

# Display with format
def display(items, func=None):
    """Print the global name(s) of ``items`` followed by each element, indented.

    NOTE(review): this name shadows the ``display`` helper that IPython makes
    available in notebooks -- confirm that is intended.

    @param items: any iterable; iterating a dict yields its keys
    @param func: optional callable applied to each element before printing
    """
    # First line: the list of global variable names bound to this object.
    print(namestr(items, globals()))
    for entry in items:
        shown = func(entry) if func else entry
        print("     {0}".format(shown))

# Test display
# Smoke test for display(): iterating a dict yields its keys, so this prints
# the variable name followed by "A" and "B".
test_dict = dict(A=[1, 2, 3], B=[4, 5, 6])
display(test_dict)

['test_dict']
     A
     B


In [5]:
""" Define function to filter out directories Results_1, Results_2, Results_3
@param Assume Input does NOT start with "./" or "../"
"""
def valid_direct(direct_name):
    """Return True when ``direct_name`` looks like a ``Results_<digit>`` directory.

    Only the beginning of the name is checked: it must start with the literal
    text ``Results_`` immediately followed by a digit. Hidden entries (names
    starting with ".") can never satisfy that, so they are rejected as well.

    @param direct_name: bare directory name (str) without a "./" or "../" prefix
    @return: True for names such as "Results_1", False otherwise
    @raise AssertionError: if ``direct_name`` is not a str
    """
    assert isinstance(direct_name, str)

    # Raw string so that "\d" reaches the regex engine as a digit class instead
    # of being treated as an (invalid) string escape sequence.
    return re.match(r"Results_\d", direct_name) is not None

In [6]:
"""
Get filtering results
"""
# Folder that is scanned for the Results_N directories.
datapath = "./"
# Keep only the entries accepted by valid_direct, in sorted order.
directories = sorted(filter(valid_direct, os.listdir(datapath)))
print(directories)

['Results_1', 'Results_2', 'Results_3']


In [7]:
"""
Function checking valid filenames.
-- Only accepts files starts with "journal-article"
"""
def valid_filename(filename):
    """Return True when ``filename`` is named like a unigram result file.

    Accepted names have the form ``journal-article-<anything>-ngram1.txt``,
    e.g. "journal-article-10.2307_43488821-ngram1.txt". The whole name must
    match, and the dot before "txt" is matched literally.

    @param filename: bare file name (str), without any directory part
    @return: True if the name matches the pattern, False otherwise
    """
    return re.fullmatch(r"journal-article-.+-ngram1\.txt", filename) is not None
# print(valid_filename("journal-article-10.2307_43488821-ngram1.txt"))

In [8]:
""" Define function getting article ID from a filename by parsing pattern
Pattern: journal-article-10.2307_977118-ngram1.txt
"""
def parse_id(filename):
    """Extract the article ID from a unigram result filename.

    For "journal-article-10.2307_977118-ngram1.txt" the ID is "977118": the
    text between the first underscore and the "-ngram1.txt" suffix.

    @param filename: file name (str) to parse
    @return: the ID as a str, or None (after printing an error message) when
             the name does not contain exactly one match of the pattern
    """
    # Raw string with an escaped dot so only a literal ".txt" suffix matches.
    id_number_lst = re.findall(r"_(.+)-ngram1\.txt", filename)
    if len(id_number_lst) != 1:
        print("Parse_ID Error: Filename does not match pattern. ")
        return None
    return id_number_lst[0]
# print(parse_id("journal-article-10.2307_977503-ngram1.txt"))

In [9]:
"""
All Columns features for N-Gram [1-3]
"""
# Column names for the DataFrame built from each Results_N directory.
# The list is positional: collect_data looks up ngram_columns[n_number - 1],
# so entry 0 is for N=1, entry 1 for N=2 and entry 2 for N=3. Each entry lists
# the word column(s) followed by one frequency column.
ngram_columns = [
    ["word_ngram_1", "freq_ngram_1"],  # N = 1
    ["word1_ngram_2", "word2_ngram_2", "freq_ngram_2"],  # N = 2
    ["word1_ngram_3", "word2_ngram_3", "word3_ngram_3", "freq_ngram_3"]  # N = 3
]

In [14]:
""" Merge Results for cleaned N-Gram data
1. Collect results from Results_1 directory and gather all information into a DataFrame
2. Assume all files in the Results_N directory is valid in content
3. Filenames will be checked
@param n_number: N for Results_N; n_number can be int -> [1, 2, 3]
"""
def _add_ngram_counts(lines, freq_dict):
    """Fold "word1 [word2 ...] freq" lines into ``freq_dict`` in place.

    Every whitespace-separated token except the last forms the key tuple; the
    last token is parsed as an integer count and added to that key's total.
    """
    for line in lines:
        # line -> "word1 word2 word3 5"; pair -> ["word1", "word2", "word3", "5"]
        pair = line.split()
        # Skip blank lines (e.g. a trailing empty line at the end of a file).
        if not pair:
            continue
        assert len(pair) >= 2

        # Words become a tuple so they can be used as a dictionary key.
        words, freq = tuple(pair[:-1]), int(pair[-1])
        freq_dict[words] = freq_dict.get(words, 0) + freq


def collect_data(n_number):
    """Merge all valid result files of ``Results_<n_number>`` into one DataFrame.

    Every entry of the directory that is a regular file and passes
    ``valid_filename`` is read line by line; the frequencies of identical word
    tuples are summed across files. Progress is printed as a percentage of the
    directory entries visited so far.

    NOTE(review): ``valid_filename`` only accepts names ending in
    "-ngram1.txt". If the files in Results_2 / Results_3 use a different
    suffix they are all skipped and the result is empty -- confirm.

    @param n_number: N for Results_N; an int in [1, 2, 3]
    @return: DataFrame with the columns ``ngram_columns[n_number - 1]``, one
             row per distinct word tuple, using the default integer index
    @raise AssertionError: if ``n_number`` is not an int, or a non-blank line
                           has fewer than two tokens
    @raise SystemExit: with code 0 when a file cannot be opened
    """
    assert type(n_number) is int

    # direct_folder -> "./Results_1/" etc.
    direct_folder = datapath + "Results_" + str(n_number) + "/"

    # List the directory once; sorting makes the row order of the result
    # independent of the order in which the OS returns the entries.
    filenames = sorted(listdir(direct_folder))
    num_of_files = len(filenames)
    print("{0} files found in directory '{1}'".format(num_of_files, direct_folder))

    # freq_dict -> {(words0): freq0, (words1): freq1, ...}
    freq_dict = {}

    # Last percentage printed, so each value is reported only once.
    last_percent = -1

    for position, filename in enumerate(filenames, start=1):
        direct_file = join(direct_folder, filename)

        # Only regular files with an accepted name are read.
        if isfile(direct_file) and valid_filename(filename):
            try:
                file_open = open(direct_file, mode="r", encoding="utf-8")
            except Exception as e:
                print("Error opening file {0}".format(filename))
                print("Error message: <{0}>".format(e))
                # Abort with the exit code documented at the top of this file
                # (0 = error opening file). Raising SystemExit stops execution
                # right here; the exit() helper is supplied by the interactive
                # shell and is not guaranteed to do so.
                raise SystemExit(0)

            # The context manager closes the file even if parsing fails.
            with file_open:
                _add_ngram_counts(file_open, freq_dict)

        # Progress is measured against all directory entries, so it reaches
        # 100% even when some entries are skipped, and it works for
        # directories with fewer than 99 entries.
        percent = position * 100 // num_of_files
        if percent > last_percent:
            print("Progress: {}%".format(percent))
            last_percent = percent

    # One row per word tuple: [word1, ..., wordN, freq]
    data = [list(words) + [freq] for words, freq in freq_dict.items()]

    # "-1" maps N (1-based) onto the list index (0-based).
    columns = ngram_columns[n_number - 1]

    return pd.DataFrame(data, columns=columns)


In [15]:
""" 
DataFrame Creation for N-Gram [1] 
"""
dataframe1 = collect_data(1)

# Leave the preview as the cell's last expression so the notebook renders it
# as a table instead of printing plain text.
dataframe1.head()

13382 files found in directory './Results_1/'
Progress: 0%
Progress: 1%
Progress: 2%
Progress: 3%
Progress: 4%
Progress: 5%
Progress: 6%
Progress: 7%
Progress: 8%
Progress: 9%
Progress: 10%
Progress: 11%
Progress: 12%
Progress: 13%
Progress: 14%
Progress: 15%
Progress: 16%
Progress: 17%
Progress: 18%
Progress: 19%
Progress: 20%
Progress: 21%
Progress: 22%
Progress: 23%
Progress: 24%
Progress: 25%
Progress: 26%
Progress: 27%
Progress: 28%
Progress: 29%
Progress: 30%
Progress: 31%
Progress: 32%
Progress: 33%
Progress: 34%
Progress: 35%
Progress: 36%
Progress: 37%
Progress: 38%
Progress: 39%
Progress: 40%
Progress: 41%
Progress: 42%
Progress: 43%
Progress: 44%
Progress: 45%
Progress: 46%
Progress: 47%
Progress: 48%
Progress: 49%
Progress: 50%
Progress: 51%
Progress: 52%
Progress: 53%
Progress: 54%
Progress: 55%
Progress: 56%
Progress: 57%
Progress: 58%
Progress: 59%
Progress: 60%
Progress: 61%
Progress: 62%
Progress: 63%
Progress: 64%
Progress: 65%
Progress: 66%
Progress: 67%
Progress: 68

In [None]:
""" 
DataFrame Creation for N-Gram [2] 
"""
# Merge the files of the Results_2 directory into one DataFrame.
dataframe2 = collect_data(n_number=2)

In [None]:
""" 
DataFrame Creation for N-Gram [3] 
"""
# Merge the files of the Results_3 directory into one DataFrame.
dataframe3 = collect_data(n_number=3)