In [1]:
"""
This notebook analyzes the .txt files (which contain ngram2 data) in the ngram2 folder 
of a zip file and writes the filtered results to an output folder 
"""

import os
import zipfile

import numpy as np
import pandas as pd

# N-gram order handled by this notebook (2 = word pairs). The value is used
# further down to build the "ngram<N>/journal-article" prefix that selects
# which archive members are processed.
N_number = 2

In [2]:
def namestr(obj, namespace):
    """Return the names in ``namespace`` that are bound to ``obj`` itself.

    Matching is by identity (``is``), not equality, so an equal but distinct
    object is not reported. Names come back in the mapping's iteration order.
    """
    matches = []
    for key in namespace:
        if namespace[key] is obj:
            matches.append(key)
    return matches

def display(items):
    """Print the global name(s) bound to ``items``, then each element indented.

    The header line is the list returned by ``namestr`` for this module's
    globals; every element of ``items`` follows on its own line, prefixed
    with five spaces.
    """
    bound_names = namestr(items, globals())
    print(bound_names)
    row_template = "     {0}"
    for entry in items:
        print(row_template.format(entry))

# Test display: the header line lists the global name(s) bound to the object,
# so the variable name used here is part of the printed output. Iterating a
# dict yields its keys, which is why only "A" and "B" are shown.
test_dict = {"A": [1, 2, 3], "B": [4, 5, 6]}
display(test_dict)

['test_dict']
     A
     B


In [3]:
from os import listdir
from os.path import isfile, join

mypath = "./"
zip_file_name = "receipt-id-773931-part-001"

# The archive handle is kept open at module level; later cells read members from it.
archive_path = mypath + zip_file_name + ".zip"
zip_file = zipfile.ZipFile(archive_path)

# Every entry stored in the archive, as reported by the zip directory.
# Original author's note: should return 4 folders if testing on ASG Zip.
file_name_list = zip_file.namelist()
entry_count = len(file_name_list)
print("{0} files detected under current directory. ".format(entry_count))

53528 files detected under current directory. 


In [5]:
def filter_by_filename(files_list, n_number=None):
    """Keep only the n-gram journal-article entries from a list of archive names.

    An entry is kept when its name starts with
    ``"ngram<N>/journal-article"``, where ``<N>`` is the n-gram order.

    Args:
        files_list: iterable of archive member names (each must be a ``str``).
        n_number: n-gram order used to build the prefix. Defaults to the
            notebook-level ``N_number`` when omitted, which is the original
            behavior.

    Returns:
        A new list with the matching names, in their original order.

    Raises:
        AssertionError: if an entry of ``files_list`` is not a ``str``.
    """
    if n_number is None:
        n_number = N_number
    mask_match = "ngram" + str(n_number) + "/journal-article"

    filtered_list = []
    for filename in files_list:
        assert isinstance(filename, str)
        # A plain prefix test; no explicit slice bounds are needed.
        if filename.startswith(mask_match):
            filtered_list.append(filename)
    return filtered_list

# Apply the filename filter to every entry listed in the archive
filtered_list = filter_by_filename(file_name_list)
remaining_total = len(filtered_list)
print("{0} files remained after filename filtering. ".format(remaining_total))

13382 files remained after filename filtering. 


In [6]:
# Functions checking word attributes (single-letter, starts/ends with a number)
def is_single_letter(word):
    """Return True when ``word`` has at most one character (empty included)."""
    assert isinstance(word, str)
    if len(word) > 1:
        return False
    return True

def starts_with_number(word):
    """Return True when the first character of ``word`` is a digit.

    An empty string returns False.

    Raises:
        AssertionError: if ``word`` is not a ``str``.
    """
    assert isinstance(word, str)
    # Slicing never raises on an empty string ("".isdigit() is False), so no
    # catch-all exception handler is needed for that case.
    return word[:1].isdigit()

def ends_with_number(word):
    """Return True when the last character of ``word`` is a digit.

    An empty string returns False.

    Raises:
        AssertionError: if ``word`` is not a ``str``.
    """
    assert isinstance(word, str)
    # Slicing never raises on an empty string ("".isdigit() is False), so no
    # catch-all exception handler is needed for that case.
    return word[-1:].isdigit()

# Summary of check functions
# Predicates applied to every word of an n-gram; a hit on any of them causes
# the n-gram to be dropped. is_single_letter is deliberately left out here
# because, per the original note, it only applies to 1-grams.
check_funcs = [starts_with_number, ends_with_number]

# Summary of checking mechanisms
def check_words(words, checks=None):
    """Report whether any word trips any of the filtering predicates.

    Args:
        words: iterable of words (one n-gram).
        checks: iterable of one-argument predicates. Defaults to the
            notebook-level ``check_funcs`` when omitted, which is the
            original behavior. Pass a different set for other n-gram orders.

    Returns:
        True if at least one predicate returns a truthy value for at least
        one word, otherwise False (including for empty ``words``).
    """
    if checks is None:
        checks = check_funcs
    # Stops at the first hit instead of evaluating every predicate per word.
    return any(check(word) for word in words for check in checks)

In [7]:
# Parse the first filtered archive member as a smoke test of the pipeline.
file_0 = filtered_list[0]

# Collected [words, freq] entries that pass the word filters
freq_list = []

try:
    journal_0 = zip_file.open(file_0, mode="r")
except (KeyError, IOError):
    # ZipFile.open raises KeyError for a missing member. Re-raise instead of
    # calling exit(0), which would report success and shut the kernel down.
    print("Error opening file {0}".format(file_0))
    raise

# The context manager closes the member even if an assertion below fails.
with journal_0:
    # Read file line by line
    for line in journal_0:
        # UTF-8 (a superset of ASCII) matches the decoding used by the batch cell.
        line = line.decode("utf-8")
        pair = line.strip().split()

        # Each line is expected to be "<word1> <word2> <freq>"
        assert len(pair) == 3

        # Separate word/freq
        words, freq = pair[:-1], pair[-1]
        assert len(words) == 2
        assert freq.isdigit()

        # Filter by word's attribute
        if check_words(words):
            continue
        freq_list.append([words, freq])

display(freq_list)

['freq_list']
     [['information', 'systems'], '27']
     [['p', 'i'], '21']
     [['complex', 'environments'], '19']
     [['planning', 'information'], '16']
     [['new', 'york'], '15']
     [['person', 'centered'], '15']
     [['simple', 'environments'], '15']
     [['information', 'system'], '14']
     [['risk', 'omission'], '13']
     [['i', 'i'], '12']
     [['infor', 'mation'], '12']
     [['p', 'r'], '12']
     [['r', 'p'], '12']
     [['i', 'p'], '11']
     [['low', 'risk'], '11']
     [['r', 'r'], '11']
     [['information', 'sets'], '10']
     [['planning', 'problems'], '10']
     [['information', 'set'], '9']
     [['management', 'review'], '9']
     [['plan', 'ning'], '9']
     [['can', 'used'], '8']
     [['informa', 'tion'], '8']
     [['nonperson', 'centered'], '8']
     [['planning', 'problem'], '8']
     [['simple', 'complex'], '8']
     [['centered', 'information'], '7']
     [['planning', 'methods'], '7']
     [['planning', 'process'], '7']
     [['preferred', 'inf

In [8]:
"""
Create Dataframe
"""
# Column labels: the two words of the bigram, then its frequency
columns = ["word1", "word2", "freq"]

# One flat row per kept bigram
data = []
for bigram, count in freq_list:
    assert len(bigram) == 2
    first_word, second_word = bigram
    data.append([first_word, second_word, count])

# Explicit 0..n-1 integer index, one label per entry of freq_list
row_total = len(freq_list)
index = list(range(row_total))

dataframe = pd.DataFrame(data, columns=columns, index=index)

# Show the first rows as the cell output
dataframe.head()

Unnamed: 0,word1,word2,freq
0,information,systems,27
1,p,i,21
2,complex,environments,19
3,planning,information,16
4,new,york,15


In [None]:
"""
Run filtering for each file
"""
# Archive members are named "ngram<N>/<file name>"; strip exactly that prefix
# so results land flat inside the output directory.
ngram_prefix = "ngram" + str(N_number) + "/"
output_dir = mypath + "Results_" + str(N_number) + "/"
# Create the output directory up front so open() below cannot fail on it.
os.makedirs(output_dir, exist_ok=True)

total_files = len(filtered_list)
# Last percentage printed; used to print each whole percent only once.
last_percent = -1

for file_num, file in enumerate(filtered_list):
    # Track for max length word per column
    # Used for formatting output data
    max_length = [0, 0]

    # Initiate freq_list -> [[words0, freq0], [words1, freq1]]
    freq_list = []

    # Open file
    try:
        file_open = zip_file.open(file, mode="r")
    except (KeyError, IOError):
        # ZipFile.open raises KeyError for a missing member. Re-raise instead
        # of calling exit(0), which would report success and kill the kernel.
        print("Error opening file {0}".format(file))
        raise

    # The context manager closes the member even if an assertion fails.
    with file_open:
        # Read by line
        for line in file_open:
            # Line decode using UTF-8
            pair = line.decode("utf-8").strip().split()

            # Separate word/freq
            words, freq = pair[:-1], pair[-1]
            assert len(words) == 2
            assert freq.isdigit()

            # Filter by word's attribute
            if check_words(words):
                continue

            # Update max_length
            for i, word in enumerate(words):
                max_length[i] = max(max_length[i], len(word))

            # Append new pair to freq_list
            freq_list.append([words, freq])

    # Write to output file only after the input was parsed successfully.
    # Note: Output file encoding set to UTF-8
    output_name = output_dir + file[len(ngram_prefix):]
    with open(output_name, mode="w", encoding="utf-8") as output_file:
        for words, freq in freq_list:
            # Pad each word column to its widest entry plus 5 spaces
            output_file.write(words[0].ljust(max_length[0] + 5))
            output_file.write(words[1].ljust(max_length[1] + 5))
            output_file.write(freq + "\n")

    # Track Progress: works for any number of files, including fewer than 100
    percent = (file_num + 1) * 100 // total_files
    if percent != last_percent:
        print("Progress: {}%".format(percent))
        last_percent = percent

# Print Sample Output: build the DataFrame once, from the last processed file,
# instead of rebuilding it for every file in the loop.
if filtered_list:
    dataframe = pd.DataFrame(
        [[words[0], words[1], freq] for words, freq in freq_list],
        columns=["word1", "word2", "freq"],
    )
    print(dataframe.head())