In [1]:
"""
Exit Code:
    0: Error opening file
"""

import os
from os import listdir
from os.path import isfile, join

from parse import *
from collections import Counter

import pandas as pd
import numpy as np

In [2]:
"""
Define formatted display function
Used to replace regular print function
"""
# Get name of an object
def namestr(obj, namespace):
    """Return every name in ``namespace`` that is bound to ``obj`` itself.

    Matching is done by identity (``is``), not equality, so two distinct
    objects that merely compare equal are not confused with each other.

    Args:
        obj: The object whose name(s) should be looked up.
        namespace: A mapping of name -> object, e.g. ``globals()``.

    Returns:
        list: The matching names, in the iteration order of ``namespace``.
    """
    matches = []
    for name in namespace:
        if namespace[name] is obj:
            matches.append(name)
    return matches

# Display with format
def display(items, func=None):
    """Print the global name(s) of ``items``, then each element indented.

    NOTE(review): inside a notebook this definition replaces IPython's own
    ``display`` helper for the rest of the session -- confirm that is intended.

    Args:
        items: Any iterable; it is looked up by identity in ``globals()`` to
            print a header line with its variable name(s).
        func: Optional callable applied to every element before printing.
            Elements are printed unchanged when it is falsy.
    """
    print(namestr(items, globals()))
    for entry in items:
        shown = func(entry) if func else entry
        print("     {0}".format(shown))

# Test display
# Smoke test: iterating a dict yields its keys, so only "A" and "B" are listed
test_dict = {"A": list(range(1, 4)), "B": list(range(4, 7))}
display(test_dict)

['test_dict']
     A
     B


In [3]:
# Based on each folder (ngram1, ngram2, ngram3),
# create one DataFrame per n-gram size with the columns:
#    art_id; word_ngram_1; freq_ngram_1;
#    art_id; word1_ngram_2; word2_ngram_2; freq_ngram_2;
#    art_id; word1_ngram_3; word2_ngram_3; word3_ngram_3; freq_ngram_3;


In [4]:
"""
Define a function that keeps only the n-gram directories (ngram1, ngram2, ngram3)
"""
def valid_direct(direct_name):
    """Tell whether a directory name denotes one of the n-gram folders.

    A single leading "../" is ignored. The remaining name is accepted only
    when it starts with "ngram"; hidden entries (names starting with ".")
    can never satisfy that and are therefore rejected as well.

    Args:
        direct_name (str): Directory name, optionally prefixed with "../".

    Returns:
        bool: True for an n-gram directory name, False otherwise.

    Raises:
        AssertionError: If ``direct_name`` is not a string.
    """
    assert isinstance(direct_name, str)

    parent_prefix = "../"
    if direct_name.startswith(parent_prefix):
        candidate = direct_name[len(parent_prefix):]
    else:
        candidate = direct_name

    # A name beginning with "." cannot begin with "ngram", so one test suffices
    return candidate.startswith("ngram")


In [5]:
"""
Get filtering results
"""
# Folder that contains the n-gram directories, relative to this notebook
datapath = "../"

# Keep only the n-gram directories; sorting fixes the order ngram1, ngram2, ...
directories = sorted(filter(valid_direct, os.listdir(datapath)))
print(directories)

['ngram1', 'ngram2', 'ngram3']


In [6]:
"""
Function filtering files by filenames.
-- Only accepts files whose names start with "journal-article"
"""
def filter_by_filename(files_list):
    """Keep only the file names that begin with "journal-article".

    Args:
        files_list: Iterable of file names (strings).

    Returns:
        list: The accepted names, in their original order.

    Raises:
        AssertionError: If an entry of ``files_list`` is not a string.
    """
    def _is_article(name):
        assert isinstance(name, str)
        # Only the leading 20 characters are inspected for the prefix
        return name[:20].startswith("journal-article")

    return [name for name in files_list if _is_article(name)]

In [7]:
"""
Define function getting article ID from a filename by parsing pattern
"""
def parse_id(filename, pattern):
    """Extract the article ID from a file name.

    Args:
        filename (str): File name to analyse.
        pattern (str): ``parse``-style format string that contains an
            ``{art_id}`` field.

    Returns:
        The value captured by the ``{art_id}`` field.

    Raises:
        ValueError: If ``filename`` does not match ``pattern``. ``parse``
            signals a non-match by returning None, which previously surfaced
            as an unhelpful "'NoneType' object is not subscriptable" error.
    """
    result = parse(pattern, filename)
    if result is None:
        raise ValueError(
            "Filename {0!r} does not match pattern {1!r}".format(filename, pattern)
        )
    return result["art_id"]

In [8]:
"""
Define function extract article IDs from filenames
-- parsing tricks learned from "https://pypi.org/project/parse/"
"""
def extract_id(files_list, direct):
    """Extract the article ID from every file name of one n-gram directory.

    Args:
        files_list: Iterable of file names shaped like
            "journal-article-10.2307_<id>-<direct>.txt".
        direct (str): Directory name ("ngram1", "ngram2" or "ngram3"); it is
            also the suffix embedded in each file name.

    Returns:
        list: One article ID per file name, in input order.
    """
    # Only the {art_id} field varies between the files of one directory
    pattern = "journal-article-10.2307_{art_id}-" + direct + ".txt"
    return [parse_id(name, pattern) for name in files_list]

In [9]:
"""
Define function extract all article IDs under a directory
"""
def get_id_lst(direct):
    """List the article IDs found in one n-gram directory.

    Args:
        direct (str): Directory name, resolved relative to ``datapath``.

    Returns:
        list: Article IDs parsed from the "journal-article..." files that sit
        directly inside the directory.
    """
    folder = join(datapath, direct)

    # Regular files only; sub-directories are ignored
    plain_files = []
    for entry in listdir(folder):
        if isfile(join(folder, entry)):
            plain_files.append(entry)

    # Drop unrelated files, then read the ID out of each remaining name
    return extract_id(filter_by_filename(plain_files), direct)

In [10]:
"""
Get ID lists for each ngram data set
"""
# One ID list per n-gram directory, in the same order as `directories`
art_id_lsts = list(map(get_id_lst, directories))
# Sort each list for display only; the stored lists keep their original order
display(art_id_lsts, sorted)

['art_id_lsts']
     ['145203', '145208', '145211', '145213', '145215', '145216', '145218', '145221', '145224', '145226', '145227', '145230', '145231', '145233', '145234', '145235', '145242', '145286', '145288', '145292']
     ['145203', '145208', '145211', '145213', '145215', '145216', '145218', '145221', '145224', '145226', '145227', '145230', '145231', '145233', '145234', '145235', '145242', '145286', '145288', '145292']
     ['145203', '145208', '145211', '145213', '145215', '145216', '145218', '145221', '145224', '145226', '145227', '145230', '145231', '145233', '145234', '145235', '145242', '145286', '145288', '145292']


In [11]:
"""
Check if all article IDs co-exist in all three directories
"""
# Define a function comparing list
def compare_lst(lst1, lst2):
    """Return True when both lists hold the same elements equally often.

    The order of the elements is irrelevant; only their multiplicities count.
    """
    tally_first = Counter(lst1)
    tally_second = Counter(lst2)
    return tally_first == tally_second

# Check if all ngram data sets share the same set of article IDs.
# bool(...) keeps the flag a real boolean: a bare `art_id_lsts and ...` would
# evaluate to [] (not False) when no n-gram directory was found.
# The first list is the reference, so it is only compared with the others.
check_co_exist = bool(art_id_lsts) and all(
    compare_lst(art_id_lsts[0], art_id_lst) for art_id_lst in art_id_lsts[1:]
)

# Check result
print(check_co_exist)

True


In [12]:
"""
Collect n-gram cleaning results from our individually analyzed results
Read data from "Result_journal-.....txt"
"""
# ngram = [ngram_1, ngram_2, ngram_3]
# ngram_1 = [words_ngram_1, freq_ngram_1]
# ...

'\nCollect n-gram cleaning results from our individually analyzed results\nRead data from "Result_journal-.....txt"\n'

In [13]:
"""
Filter out Result files
-- Only keeps files whose names start with "Result_journal-article"
"""
def filter_for_results(files_list):
    """Keep only the file names that begin with "Result_journal-article".

    Args:
        files_list: Iterable of file names (strings).

    Returns:
        list: The accepted names, in their original order.

    Raises:
        AssertionError: If an entry of ``files_list`` is not a string.
    """
    kept = []
    for name in files_list:
        assert isinstance(name, str)
        if not name.startswith("Result_journal-article"):
            continue
        kept.append(name)
    return kept

In [14]:
"""
Define a function to collect data from ngram results
-- Output [words_ngram_X, freq_ngram_X]
"""
def collect_data(direct):
    """Collect the n-gram frequency lists stored in one result directory.

    Every regular file directly under ``join(datapath, direct)`` whose name
    starts with "Result_journal-article" is read. Each line is expected to
    look like "word1 [word2 ...] freq": one or more words followed by a
    count made of digits only.

    Args:
        direct (str): Directory name ("ngram1", "ngram2" or "ngram3"); it is
            also the suffix embedded in each result file name.

    Returns:
        dict: ``{art_id: freq_list}`` where ``freq_list`` is a list of
        ``[words, freq]`` pairs, ``words`` is a list of strings and ``freq``
        is the count kept as a string.

    Raises:
        AssertionError: If a line has fewer than two fields or its last field
            is not made of digits.

    Side effects:
        If a file cannot be opened, an error message is printed and
        ``exit(0)`` is called.
    """
    directpath = join(datapath, direct)

    # Get the regular files under the directory
    files_list = [file for file in listdir(directpath) if isfile(join(directpath, file))]

    # Keep only the result files
    filtered_list = filter_for_results(files_list)

    # The file name pattern is the same for every file of this directory
    pattern = "Result_journal-article-10.2307_{art_id}-" + direct + ".txt"

    # freq_lists -> {ID1: freq_list1, ID2: freq_list2}
    freq_lists = {}

    for filename in filtered_list:
        try:
            file_open = open(join(directpath, filename), mode="r")
        except Exception as e:
            print("Error opening file {0}".format(filename))
            print("Error message: <{0}>".format(e))
            # NOTE(review): 0 conventionally signals success; it is kept
            # because the notebook header documents 0 as "Error opening file".
            exit(0)

        # freq_list -> [[words0, freq0], [words1, freq1]]
        freq_list = []

        # The with-block closes the file even when an assertion below fails
        with file_open:
            for line in file_open:
                # line -> "word1 word2 word3 5"
                # pair -> ["word1", "word2", "word3", "5"]
                pair = line.strip().split()
                assert len(pair) >= 2

                # Everything but the last field is the n-gram itself
                words, freq = pair[:-1], pair[-1]
                assert freq.isdigit()

                freq_list.append([words, freq])

        # Register the list under the article ID taken from the file name
        art_id = parse_id(filename, pattern)
        freq_lists[art_id] = freq_list

    return freq_lists

In [15]:
"""
Collect data from ngrams
"""
# The order matters: ngrams[i] must hold the data of directories[i], and later
# code picks the entry by n-gram size, so the directories have to be sorted.
assert sorted(directories) == directories

# ngrams -> [data_ngram1, data_ngram2, data_ngram3]
ngrams = []
for direct in directories:
    data = collect_data(direct)

    # An empty result means no "Result_journal-article..." file was read
    assert len(data) > 0

    ngrams.append(data)

"""
Test freq_lists
"""
# index = 1
# for key in ngrams[index]:
#     print(key)
#     display(ngrams[index][key])

'\nTest freq_lists\n'

In [16]:
"""
Create DataFrames:
    art_id; word_ngram_1; freq_ngram_1; 
    art_id; word1_ngram_2; word2_ngram_2; freq_ngram_2; 
    art_id; word1_ngram_3; word2_ngram_3; word3_ngram_3; freq_ngram_3; 
"""        

'\nCreate DataFrames:\n    art_id; word_ngram_1; freq_ngram_1; \n    art_id; word1_ngram_2; word2_ngram_2; freq_ngram_2; \n    art_id; word1_ngram_3; word2_ngram_3; word3_ngram_3; freq_ngram_3; \n'

In [17]:
"""
Define function to create DataFrame for N-Gram-X
"""
def create_DataFrame(columns):
    """Build, preview and return the DataFrame for one n-gram size.

    The n-gram size is derived from the number of columns: besides the word
    columns there are always "art_id" and the frequency column, so
    ``len(columns) - 3`` is the zero-based index into the global ``ngrams``
    list (0 -> unigrams, 1 -> bigrams, 2 -> trigrams).

    Reads the globals ``check_co_exist``, ``art_id_lsts`` and ``ngrams``.

    Args:
        columns (list): Column titles, e.g.
            ``["art_id", "word1_ngram_2", "word2_ngram_2", "freq_ngram_2"]``.

    Returns:
        pandas.DataFrame: One row per (article, n-gram) pair laid out as
        ``[art_id, word..., freq]``. Frequencies are kept as the strings
        stored in ``ngrams``. Previously nothing was returned, so callers
        assigning the result always received None.

    Raises:
        AssertionError: If the article IDs are not shared by all n-gram sets,
            an article is missing from the selected n-gram data, or a stored
            pair is malformed.

    Side effects:
        Prints the first rows of the DataFrame.
    """
    # Index of the n-gram data set matching this column layout
    ngram_num = len(columns) - 3

    # Taking the IDs from the first list is only valid when all n-gram data
    # sets share the same articles
    assert check_co_exist
    art_id_lst = art_id_lsts[0]

    # ngram -> {ID1: freq_list1, ID2: freq_list2}
    ngram = ngrams[ngram_num]

    data = []
    for art_id in art_id_lst:
        assert art_id in ngram

        # freq_list -> [[words0, freq0], [words1, freq1]]
        freq_list = ngram[art_id]
        assert len(freq_list) > 0
        assert len(freq_list[0]) >= 2

        for pair in freq_list:
            # pair -> [[words], freq]
            assert len(pair) == 2
            words, freq = pair
            assert freq.isdigit()

            # Row layout follows the column titles: art_id, word(s), freq
            data.append([art_id, *words, freq])

    # The default index already numbers the rows 0..len(data)-1
    dataframe = pd.DataFrame(data, columns=columns)

    # Quick visual check of the result
    print(dataframe.head())

    return dataframe

In [18]:
"""
All Columns features for N-Gram [1-3]
"""
# Every layout is "art_id", then the word column(s), then the frequency column;
# position i holds the layout for (i + 1)-grams.
ngram_columns = [
    ["art_id", *word_columns, freq_column]
    for word_columns, freq_column in (
        (["word_ngram_1"], "freq_ngram_1"),
        (["word1_ngram_2", "word2_ngram_2"], "freq_ngram_2"),
        (["word1_ngram_3", "word2_ngram_3", "word3_ngram_3"], "freq_ngram_3"),
    )
]

In [19]:
"""
DataFrame Creation for N-Gram [1-3]
"""
# Build the unigram, bigram and trigram frames in that order
dataframe1, dataframe2, dataframe3 = (
    create_DataFrame(ngram_columns[position]) for position in range(3)
)

   art_id word_ngram_1 freq_ngram_1
0  145292    licensing           58
1  145292      written           40
2  145292  examination           32
3  145292         test           32
4  145292      workers           27
   art_id word1_ngram_2 word2_ngram_2 freq_ngram_2
0  145292       written          test           17
1  145292  occupational     licensing           12
2  145292             i             i           11
3  145292       written   examination           11
4  145292     licensing  examinations           10
   art_id word1_ngram_3 word2_ngram_3 word3_ngram_3 freq_ngram_3
0  145292             i             i             i            9
1  145292       journal         human     resources            6
2  145292       written     licensing  examinations            4
3  145292       attempt        obtain       license            2
4  145292    attributes     important       passing            2
