In [1]:
"""
Exit Code:
    0: Error opening file
"""

import os
from os import listdir
from os.path import isfile, join

from parse import *
from collections import Counter

import pandas as pd
import numpy as np

In [2]:
"""
Define formatted display function
Used to replace regular print function
"""
# Get name of an object
def namestr(obj, namespace):
    """Return every name in ``namespace`` that is bound to ``obj`` itself.

    Matching is by identity (``is``), not equality, so an equal but distinct
    object does not count.

    Args:
        obj: The object to look up.
        namespace: Mapping of names to objects, e.g. ``globals()``.

    Returns:
        list: The matching names in the mapping's iteration order; empty when
        nothing is bound to ``obj``.
    """
    matches = []
    for candidate in namespace:
        if namespace[candidate] is obj:
            matches.append(candidate)
    return matches

# Display with format
def display(items, func=None):
    """Print the global name(s) of ``items`` followed by one indented line per element.

    The first printed line is the list of module-level names bound to
    ``items`` (an empty list when it is not bound to any global name).

    NOTE(review): inside a notebook this definition shadows IPython's own
    ``display`` helper.

    Args:
        items: Any iterable; iterating a dict yields its keys.
        func: Optional callable applied to each element before printing.
    """
    print(namestr(items, globals()))
    transform = func if func else (lambda entry: entry)
    for entry in items:
        print("     {0}".format(transform(entry)))

# Smoke test for display(): iterating a dict yields its keys, so the label line
# is followed by one indented line per key.
test_dict = {"A": [1, 2, 3], "B": [4, 5, 6]}
display(test_dict)

['test_dict']
     A
     B


In [3]:
# Based on each folder (ngram-1, ngram-2, ngram-3).
# Create DataFrame: 
#    art_id; word_ngram_1; freq_ngram_1; word1_ngram_2; word2_ngram_2; freq_ngram_2; 
#    word1_ngram_3; word2_ngram_3; word3_ngram_3; freq_ngram_3; 


In [4]:
"""
Define a function that selects the n-gram directories (ngram1, ngram2, ngram3)
"""
def valid_direct(direct_name):
    """Tell whether a directory name denotes one of the n-gram data folders.

    A single leading "../" is ignored; after that the name qualifies only if
    it begins with "ngram".

    Args:
        direct_name (str): Directory name, optionally prefixed with "../".

    Returns:
        bool: True for names such as "ngram1", False otherwise (including
        hidden, dot-prefixed names).

    Raises:
        AssertionError: If ``direct_name`` is not a string.
    """
    assert isinstance(direct_name, str)

    parent_prefix = "../"
    if direct_name.startswith(parent_prefix):
        stripped = direct_name[len(parent_prefix):]
    else:
        stripped = direct_name

    # A name starting with "." can never start with "ngram", so this single
    # prefix test also excludes hidden entries.
    return stripped.startswith("ngram")


In [5]:
"""
Get filtering results
"""
datapath = "../"
# Keep only the entries under the data path that valid_direct accepts, sorted by name.
directories = sorted(filter(valid_direct, os.listdir(datapath)))
print(directories)

['ngram1', 'ngram2', 'ngram3']


In [6]:
"""
Function filtering files by filenames.
-- Only accepts files whose names start with "journal-article"
"""
def filter_by_filename(files_list):
    """Keep only the file names that begin with "journal-article".

    Args:
        files_list: Iterable of file names (strings).

    Returns:
        list: The accepted names, in their original order.

    Raises:
        AssertionError: If an entry is not a string.
    """
    def _checked(names):
        # Validate each entry lazily so a bad entry fails when it is reached.
        for name in names:
            assert isinstance(name, str)
            yield name

    return [name for name in _checked(files_list) if name.startswith("journal-article")]

In [21]:
"""
Define a function that gets the article ID from a filename by matching it against a parse pattern
"""
def parse_id(filename, pattern):
    """Extract the article ID from a filename using a ``parse`` pattern.

    Args:
        filename (str): File name to match.
        pattern (str): ``parse``-style format string containing an
            ``{art_id}`` field.

    Returns:
        The value captured by the ``{art_id}`` field.

    Raises:
        ValueError: If ``filename`` does not match ``pattern``.
    """
    result = parse(pattern, filename)
    # parse() returns None on a mismatch; subscripting that would fail with an
    # opaque "'NoneType' object is not subscriptable", so report the offending
    # file name and pattern instead.
    if result is None:
        raise ValueError(
            "Filename {0!r} does not match pattern {1!r}".format(filename, pattern)
        )
    return result["art_id"]

In [22]:
"""
Define a function that extracts article IDs from filenames
-- parsing tricks learned from "https://pypi.org/project/parse/"
"""
def extract_id(files_list, direct):
    """Pull the article ID out of every file name in ``files_list``.

    Args:
        files_list: Iterable of file names expected to look like
            "journal-article-10.2307_<art_id>-<direct>.txt".
        direct (str): Directory name embedded in the file names,
            e.g. "ngram1", "ngram2" or "ngram3".

    Returns:
        list: One article ID per file name, in input order.
    """
    # The pattern is the same for every file of a given directory.
    pattern = "journal-article-10.2307_{art_id}-" + direct + ".txt"
    return [parse_id(filename, pattern) for filename in files_list]

In [23]:
"""
Define a function that extracts all article IDs under a directory
"""
def get_id_lst(direct):
    """Collect the article IDs of the article files in one data directory.

    Args:
        direct (str): Directory name under ``datapath``.

    Returns:
        list: Article IDs parsed from the names of the regular files in that
        directory that pass ``filter_by_filename``.
    """
    directpath = join(datapath, direct)

    # Regular files only; sub-directories are ignored.
    plain_files = []
    for entry in listdir(directpath):
        if isfile(join(directpath, entry)):
            plain_files.append(entry)

    return extract_id(filter_by_filename(plain_files), direct)

In [24]:
"""
Get ID lists for each ngram data set
"""
# One ID list per n-gram directory, in the same order as "directories".
art_id_lsts = list(map(get_id_lst, directories))
# Show each list sorted so the three directories can be compared by eye.
display(art_id_lsts, func=sorted)

['art_id_lsts']
     ['145203', '145208', '145211', '145213', '145215', '145216', '145218', '145221', '145224', '145226', '145227', '145230', '145231', '145233', '145234', '145235', '145242', '145286', '145288', '145292']
     ['145203', '145208', '145211', '145213', '145215', '145216', '145218', '145221', '145224', '145226', '145227', '145230', '145231', '145233', '145234', '145235', '145242', '145286', '145288', '145292']
     ['145203', '145208', '145211', '145213', '145215', '145216', '145218', '145221', '145224', '145226', '145227', '145230', '145231', '145233', '145234', '145235', '145242', '145286', '145288', '145292']


In [25]:
"""
Check that the same article IDs appear in all three directories
"""
# Define a function comparing list
def compare_lst(lst1, lst2):
    """Tell whether two lists hold the same elements with the same multiplicities.

    Order is ignored. Elements must be hashable.

    Args:
        lst1: First iterable of hashable items.
        lst2: Second iterable of hashable items.

    Returns:
        bool: True when both inputs are equal as multisets.
    """
    remaining = Counter(lst1)
    remaining.subtract(lst2)
    # Every count cancels to zero exactly when the two multisets are equal.
    return not any(remaining.values())

# Check that all n-gram data sets share the same set of article IDs.
# bool() keeps the result a real boolean: a bare `art_id_lsts and ...` would
# evaluate to the empty list itself when no ID lists were collected.
check_co_exist = bool(art_id_lsts) and all(
    compare_lst(art_id_lsts[0], art_id_lst) for art_id_lst in art_id_lsts
)

# Check result
print(check_co_exist)

True


In [26]:
"""
Collect ngram cleaning results by our individually analyzed results
Read data from "Result_journal-.....txt"
"""
# ngram = [ngram_1, ngram_2, ngram_3]
# ngram_1 = [words_ngram_1, freq_ngram_1]
# ...

'\nCollect ngram cleaning results by our individually analyzed results\nRead data from "Result_journal-.....txt"\n'

In [27]:
"""
Filter out Result files
-- Only keep those whose names start with "Result_journal-article"
"""
def filter_for_results(files_list):
    """Keep only the file names that begin with "Result_journal-article".

    Args:
        files_list: Iterable of file names (strings).

    Returns:
        list: The accepted names, in their original order.

    Raises:
        AssertionError: If an entry is not a string.
    """
    prefix = "Result_journal-article"
    matches = []
    for name in files_list:
        assert isinstance(name, str)
        # Compare the leading characters against the whole prefix.
        if name[:len(prefix)] != prefix:
            continue
        matches.append(name)
    return matches

In [52]:
"""
Define a function to collect data from ngram results
-- Output [words_ngram_X, freq_ngram_X]
"""
def collect_data(direct):
    """Read every n-gram result file in one data directory.

    Looks under ``join(datapath, direct)`` for regular files whose names pass
    ``filter_for_results`` and parses each line as "<word> ... <word> <freq>".

    Args:
        direct (str): Directory name under ``datapath``, e.g. "ngram2"; it is
            also embedded in the expected file-name pattern.

    Returns:
        dict: ``{art_id: [[words, freq], ...]}`` where ``words`` is a list of
        strings and ``freq`` is kept as the digit string read from the file.

    Raises:
        AssertionError: If a non-blank line has fewer than two fields or its
            last field is not made of digits.

    Note:
        If a file cannot be opened, the error is printed and ``exit(0)`` is
        called.
    """
    directpath = join(datapath, direct)

    # Regular files under the directory, narrowed down to the result files
    files_list = [file for file in listdir(directpath) if isfile(join(directpath, file))]
    filtered_list = filter_for_results(files_list)

    # The file-name pattern depends only on the directory, so build it once
    pattern = "Result_journal-article-10.2307_{art_id}-" + direct + ".txt"

    # freq_lists -> {ID1: freq_list1, ID2: freq_list2}
    freq_lists = {}

    for filename in filtered_list:
        try:
            file_open = open(join(directpath, filename), mode="r")
        except Exception as e:
            print("Error opening file {0}".format(filename))
            print("Error message: <{0}>".format(e))
            # NOTE(review): exit status 0 conventionally signals success; kept
            # because the notebook header documents 0 as "Error opening file".
            exit(0)

        # freq_list -> [[words0, freq0], [words1, freq1]]
        freq_list = []

        # The context manager closes the file even if an assertion below fails
        with file_open:
            for line in file_open:
                # line -> "word1 word2 word3 5"
                # pair -> ["word1", "word2", "word3", "5"]
                pair = line.split()

                # Skip blank lines (e.g. a trailing empty line) instead of
                # failing the length check on them
                if not pair:
                    continue
                assert len(pair) >= 2

                # Everything but the last field is the n-gram; the last is its count
                words, freq = pair[:-1], pair[-1]
                assert freq.isdigit()

                freq_list.append([words, freq])

        # Key the collected pairs by the article ID encoded in the file name
        freq_lists[parse_id(filename, pattern)] = freq_list

    return freq_lists

In [66]:
"""
Collect data from ngrams
"""
# Order matters: ngrams[i] must hold the data for directories[i], because
# create_DataFrame later picks an entry of "ngrams" by position.
assert directories == sorted(directories)

ngrams = []
for direct in directories:
    # Collect data for this directory -> {art_id: [[words, freq], ...]}
    data = collect_data(direct)
    # An empty dict means no result file was collected for this directory
    assert len(data) > 0

    ngrams.append(data)

"""
Test freq_lists
"""
# Spot-check the second collected data set (index 1).
# NOTE: this prints every entry of every article, so the output is very long.
index = 1
for key in ngrams[index]:
    print(key)
    display(ngrams[index][key])

145208
[]
     [['quit', 'rates'], '39']
     [['firm', 'size'], '30']
     [['quit', 'rate'], '27']
     [['turnover', 'costs'], '25']
     [['private', 'sector'], '24']
     [['government', 'private'], '10']
     [['relationship', 'between'], '10']
     [['large', 'firms'], '9']
     [['human', 'resources'], '8']
     [['journal', 'human'], '8']
     [['size', 'government'], '8']
     [['size', 'earnings'], '7']
     [['between', 'government'], '6']
     [['cross', 'state'], '6']
     [['difference', 'between'], '6']
     [['differences', 'between'], '6']
     [['federal', 'pay'], '6']
     [['more', 'than'], '6']
     [['pay', 'comparability'], '6']
     [['blue', 'collar'], '5']
     [['comparable', 'pay'], '5']
     [['government', 'pay'], '5']
     [['public', 'sector'], '5']
     [['than', 'private'], '5']
     [['turnover', 'statistics'], '5']
     [['white', 'collar'], '5']
     [['across', 'industries'], '4']
     [['alexandria', 'va'], '4']
     [['any', 'wage'], '4']
     [

     [['results', 'can'], '1']
     [['results', 'contrary'], '1']
     [['results', 'following'], '1']
     [['results', 'from'], '1']
     [['results', 'matched'], '1']
     [['results', 'quit'], '1']
     [['results', 'table'], '1']
     [['retary', 'defense'], '1']
     [['retention', 'rate'], '1']
     [['retention', 'rates'], '1']
     [['retention', 'see'], '1']
     [['review', 'studies'], '1']
     [['richard', 'lester'], '1']
     [['right', 'equation'], '1']
     [['role', 'choice'], '1']
     [['role', 'constituents'], '1']
     [['routinely', 'collected'], '1']
     [['rule', 'government'], '1']
     [['rule', 'private'], '1']
     [['s', 'conclusions'], '1']
     [['s', 'curve'], '1']
     [['s', 'estimate'], '1']
     [['s', 'findings'], '1']
     [['s', 'jacobson'], '1']
     [['s', 'pay'], '1']
     [['s', 'perspective'], '1']
     [['s', 'stated'], '1']
     [['salaries', 'industrial'], '1']
     [['salaries', 'white'], '1']
     [['same', 'job'], '1']
     [['same', 

     [['older', 'persons'], '1']
     [['one', 'following'], '1']
     [['one', 'half'], '1']
     [['one', 'measures'], '1']
     [['one', 'purpose'], '1']
     [['one', 'quarter'], '1']
     [['only', 'beneficiaries'], '1']
     [['only', 'do'], '1']
     [['only', 'slightly'], '1']
     [['only', 'summer'], '1']
     [['order', 'magnitude'], '1']
     [['other', 'studies'], '1']
     [['otherwise', 'would'], '1']
     [['our', 'data'], '1']
     [['our', 'findings'], '1']
     [['our', 'method'], '1']
     [['our', 'study'], '1']
     [['over', 'life'], '1']
     [['over', 'stating'], '1']
     [['overeducated', 'american'], '1']
     [['paid', 'during'], '1']
     [['paid', 'excluding'], '1']
     [['paid', 'than'], '1']
     [['parentheses', 'high'], '1']
     [['parentheses', 'men'], '1']
     [['part', 'from'], '1']
     [['part', 'time'], '1']
     [['past', 'taxes'], '1']
     [['pattern', 'women'], '1']
     [['pay', 'social'], '1']
     [['payroll', 'tax'], '1']
     [['payr

     [['elasticity', 'minarik'], '1']
     [['elasticity', 'respect'], '1']
     [['elements', 'which'], '1']
     [['eligibility', 'afdc'], '1']
     [['eligibility', 'current'], '1']
     [['eligibility', 'equiva'], '1']
     [['eligibility', 'extent'], '1']
     [['eligibility', 'families'], '1']
     [['eligibility', 'programs'], '1']
     [['eligibility', 'subsidy'], '1']
     [['eligibility', 'targeted'], '1']
     [['eligibility', 'unemployment'], '1']
     [['eligibility', 'were'], '1']
     [['eligible', 'female'], '1']
     [['eligible', 'food'], '1']
     [['eligible', 'he'], '1']
     [['eligible', 'more'], '1']
     [['eligible', 'negative'], '1']
     [['eligible', 'payments'], '1']
     [['eligible', 'three'], '1']
     [['eligible', 'welfare'], '1']
     [['eligible', 'were'], '1']
     [['eliminate', 'discrepancy'], '1']
     [['elise', 'k'], '1']
     [['elizabeth', 'm'], '1']
     [['else', 'filer'], '1']
     [['empirical', 'analysis'], '1']
     [['empirical', 'evi

     [['possibility', 'receipt'], '1']
     [['possible', 'distinguish'], '1']
     [['possible', 'dropping'], '1']
     [['possible', 'exception'], '1']
     [['possible', 'full'], '1']
     [['possible', 'initial'], '1']
     [['possible', 'make'], '1']
     [['possible', 'some'], '1']
     [['possibly', 'even'], '1']
     [['potential', 'biases'], '1']
     [['potential', 'explanation'], '1']
     [['potential', 'modification'], '1']
     [['potential', 'source'], '1']
     [['poverty', 'areas'], '1']
     [['poverty', 'been'], '1']
     [['poverty', 'discussion'], '1']
     [['poverty', 'exists'], '1']
     [['poverty', 'inequality'], '1']
     [['poverty', 'line'], '1']
     [['poverty', 'lower'], '1']
     [['poverty', 'one'], '1']
     [['poverty', 'population'], '1']
     [['poverty', 'support'], '1']
     [['poverty', 'threshold'], '1']
     [['poverty', 'variable'], '1']
     [['practices', 'may'], '1']
     [['practices', 'ross'], '1']
     [['practices', 'using'], '1']
    

     [['h', 'summers'], '1']
     [['h', 'toikka'], '1']
     [['hall', 'arden'], '1']
     [['hamermesh', 'daniel'], '1']
     [['hanushek', 'economics'], '1']
     [['harold', 's'], '1']
     [['harvey', 's'], '1']
     [['health', 'care'], '1']
     [['health', 'perspectives'], '1']
     [['health', 'services'], '1']
     [['henry', 'm'], '1']
     [['higher', 'education'], '1']
     [['hill', 'martha'], '1']
     [['hoenack', 'stephen'], '1']
     [['hoffman', 'saul'], '1']
     [['hogan', 'government'], '1']
     [['holden', 'karen'], '1']
     [['holt', 'charles'], '1']
     [['honig', 'marjorie'], '1']
     [['hosek', 'james'], '1']
     [['hu', 'teh'], '1']
     [['hutchens', 'job'], '1']
     [['hyclak', 'james'], '1']
     [['hyclak', 'thomas'], '1']
     [['hypothesis', 'u'], '1']
     [['i', 'journal'], '1']
     [['i', 'levin'], '1']
     [['ill', 'burton'], '1']
     [['impact', 'differences'], '1']
     [['index', 'volume'], '1']
     [['induced', 'demand'], '1']
     [[

     [['system', 'year'], '1']
     [['systematic', 'change'], '1']
     [['systems', 'higher'], '1']
     [['t', 'ratios'], '1']
     [['table', 'shows'], '1']
     [['take', 'account'], '1']
     [['take', 'advantage'], '1']
     [['taken', 'account'], '1']
     [['taken', 'together'], '1']
     [['taking', 'place'], '1']
     [['tant', 'determining'], '1']
     [['tant', 'explaining'], '1']
     [['taught', 'different'], '1']
     [['taught', 'each'], '1']
     [['taught', 'high'], '1']
     [['taught', 'low'], '1']
     [['taught', 'probability'], '1']
     [['taught', 'rela'], '1']
     [['taught', 'school'], '1']
     [['taught', 'schools'], '1']
     [['taught', 'system'], '1']
     [['taught', 'year'], '1']
     [['teach', 'innovative'], '1']
     [['teacher', 'american'], '1']
     [['teacher', 'coefficient'], '1']
     [['teacher', 'does'], '1']
     [['teacher', 'equations'], '1']
     [['teacher', 'examining'], '1']
     [['teacher', 'experience'], '1']
     [['teacher', 'h

     [['all', 'other'], '1']
     [['all', 'quitters'], '1']
     [['all', 'variables'], '1']
     [['allocation', 'manpower'], '1']
     [['also', 'found'], '1']
     [['also', 'hours'], '1']
     [['also', 'induces'], '1']
     [['also', 'influence'], '1']
     [['also', 'influenced'], '1']
     [['also', 'may'], '1']
     [['also', 'model'], '1']
     [['also', 'play'], '1']
     [['alternative', 'interpretation'], '1']
     [['although', 'other'], '1']
     [['although', 'search'], '1']
     [['although', 'unreported'], '1']
     [['american', 'eco'], '1']
     [['american', 'statistical'], '1']
     [['amortize', 'fixed'], '1']
     [['analogous', 'labor'], '1']
     [['analyses', 'explicitly'], '1']
     [['analysis', 'appears'], '1']
     [['analysis', 'focuses'], '1']
     [['analysis', 'found'], '1']
     [['analysis', 'imputed'], '1']
     [['analysis', 'paper'], '1']
     [['analysis', 'positive'], '1']
     [['analysis', 'serves'], '1']
     [['analysis', 'were'], '1']
    

     [['s', 'republic'], '1']
     [['s', 'workers'], '1']
     [['sage', 'publications'], '1']
     [['savery', 'phillip'], '1']
     [['schenkman', 'publishing'], '1']
     [['schirmer', 'limits'], '1']
     [['school', 'finance'], '1']
     [['sector', 'reading'], '1']
     [['security', 'disability'], '1']
     [['senior', 'settlers'], '1']
     [['services', 'change'], '1']
     [['services', 'poor'], '1']
     [['settlers', 'social'], '1']
     [['siddiqur', 'rahman'], '1']
     [['skrovan', 'ed'], '1']
     [['social', 'health'], '1']
     [['social', 'integration'], '1']
     [['social', 'security'], '1']
     [['social', 'work'], '1']
     [['some', 'consequences'], '1']
     [['south', 'western'], '1']
     [['sp0rgeskemaer', 'problemer'], '1']
     [['sp0rgsmalformulering', 'i'], '1']
     [['standard', 'help'], '1']
     [['stanley', 'w'], '1']
     [['state', 'committee'], '1']
     [['states', 'commission'], '1']
     [['states', 'washington'], '1']
     [['statistical', 

     [['might', 'well'], '2']
     [['missed', 'from'], '2']
     [['monetary', 'terms'], '2']
     [['months', 'after'], '2']
     [['months', 'e'], '2']
     [['more', 'costly'], '2']
     [['more', 'satisfied'], '2']
     [['more', 'social'], '2']
     [['national', 'institute'], '2']
     [['natural', 'causes'], '2']
     [['neighbors', 'coworkers'], '2']
     [['new', 'treatment'], '2']
     [['non', 'ill'], '2']
     [['non', 'mmhi'], '2']
     [['number', 'arrests'], '2']
     [['number', 'contacts'], '2']
     [['obtain', 'relevant'], '2']
     [['obtain', 'useful'], '2']
     [['often', 'far'], '2']
     [['one', 'potential'], '2']
     [['one', 'treatment'], '2']
     [['only', 'when'], '2']
     [['other', 'aspects'], '2']
     [['other', 'community'], '2']
     [['other', 'persons'], '2']
     [['other', 'psychotics'], '2']
     [['other', 'than'], '2']
     [['otherwise', 'have'], '2']
     [['outpatient', 'experimental'], '2']
     [['over', 'time'], '2']
     [['overnigh

     [['leads', 'rehospitali'], '1']
     [['least', 'disruptive'], '1']
     [['least', 'part'], '1']
     [['least', 'partially'], '1']
     [['least', 'temporarily'], '1']
     [['leave', 'e'], '1']
     [['leave', 'much'], '1']
     [['led', 'adjustment'], '1']
     [['led', 'both'], '1']
     [['led', 'higher'], '1']
     [['led', 'however'], '1']
     [['led', 'many'], '1']
     [['lee', 'olivia'], '1']
     [['lee', 'value'], '1']
     [['left', 'each'], '1']
     [['leisure', 'activities'], '1']
     [['leisure', 'being'], '1']
     [['leisure', 'even'], '1']
     [['leisure', 'first'], '1']
     [['leisure', 'table'], '1']
     [['leisure', 'time'], '1']
     [['length', 'stay'], '1']
     [['leonard', 'stein'], '1']
     [['less', 'choice'], '1']
     [['less', 'costly'], '1']
     [['less', 'effective'], '1']
     [['let', 'alone'], '1']
     [['level', 'better'], '1']
     [['level', 'between'], '1']
     [['level', 'cost'], '1']
     [['level', 'costs'], '1']
     [['level

     [['work', 'because'], '1']
     [['work', 'being'], '1']
     [['work', 'data'], '1']
     [['work', 'decreased'], '1']
     [['work', 'difficult'], '1']
     [['work', 'etc'], '1']
     [['work', 'experience'], '1']
     [['work', 'he'], '1']
     [['work', 'mentally'], '1']
     [['work', 'nonwork'], '1']
     [['work', 'one'], '1']
     [['work', 'other'], '1']
     [['work', 'participation'], '1']
     [['work', 'patients'], '1']
     [['work', 'presented'], '1']
     [['work', 'receives'], '1']
     [['work', 'school'], '1']
     [['work', 'setting'], '1']
     [['work', 'shop'], '1']
     [['work', 'shorter'], '1']
     [['work', 'time'], '1']
     [['work', 'university'], '1']
     [['work', 'would'], '1']
     [['worked', 'directly'], '1']
     [['worked', 'more'], '1']
     [['worked', 'pilot'], '1']
     [['worked', 'wages'], '1']
     [['workers', 'efficiency'], '1']
     [['workers', 'patients'], '1']
     [['workers', 'were'], '1']
     [['working', 'became'], '1']
  

     [['term', 'mental'], '1']
     [['terms', 'financing'], '1']
     [['than', 'does'], '1']
     [['than', 'inde'], '1']
     [['than', 'indirect'], '1']
     [['than', 'physical'], '1']
     [['them', 'suggest'], '1']
     [['thera', 'peutic'], '1']
     [['therapeutic', 'approach'], '1']
     [['therapeutic', 'value'], '1']
     [['therapies', 'focal'], '1']
     [['therapies', 'poses'], '1']
     [['therapies', 'problems'], '1']
     [['therapies', 'what'], '1']
     [['therapies', 'where'], '1']
     [['therapies', 'wishing'], '1']
     [['therapists', 'klevorick'], '1']
     [['therapy', 'ambiguity'], '1']
     [['therapy', 'progress'], '1']
     [['therapy', 'surprisingly'], '1']
     [['therapy', 'treatment'], '1']
     [['therefore', 'coverage'], '1']
     [['those', 'traditional'], '1']
     [['those', 'who'], '1']
     [['thousands', 'persons'], '1']
     [['thread', 'running'], '1']
     [['through', 'all'], '1']
     [['through', 'licensing'], '1']
     [['thus', 'all'],

     [['pick', 'up'], '1']
     [['pick', 'what'], '1']
     [['picked', 'match'], '1']
     [['picked', 'yield'], '1']
     [['place', 'ceiling'], '1']
     [['planning', 'evaluation'], '1']
     [['plausible', 'range'], '1']
     [['plausible', 'scenario'], '1']
     [['plausible', 'scenarios'], '1']
     [['plausible', 'since'], '1']
     [['plotnick', 'demographic'], '1']
     [['plotnick', 'felicity'], '1']
     [['plotnick', 'finds'], '1']
     [['plotnick', 'real'], '1']
     [['plotnick', 'skidmore'], '1']
     [['plotnick', 'tim'], '1']
     [['plus', 'census'], '1']
     [['plus', 'kind'], '1']
     [['pockets', 'future'], '1']
     [['point', 'specifying'], '1']
     [['point', 'yields'], '1']
     [['points', 'fall'], '1']
     [['points', 'over'], '1']
     [['policies', 'new'], '1']
     [['policies', 'would'], '1']
     [['policy', 'changes'], '1']
     [['policy', 'conclusions'], '1']
     [['policy', 'decisions'], '1']
     [['policy', 'incidence'], '1']
     [['policy

     [['furthermore', 'we'], '1']
     [['future', 'earnings'], '1']
     [['future', 'employability'], '1']
     [['future', 'employment'], '1']
     [['future', 'racial'], '1']
     [['future', 'returns'], '1']
     [['future', 'training'], '1']
     [['future', 'wage'], '1']
     [['gain', 'compared'], '1']
     [['gap', 'literature'], '1']
     [['gaps', 'available'], '1']
     [['general', 'experience'], '1']
     [['generally', 'consistent'], '1']
     [['generally', 'positive'], '1']
     [['generally', 'termed'], '1']
     [['generate', 'information'], '1']
     [['generating', 'inequality'], '1']
     [['generating', 'information'], '1']
     [['genuinely', 'expected'], '1']
     [['geographic', 'differences'], '1']
     [['geographic', 'region'], '1']
     [['get', 'off'], '1']
     [['gitudinal', 'studies'], '1']
     [['give', 'indication'], '1']
     [['given', 'brief'], '1']
     [['given', 'high'], '1']
     [['given', 'nature'], '1']
     [['given', 'negative'], '1']
  

     [['published', 'four'], '1']
     [['r', 'smith'], '1']
     [['range', 'poverty'], '1']
     [['rashi', 'fein'], '1']
     [['regents', 'university'], '1']
     [['relate', 'labor'], '1']
     [['relations', 'centers'], '1']
     [['relations', 'cornell'], '1']
     [['resale', 'pre'], '1']
     [['research', 'area'], '1']
     [['research', 'entire'], '1']
     [['research', 'graduate'], '1']
     [['research', 'irwin'], '1']
     [['research', 'teaching'], '1']
     [['research', 'unit'], '1']
     [['resources', 'provides'], '1']
     [['resources', 'pu'], '1']
     [['reynolds', 'farley'], '1']
     [['richard', 'j'], '1']
     [['richard', 'perlman'], '1']
     [['rivlin', 'congressional'], '1']
     [['robert', 'h'], '1']
     [['robert', 'j'], '1']
     [['robinson', 'hollister'], '1']
     [['robinson', 'statistical'], '1']
     [['role', 'education'], '1']
     [['rosen', 'u'], '1']
     [['run', 'structural'], '1']
     [['s', 'copyright'], '1']
     [['s', 'depart'], '

     [['indicate', 'existence'], '1']
     [['indigent', 'while'], '1']
     [['indivi', 'dual'], '1']
     [['individual', 'changes'], '1']
     [['individual', 'data'], '1']
     [['individual', 'distribu'], '1']
     [['individual', 'from'], '1']
     [['individual', 'survey'], '1']
     [['individual', 'therapy'], '1']
     [['individual', 'whose'], '1']
     [['individual', 'would'], '1']
     [['individuals', 'who'], '1']
     [['industrial', 'health'], '1']
     [['industrial', 'wages'], '1']
     [['inefficiency', 'price'], '1']
     [['influ', 'ence'], '1']
     [['influence', 'decision'], '1']
     [['influence', 'other'], '1']
     [['influence', 'outpatient'], '1']
     [['infor', 'mation'], '1']
     [['information', 'comprehensive'], '1']
     [['information', 'impossible'], '1']
     [['information', 'only'], '1']
     [['ing', 'real'], '1']
     [['initial', 'barriers'], '1']
     [['initial', 'coverage'], '1']
     [['initial', 'visits'], '1']
     [['initially', 'duri

     [['mentally', 'ill'], '1']
     [['merriam', 'washington'], '1']
     [['michigan', 'rashi'], '1']
     [['milwaukee', 'philip'], '1']
     [['minnesota', 'kenneth'], '1']
     [['morgan', 'internal'], '1']
     [['morgan', 'paul'], '1']
     [['murray', 'street'], '1']
     [['must', 'prepay'], '1']
     [['n', 'bthe'], '1']
     [['n', 'j'], '1']
     [['n', 'morgan'], '1']
     [['national', 'bureau'], '1']
     [['national', 'supported'], '1']
     [['new', 'collective'], '1']
     [['north', 'murray'], '1']
     [['northwestern', 'james'], '1']
     [['now', 'one'], '1']
     [['observatory', 'drive'], '1']
     [['office', 'howard'], '1']
     [['office', 'journal'], '1']
     [['office', 'social'], '1']
     [['offices', 'editor'], '1']
     [['one', 'larget'], '1']
     [['opportunities', 'income'], '1']
     [['other', 'institutions'], '1']
     [['other', 'kinds'], '1']
     [['p', 'dickinson'], '1']
     [['page', 'article'], '1']
     [['paid', 'madison'], '1']
     [[

In [None]:
"""
Create DataFrames:
    art_id; word_ngram_1; freq_ngram_1; 
    art_id; word1_ngram_2; word2_ngram_2; freq_ngram_2; 
    art_id; word1_ngram_3; word2_ngram_3; word3_ngram_3; freq_ngram_3; 
"""        

In [70]:
"""
Define function to create DataFrame for N-Gram-X
"""
def create_DataFrame(columns):
    """Build a long-format DataFrame of n-gram frequencies for one n-gram order.

    The n-gram order is inferred from the number of columns: one ``art_id``
    column, one column per word, and one frequency column. The function reads
    the notebook-level ``check_co_exist``, ``art_id_lsts`` and ``ngrams``.

    Args:
        columns (list[str]): Column titles, e.g.
            ``["art_id", "word1_ngram_2", "word2_ngram_2", "freq_ngram_2"]``.

    Returns:
        pandas.DataFrame: One row per (article, n-gram) pair with the article
        ID, the words, and the frequency as an integer. The first rows are
        also printed.

    Raises:
        AssertionError: If the ID lists were not verified to co-exist, an
            article is missing from the selected data set, or an entry is not
            a ``[words, freq]`` pair with a digit-string frequency.
    """
    # columns = art_id + n words + freq, so n = len(columns) - 2 and the
    # zero-based position of that data set in "ngrams" is n - 1.
    ngram_num = len(columns) - 3

    # Taking the IDs from the first list is only valid when all n-gram
    # directories share the same article IDs.
    assert check_co_exist
    art_id_lst = art_id_lsts[0]

    # ngram -> {ID1: freq_list1, ID2: freq_list2}; the same for every article,
    # so look it up once.
    ngram = ngrams[ngram_num]

    data = []
    for art_id in art_id_lst:
        assert art_id in ngram

        # freq_list -> [[words0, freq0], [words1, freq1]]
        freq_list = ngram[art_id]
        assert len(freq_list) > 0
        assert len(freq_list[0]) >= 2

        for pair in freq_list:
            # pair -> [[words], freq]
            assert len(pair) == 2
            words, freq = pair[0], pair[1]
            assert freq.isdigit()

            # Row layout follows the column titles: ID, each word, frequency.
            # The frequency is stored as an int so the column is numeric.
            data.append([art_id] + list(words) + [int(freq)])

    # The default index already numbers the rows 0..len(data)-1
    dataframe = pd.DataFrame(data, columns=columns)

    # Quick visual check of the result
    print(dataframe.head())

    # Hand the frame back so the caller's variable holds it instead of None
    return dataframe

In [71]:
"""
All Columns features for N-Gram [1-3]
"""
# Column layout per n-gram order: article ID, one column per word, then the
# frequency. The unigram word column carries no position number.
ngram_columns = [
    ["art_id", "word_ngram_1", "freq_ngram_1"],
] + [
    ["art_id"]
    + ["word{0}_ngram_{1}".format(position, order) for position in range(1, order + 1)]
    + ["freq_ngram_{0}".format(order)]
    for order in (2, 3)
]

In [74]:
"""
DataFrame Creation for N-Gram [1-3]
"""
# One call per n-gram order (1, 2, 3), using the matching column layout;
# each call prints the head of the frame it builds.
dataframe1 = create_DataFrame(ngram_columns[0])
dataframe2 = create_DataFrame(ngram_columns[1])
dataframe3 = create_DataFrame(ngram_columns[2])

   art_id word_ngram_1 freq_ngram_1
0  145292    licensing           58
1  145292      written           40
2  145292  examination           32
3  145292         test           32
4  145292      workers           27
   art_id word1_ngram_2 word2_ngram_2 freq_ngram_2
0  145292       written          test           17
1  145292  occupational     licensing           12
2  145292             i             i           11
3  145292       written   examination           11
4  145292     licensing  examinations           10
   art_id word1_ngram_3 word2_ngram_3 word3_ngram_3 freq_ngram_3
0  145292             i             i             i            9
1  145292       journal         human     resources            6
2  145292       written     licensing  examinations            4
3  145292       attempt        obtain       license            2
4  145292    attributes     important       passing            2
