In [1]:
import os
from os import listdir
from os.path import isfile, join

import re
import sys
import time
from collections import Counter

import pandas as pd
import numpy as np

import csv
import zipfile
import pickle

import sqlite3

## [Method] Display Function

In [2]:
"""
Get name of an object
"""
def namestr(obj, namespace):
    """Return every name in ``namespace`` that is bound to ``obj`` itself.

    Matching is done by identity (``is``), not by equality, so two equal but
    distinct objects are not confused with each other.

    :param obj: the object to look up.
    :param namespace: mapping of name -> object (e.g. ``globals()``).
    :return: list of matching names, in the namespace's iteration order.
    """
    matching_names = []
    for candidate in namespace:
        if namespace[candidate] is obj:
            matching_names.append(candidate)
    return matching_names

"""
Display with format
"""
def display(items, func=None, limit=None):
    """Print the global name(s) bound to ``items``, then its elements one per line.

    NOTE: inside a notebook this definition shadows IPython's built-in
    ``display`` helper for the rest of the session.

    :param items: iterable whose elements are printed.
    :param func: optional callable applied to each element before printing.
    :param limit: optional maximum number of elements to print (``None`` = all).
    """
    # Header line: all global variable names that refer to this exact object
    print(namestr(items, globals()))
    for position, element in enumerate(items):
        # Stop once the requested number of elements has been printed
        if limit is not None and position >= limit:
            return
        shown = func(element) if func else element
        print("     {0}".format(shown))

### Test ###
# test_dict = {"A": [1, 2, 3], "B": [4, 5, 6]}
# display(test_dict)

## [Method] Display Progress

In [3]:
def report_progress(progress, total, lbar_prefix = '', rbar_prefix=''):
    """Draw a one-line, 100-character text progress bar on stdout.

    The line ends with a carriage return so the next call overwrites it in
    place; call ``report_progress_done()`` afterwards to move to a new line.

    :param progress: number of units finished so far.
    :param total: number of units overall; ``0`` is reported as 0% rather than
        raising ``ZeroDivisionError``.
    :param lbar_prefix: text printed to the left of the bar.
    :param rbar_prefix: text printed between the bar and the ``progress/total`` counter.
    """
    # Guard the division: an empty job (total == 0) must not crash the caller
    percent = round(progress / float(total) * 100, 2) if total else 0.0
    # Clamp the fill so the bar stays exactly 100 characters wide even when
    # progress overshoots total (or is negative)
    filled = max(0, min(100, int(round(percent))))
    bar = ('#' * filled).ljust(100, '-')
    buf = "{0}|{1}| {2}{3}/{4} {5}% ".format(lbar_prefix, bar,
        rbar_prefix, progress, total, percent)
    sys.stdout.write(buf)
    sys.stdout.write('\r')
    sys.stdout.flush()


def report_progress_done():
    """Finish the in-place progress line by writing a single newline to stdout."""
    print(file=sys.stdout)

### TEST ###
# total = 100
# report_progress(0, total)
# for progress in range(1, total + 1):
#     time.sleep(0.1)
#     report_progress(progress, total)
# report_progress_done()

## [Definition] Define Paths
   * Dictionaries: ./Dictionaries [Culture; Demographic; Relational]
   * Articles: ../Soc_MGT_OB_1980_2018 [Part 001-098]

In [4]:
"""
Paths for Dictionaries
"""
# Dictionaries: one "<name>.csv" file per dictionary under dictionary_root
dictionary_root = "./Dictionaries"
dictionary_name_list = ["Culture", "Demographic", "Relational"]
dictionary_path = {
    name: join(dictionary_root, name + ".csv")
    for name in dictionary_name_list
}

# Articles: directory that is scanned for the article zip archives
articles_root = "../Soc_MGT_OB_1980_2018"

# Database: directory and file name of the result database
db_root = "./"
db_name = "my_result.db"

### TEST ###
display(dictionary_path.items())

[]
     ('Culture', './Dictionaries/Culture.csv')
     ('Demographic', './Dictionaries/Demographic.csv')
     ('Relational', './Dictionaries/Relational.csv')


## [Method] Article Zip File Validation By Filename

In [5]:
""" 
Assert the filename in format "receipt-id-989431-part-XXX.zip"
where XXX stands for article set number
"""
def valid_zip(filename):
    """Check whether ``filename`` has the form ``receipt-id-989431-part-XXX.zip``.

    :param filename: bare file name (no directory part).
    :return: the ``re.Match`` object (truthy) when the name matches, else ``None``.
    """
    # The dot before "zip" is escaped so it only matches a literal "."; the
    # unescaped form accepted any character there (e.g. "...-001xzip").
    return re.match(r"^receipt-id-989431-part-.+\.zip$", filename)

## [Application] Article Zip File Validation By Filename

In [6]:
"""
Get filtering results
"""
# Keep only the entries of articles_root accepted by valid_zip, in lexicographic order
articles_zip_file_list = sorted(filter(valid_zip, os.listdir(articles_root)))
display(articles_zip_file_list, limit=5)

['articles_zip_file_list']
     receipt-id-989431-part-001.zip
     receipt-id-989431-part-002.zip
     receipt-id-989431-part-003.zip
     receipt-id-989431-part-004.zip
     receipt-id-989431-part-005.zip


## [Method] Parse Article Set ID

In [7]:
""" Define function getting article set ID by the zip-file-name
Pattern: receipt-id-989431-part-XXX.zip
"""
def parse_article_set_id(filename):
    """Extract the set number ``XXX`` from ``receipt-id-989431-part-XXX.zip``.

    :param filename: zip file name.
    :return: the captured set ID as a string, or ``None`` (after printing an
        error message) when the name does not contain exactly one match.
    """
    # Raw string with an escaped dot: the unescaped "." matched any character,
    # so names without a real ".zip" extension were parsed as valid.
    id_number_lst = re.findall(r"receipt-id-989431-part-(.+)\.zip", filename)
    if len(id_number_lst) == 1:
        return id_number_lst[0]
    print("Parse_ID Error: Filename does not match pattern. ")
    return None

### TEST ###
# print(parse_article_set_id("receipt-id-752441-part-000.zip"))

## [Method] Parse Article Number ID

In [8]:
""" Define function getting article ID by the file-name
Pattern: journal-article-10.2307_00000000-ngram1.txt
"""
def parse_article_id(filename):
    """Extract the article ID from a ``journal-article-<ID>...`` file name.

    Two layouts are tried in order:
      1. ``journal-article-<ID>-<suffix>`` (e.g. ``...-ngram1.txt``): the ID is
         everything up to the last hyphen.
      2. ``journal-article-<ID>.<ext>`` (e.g. ``....xml``): the ID is everything
         up to the last dot.

    NOTE(review): because layout 1 is tried first, a name of layout 2 whose ID
    itself contains a hyphen would be cut at that hyphen - confirm IDs never
    contain "-".

    :param filename: bare file name.
    :return: the ID string, or ``None`` (after printing an error message) when
        neither layout yields exactly one match.
    """
    # Raw strings: "\." inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    id_number_lst = re.findall(r"journal-article-(.+)-+", filename)
    if len(id_number_lst) == 1:
        return id_number_lst[0]
    id_number_lst = re.findall(r"journal-article-(.+)\.+", filename)
    if len(id_number_lst) == 1:
        return id_number_lst[0]
    print("Parse_ID Error: Filename does not match pattern. ")
    return None

### TEST ###
# An n-gram file name and a metadata file name of the same article should both
# print the same ID.
print(parse_article_id("journal-article-10.2307_00000000-ngram1.txt"))
print(parse_article_id("journal-article-10.2307_00000000.xml"))

10.2307_00000000
10.2307_00000000


## [Method] Parse Directory From Path

In [9]:
def parse_directory_from_path(file_path):
    """Split a single-level ``directory/filename`` path into its two parts.

    :param file_path: path such as ``"metadata/journal-article-10.2307_3323388.xml"``.
    :return: ``(directory, filename)``, or ``None`` (after printing an error
        message) when the path has no directory, several directories, or does
        not split into exactly two parts.
    """
    # Raw string: "\w" in a normal string literal is an invalid escape sequence
    path_directory = re.findall(r"\w+/", file_path)
    parts = file_path.split("/")
    # Checking the split length as well keeps paths such as "/abs/file" (one
    # regex match but three parts) from crashing on tuple unpacking.
    if len(path_directory) == 1 and len(parts) == 2:
        directory, filename = parts
        return directory, filename
    print("Parse_Directory Error: Multiple directories embedded or no directory exist")
    return None
    
# TEST
# A single-level path should print a (directory, filename) tuple.
print(parse_directory_from_path("metadata/journal-article-10.2307_3323388.xml"))

('metadata', 'journal-article-10.2307_3323388.xml')


## [Method] Group and Count Files In Zipfile

In [10]:
def group_and_count(file_paths):
    """Group the article IDs of a zip listing by the directory they live in.

    Only entries whose file name starts with ``"journal"`` are considered.
    Entries whose path or article ID cannot be parsed are skipped (the parsing
    helpers print an error for them) instead of aborting the whole listing.

    :param file_paths: iterable of member paths (str) of the form ``directory/filename``.
    :return: dict mapping directory name -> set of article ID strings.
    """
    groups = dict()
    for file_path in file_paths:
        assert isinstance(file_path, str)
        parsed = parse_directory_from_path(file_path)
        if parsed is None:
            # The helper returns None for paths without exactly one directory
            # level; unpacking that would raise TypeError.
            continue
        path_directory, filename = parsed

        if not filename.startswith("journal"):
            continue
        file_id = parse_article_id(filename)
        if file_id is None:
            # A None ID cannot be compared with the string IDs later on
            continue
        groups.setdefault(path_directory, set()).add(file_id)
    return groups

## [Method] Element Counts For Each Key In Dictionary
* Key -> Directory Name
* Dictionary -> Grouped Dictionaries

In [11]:
def count_for_each_directory(grouped_directories):
    """Report how many entries each directory group contains.

    :param grouped_directories: dict mapping directory name -> sized collection.
    :return: dict mapping directory name -> ``len`` of its collection, with the
        keys inserted in sorted order.
    """
    return {
        directory: len(grouped_directories[directory])
        for directory in sorted(grouped_directories.keys())
    }

## [Method] Find Differences In File IDs Between Folders

In [12]:
def diffs_between_folders(grouped_directories):
    """List the IDs of the reference folder that every other folder lacks.

    The reference folder is the alphabetically first key of
    ``grouped_directories``. Each remaining folder is compared against it with
    ``np.setdiff1d(reference_ids, folder_ids)``, i.e. the sorted unique IDs
    that occur in the reference folder but not in that folder.

    :param grouped_directories: dict mapping folder name -> collection of IDs.
    :return: dict with ``"standard_array_name"`` -> reference folder name,
        followed by one ``folder name -> np.ndarray of missing IDs`` entry per
        remaining folder (in sorted folder order).
    """
    ordered_names = sorted(grouped_directories.keys())
    reference_name = ordered_names[0]
    reference_ids = np.asarray(list(grouped_directories[reference_name]))

    results = {"standard_array_name": reference_name}
    for folder_name in ordered_names[1:]:
        folder_ids = np.asarray(list(grouped_directories[folder_name]))
        results[folder_name] = np.setdiff1d(reference_ids, folder_ids)
    return results

## [Method] Display Filenames In Zipfile

In [13]:
def display_filenames():
    """Collect per-folder file counts and cross-folder ID differences for every zip.

    Reads the module-level ``articles_zip_file_list`` and ``articles_root``.
    Each archive is only listed via ``namelist()``; nothing is extracted.

    :return: ``(counts_in_zipfiles, differences_in_zipfiles)``. Both are dicts
        keyed by the article-set ID parsed from the zip file name; the values
        are the results of ``count_for_each_directory`` and
        ``diffs_between_folders`` for that archive. Archives that fail to open
        or to analyse are reported on stdout and left out of both dicts.
    """
    counts_in_zipfiles = dict()
    differences_in_zipfiles = dict()

    # Iterate Through All Article Sets
    for article_zip_file_name in articles_zip_file_list:

        # Extract Data Set ID
        data_set_id = parse_article_set_id(article_zip_file_name)

        # Path Format => "../Soc_MGT_OB_1980_2018/receipt-id-989431-part-001.zip"
        article_zip_file_path = join(articles_root, article_zip_file_name)

        # TODO: E.g: Zip-003/012 cannot be opened
        try:
            # Read the zip listing without unzipping; the context manager
            # closes the file handle even when listing fails.
            with zipfile.ZipFile(article_zip_file_path) as zip_file:
                file_name_list = zip_file.namelist()

            # {"metadata" : {id1, id2, ..}, "ngram1": {id1', ...}}
            grouped_directories = group_and_count(file_name_list)

            # {"metadata" : 12, "ngram1" : 12, ...}
            num_file_counts = count_for_each_directory(grouped_directories)

            # Compare differences between files in folders
            differences = diffs_between_folders(grouped_directories)
        except Exception as error:
            # Stay best-effort (skip this archive) but say so: a bare
            # "except: continue" silently dropped archives and also swallowed
            # KeyboardInterrupt.
            print("Zip_File Error: skipped {0} ({1}: {2})".format(
                article_zip_file_name, type(error).__name__, error))
            continue

        counts_in_zipfiles[data_set_id] = num_file_counts
        differences_in_zipfiles[data_set_id] = differences

    return counts_in_zipfiles, differences_in_zipfiles

In [14]:
# Scan every listed zip archive once; both results are dicts keyed by article-set ID.
counts_in_zipfiles, differences_in_zipfiles = display_filenames()

In [15]:
"""
Counts
"""
columns = ["set_id", "metadata", "ngram1", "ngram2", "ngram3", "ocr"]
# One row per zip: the set ID followed by that zip's folder counts, taken in
# sorted folder-name order and matched to the columns by position.
data = [
    [zip_id] + [folder_counts[folder_name] for folder_name in sorted(folder_counts.keys())]
    for zip_id, folder_counts in counts_in_zipfiles.items()
]
index = list(range(len(data)))
df_counts = pd.DataFrame(data, columns=columns, index=index)

In [17]:
# Show the per-zip file counts for each folder.
df_counts

Unnamed: 0,set_id,metadata,ngram1,ngram2,ngram3,ocr
0,1,29073,29071,29070,29070,29067
1,2,28891,28893,28893,28892,28895
2,3,29121,29121,29119,29118,29116
3,4,29038,29038,29040,29038,29039
4,5,29202,29201,29197,29196,29192
5,6,28884,28883,28887,28890,28894
6,7,29147,29146,29144,29144,29144
7,8,29066,29065,29064,29062,29062
8,9,28902,28906,28907,28907,28907
9,10,29244,29250,29250,29248,29245


In [18]:
"""
Differences Explicit
"""
columns = ["set_id", "metadata", "ngram1", "ngram2", "ngram3", "ocr"]
# One row per zip: the set ID followed by the values of its differences dict in
# insertion order, matched to the columns by position.
data = [
    [zip_id] + list(folder_diffs.values())
    for zip_id, folder_diffs in differences_in_zipfiles.items()
]

index = list(range(len(data)))
df_diffs = pd.DataFrame(data, columns=columns, index=index)

In [33]:
# Show, per zip, the reference folder name and the arrays of differing IDs per folder.
df_diffs

Unnamed: 0,set_id,metadata,ngram1,ngram2,ngram3,ocr
0,1,metadata,"[10.2307_1042756, 10.2307_23003313]","[10.2307_1042756, 10.2307_23003313, 10.2307_23...","[10.2307_1042756, 10.2307_23003313, 10.2307_23...","[10.2307_1040285, 10.2307_1042756, 10.2307_230..."
1,2,metadata,[],[10.2307_1387656],"[10.2307_1387656, 10.2307_20122454]","[10.2307_1387656, 10.2307_20122454]"
2,3,metadata,[],"[10.2307_23263492, 10.2307_2578266, 10.2307_40...","[10.2307_20850114, 10.2307_23263492, 10.2307_2...","[10.1086_338780, 10.2307_20850114, 10.2307_232..."
3,4,metadata,[],[],"[10.2307_23016305, 10.2307_2668076, 10.2307_30...","[10.2307_2094883, 10.2307_23016305, 10.2307_26..."
4,5,metadata,[10.2307_3010919],"[10.2307_26162514, 10.2307_3010919, 10.2307_41...","[10.2307_1040241, 10.2307_1049672, 10.2307_207...","[10.2307_1040241, 10.2307_1049672, 10.2307_207..."
5,6,metadata,"[10.2307_1171587, 10.2307_3599930]","[10.2307_1171587, 10.2307_3599930]","[10.2307_1171587, 10.2307_23745150, 10.2307_25...","[10.2307_1171587, 10.2307_23745150, 10.2307_25..."
6,7,metadata,"[10.2307_1318720, 10.2307_3807219, 10.2307_410...","[10.2307_1318720, 10.2307_1389092, 10.2307_301...","[10.2307_1318720, 10.2307_1389092, 10.2307_245...","[10.2307_1318720, 10.2307_1389092, 10.2307_245..."
7,8,metadata,"[10.2307_1039510, 10.2307_26290743, 10.2307_40...","[10.2307_1039510, 10.2307_23252599, 10.2307_26...","[10.1086_505277, 10.2307_1039510, 10.2307_2071...","[10.1086_505277, 10.2307_1039510, 10.2307_2071..."
8,9,metadata,[],"[10.2307_30118788, 10.2307_43242608]","[10.2307_2064442, 10.2307_30118788, 10.2307_41...","[10.2307_1317088, 10.2307_2064442, 10.2307_301..."
9,10,metadata,"[10.2307_2063435, 10.2307_24706882, 10.2307_24...","[10.2307_2063435, 10.2307_23159536, 10.2307_24...","[10.1086_428337, 10.2307_2063435, 10.2307_2315...","[10.1086_428337, 10.2307_2063435, 10.2307_2315..."


In [41]:
columns = ["set_id", "np_diff_ids"]
data = []
for _, diff_row in df_diffs.iterrows():
    # Pool the ID arrays of the three n-gram columns and drop duplicates
    # (np.unique also sorts the result); the "ocr" column is not included.
    pooled_ids = np.concatenate(
        [diff_row[column_name] for column_name in ("ngram1", "ngram2", "ngram3")],
        axis=0,
    )
    data.append([diff_row["set_id"], np.unique(pooled_ids)])
index = list(range(len(data)))
df_diffs_sum = pd.DataFrame(data, columns=columns, index=index)

In [42]:
# Show the pooled, de-duplicated n-gram difference IDs per zip.
df_diffs_sum

Unnamed: 0,set_id,np_diff_ids
0,1,"[10.2307_1042756, 10.2307_23003313, 10.2307_23..."
1,2,"[10.2307_1387656, 10.2307_20122454]"
2,3,"[10.2307_20850114, 10.2307_23263492, 10.2307_2..."
3,4,"[10.2307_23016305, 10.2307_2668076, 10.2307_30..."
4,5,"[10.2307_1040241, 10.2307_1049672, 10.2307_207..."
5,6,"[10.2307_1171587, 10.2307_23745150, 10.2307_25..."
6,7,"[10.2307_1318720, 10.2307_1389092, 10.2307_245..."
7,8,"[10.1086_505277, 10.2307_1039510, 10.2307_2071..."
8,9,"[10.2307_2064442, 10.2307_30118788, 10.2307_41..."
9,10,"[10.1086_428337, 10.2307_2063435, 10.2307_2315..."


In [43]:
# Persist the summary frame with pickle.
# NOTE(review): despite the ".gz" extension the file is written through a plain
# binary file handle, so it is an uncompressed pickle stream, not a gzip file.
with open("./df_diffs_sum.gz", "wb") as df_diffs_sum_file:
    pickle.dump(df_diffs_sum, df_diffs_sum_file)

In [44]:
# Reload the summary frame from the pickle file written above.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("./df_diffs_sum.gz", "rb") as df_diffs_sum_file:
    df_diffs_sum = pickle.load(df_diffs_sum_file)

In [45]:
# Show the reloaded frame to check that it survived the pickle round trip.
df_diffs_sum

Unnamed: 0,set_id,np_diff_ids
0,1,"[10.2307_1042756, 10.2307_23003313, 10.2307_23..."
1,2,"[10.2307_1387656, 10.2307_20122454]"
2,3,"[10.2307_20850114, 10.2307_23263492, 10.2307_2..."
3,4,"[10.2307_23016305, 10.2307_2668076, 10.2307_30..."
4,5,"[10.2307_1040241, 10.2307_1049672, 10.2307_207..."
5,6,"[10.2307_1171587, 10.2307_23745150, 10.2307_25..."
6,7,"[10.2307_1318720, 10.2307_1389092, 10.2307_245..."
7,8,"[10.1086_505277, 10.2307_1039510, 10.2307_2071..."
8,9,"[10.2307_2064442, 10.2307_30118788, 10.2307_41..."
9,10,"[10.1086_428337, 10.2307_2063435, 10.2307_2315..."
