# Mapping Dictionary Features To All Articles

## [Definition] Import Dependencies

In [58]:
import os
from os import listdir
from os.path import isfile, join

import re
import sys
import time
from collections import Counter

import pandas as pd
import numpy as np

import csv
import zipfile

import sqlite3

## [Method] Display Fuction

In [59]:
"""
Get name of an object
"""
def namestr(obj, namespace):
    return [name for name in namespace if namespace[name] is obj]

"""
Display with format
"""
def display(items, func=None, limit=None):
    # Print Variable Name
    print(namestr(items, globals()))
    # Print Content
    count = 0
    for item in items:
        # Consider Limit
        if limit is not None and count >= limit:
            return
        # Consider Exerted Function
        if func:
            item = func(item)
        # Print Each Item
        print("     {0}".format(item))
        count += 1

### Test ###
# test_dict = {"A": [1, 2, 3], "B": [4, 5, 6]}
# display(test_dict)

## [Method] Display Progress

In [60]:
def report_progress(progress, total, lbar_prefix = '', rbar_prefix=''):
    percent = round(progress / float(total) * 100, 2)
    buf = "{0}|{1}| {2}{3}/{4} {5}% ".format(lbar_prefix, ('#' * round(percent)).ljust(100, '-'),
        rbar_prefix, progress, total, percent)
    sys.stdout.write(buf)
    sys.stdout.write('\r')
    sys.stdout.flush()


def report_progress_done():
    sys.stdout.write('\n')

### TEST ###
# total = 100
# report_progress(0, total)
# for progress in range(1, total + 1):
#     time.sleep(0.1)
#     report_progress(progress, total)
# report_progress_done()

## [Definition] Define Paths
   * Dictionaries: ./Dictionaries [Culture; Demographics; Relational]
   * Articles: ../All_Articles [Part 001-098]

In [61]:
"""
Paths for Dictionaries
"""
dictionary_root = "./Dictionaries"
dictionary_path = {}
dictionary_name_list = [
    "Culture",
    "Demographic",
    "Relational",
]

for dictionary_name in dictionary_name_list:
    dictionary_path[dictionary_name] = join(dictionary_root, dictionary_name + ".csv")

"""
Paths for Articles
"""
articles_root = "../All_Articles"

"""
Paths for Database
"""
db_root = "./"
db_name = "my_result.db"

### TEST ###
display(dictionary_path.items())

[]
     ('Culture', './Dictionaries/Culture.csv')
     ('Demographic', './Dictionaries/Demographic.csv')
     ('Relational', './Dictionaries/Relational.csv')


## [Method] Article Zip File Validation By Filename

In [62]:
""" 
Assert the filename in format "receipt-id-752441-part-XXX.zip"
where XXX stands for article set number
"""
def valid_zip(filename):
    return re.match("^receipt-id-752441-part-.+.zip$", filename)

## [Application] Article Zip File Validation By Filename

In [63]:
"""
Get filtering results
"""
articles_zip_file_list = sorted([filename for filename in os.listdir(articles_root) if valid_zip(filename)])
display(articles_zip_file_list, limit=5)

['articles_zip_file_list']
     receipt-id-752441-part-001.zip
     receipt-id-752441-part-002.zip
     receipt-id-752441-part-003.zip
     receipt-id-752441-part-004.zip
     receipt-id-752441-part-005.zip


## [Method] Read Dictionaries' Content
    * DataFrame: [Subject; N-Gram; Words]

In [64]:
def create_dictionary_dataframe():
    """
    Data
    """
    data = []

    # Iterate Through All Dictionaries
    for subject_path_pair in dictionary_path.items():
        # (Subject, Path) -> ('Culture', './Culture.csv')
        subject, path = subject_path_pair[0], subject_path_pair[1]
        # Iterate Through All Words In The Dictionary
        # Load The .CSV File
        with open(path, encoding='ISO-8859-1') as csv_file:
            # Define A Line In Data -> [subject, n-gram, words]
            dataline = []
            # We Do Not Split In Case When There're Multiple Words In A Row
            # Since We Store Words As One String In DataFrame
            rows = csv.reader(csv_file)
            for row in rows:
                n_number = len(row)
                if n_number <= 0:
                    continue
                words = row[0].strip()
                dataline = [subject, n_number, words]
                data.append(dataline)

    """
    Columns
    """
    columns = ["Subject", "N-Gram", "Words"]

    """
    Index
    """
    index = list(range(len(data)))

    """
    DataFrame
    """
    dataframe = pd.DataFrame(data, columns=columns, index=index)
    
    return dataframe

## [Application] Read Dictionaries' Content

In [65]:
dataFrame_dictionary = create_dictionary_dataframe()

### TEST ###
dataFrame_dictionary.head()

Unnamed: 0,Subject,N-Gram,Words
0,Culture,1,ambiguity
1,Culture,1,ambiguous
2,Culture,1,appropriate
3,Culture,1,avoidance inspection
4,Culture,1,bureaucratization


## [Method] Parse Article Set ID

In [66]:
""" Define function getting article set ID by the zip-file-name
Pattern: receipt-id-752441-part-XXX.zip
"""
def parse_article_set_id(filename):
    id_number_lst = re.findall("receipt-id-752441-part-(.+).zip", filename)
    if len(id_number_lst) == 1:
        return id_number_lst[0]
    print("Parse_ID Error: Filename does not match pattern. ")
    return None

### TEST ###
# print(parse_article_set_id("receipt-id-752441-part-000.zip"))

## [Method] Parse Article ID

In [67]:
""" Define function getting article ID by the file-name
Pattern: journal-article-10.2307_00000000-ngram1.txt
"""
def parse_article_id(filename):
    id_number_lst = re.findall("journal-article-(.+)-ngram.+", filename)
    if len(id_number_lst) == 1:
        return id_number_lst[0]
    print("Parse_ID Error: Filename does not match pattern. ")
    return None

### TEST ###
# print(parse_article_set_id("journal-article-10.2307_00000000-ngram1.txt"))

## [Method] Filter Filenames For Test Files 

In [68]:
"""
For every file in ngram1/ folder, check the filename validity, 
extract the article ID, then search if same ID exist in ngram2/3 folders
- Expected filename format: journal-article-10.2307_00000000-ngram1.txt
@return: {article_id : [T/F, T/F, T/F]}
"""
def filter_by_filename(files_list):
    filtered_list = {}
    for filename in files_list:
        assert isinstance(filename, str)
        if filename.startswith("metadata"):
            continue
        # Get n_number
        n_number = int(re.findall("^ngram(.)/", filename)[0])
        # Check if the filename starts with "journal-article"
        filename = filename[len("ngram" + str(n_number) + "/"):]
        if not filename.startswith("journal-article"):
            continue
        # Get article id
        article_id = parse_article_id(filename)
        if article_id in filtered_list.keys():
            filtered_list[article_id][n_number - 1] = True
        else:   
            # Initialize existence
            existence = [False] * 3
            existence[n_number - 1] = True
            filtered_list[article_id] = existence
            
    return filtered_list

## [Method] Check Word Validity For Test Files

In [69]:
# Functions checking word attributes (single-letter, starts/ends with numebr)
def is_single_letter(word):
    assert isinstance(word, str)
    return len(word) <= 1

def starts_with_number(word):
    assert isinstance(word, str)
    try:
        return word[0].isdigit()
    except:
        return False

def ends_with_number(word):
    assert isinstance(word, str)
    try:
        return word[len(word) - 1].isdigit()
    except:
        return False

# Summary of check functions
check_funcs = [
    is_single_letter, 
    starts_with_number, 
    ends_with_number,
]

## [Method] Read N-Gram File And Return Freq List

In [70]:
"""
Read the file without unzipping.
@return: freq_list containing all words (all-n-gram) with corresponding freq
"""
def get_freq_list(n_number, article_id, zip_file):
    
    # Read Without Unzipping
    ngram_type = "ngram" + str(n_number)

    # Expected Path: ngram1/journal-article-10.2307_3110425-ngram1.txt
    article_path = ngram_type + "/"  + "journal-article-" + article_id + "-" + ngram_type + ".txt"
    try:
        article_open = zip_file.open(article_path, mode="r")
    except IOError:
        print("Error opening file {0}".format(articles_path))
        exit(0)

    # Initiate freq_list -> [[words0, freq0], [words1, freq1]]
    freq_list = []

    # Read By Lines
    for line in article_open:
        line = line.decode("utf-8")

        # pair -> "["word1", "word2", "word3", "5"]
        pair = line.strip().split()
        assert len(pair) >= 2

        # Separate word/freq
        words, freq = pair[:-1], pair[-1]
        assert freq.isdigit()
        
        # Words -> "word1 word2 word3"
        words = " ".join(words)

        # Append new pair to freq_list
        freq_list.append([words, freq])

    # Close reading file
    article_open.close()
    
    return freq_list

## [Method] Perform Mapping Process And Return Rate

In [71]:
"""
Mapping words in the freq_list to the dictionaries and get the match rates
@return: [Culture_Rate, Demographic_Rate, Relational_Rate]
"""
def get_mapping_rate(freq_list):
    
    # Initialize the match_counts => {subject : count} ...
    match_counts = {}
    for subject in dictionary_name_list:
        match_counts[subject] = 0
    
    # Iterate Through Each Word In freq_list 
    for words_freq_pair in freq_list:
        words, freq = words_freq_pair
        
        # Identify n_number
        n_number = len(words)
        
        # Check Through Every Subject Dictionary
        for subject in dictionary_name_list:
            selected_dictionary = dataFrame_dictionary[(dataFrame_dictionary['Subject'] == subject) & 
                                          (dataFrame_dictionary['N-Gram'] == n_number)]
            if selected_dictionary['Words'].str.contains(words).any():
                match_counts[subject] += 1
    
    match_rates = [
        match_counts["Culture"] / len(freq_list),
        match_counts["Demographic"] / len(freq_list),
        match_counts["Relational"] / len(freq_list),
    ]

    return match_rates

## [Method] Map Each File's Content To The Dictionary To Calculate The Rate
    * DataFrame [Set_ID; File_ID; N1_Culture; N1_Demographic; N1_Relational; N2_Culture; N2_Demographic; N2_Relational; N3_Culture; N3_Demographic; N3_Relational; Culture_Rate; Demographic_Rate; Relational_Rate; Classification;]

In [85]:
"""
The function runs the mapping algorithm and stores every result into the database
@param break_point: the article_id of the last article the last time we finished 
"""
def create_mapping_operation(break_point=None):
    
    # Connect to the database "map_result.db"
    conn = sqlite3.connect(join(db_root, db_name))
    # Create Cursor object so that we can execute SQL commands
    cur = conn.cursor()
    
    """
    Columns
    """
    columns = ["Set_ID", "File_ID", 
               "N1_Culture", "N1_Demographic", "N1_Relational", 
               "N2_Culture", "N2_Demographic", "N2_Relational", 
               "N3_Culture", "N3_Demographic", "N3_Relational",
               "Culture_Rate", "Demographic_Rate", "Relational_Rate", "Classification"]
    
    """
    Data
    """
    data = []
    
    # Iterate Through All Article Sets
    for article_zip_file_name in articles_zip_file_list:
        
        # Extract Data Set ID
        data_set_id = parse_article_set_id(article_zip_file_name)
        
        # Path Format => "../All_Articles/receipt-id-752441-part-000.zip"
        article_zip_file_path = join(articles_root, article_zip_file_name)
        
        # Read The Zip File Without Unzipping
        zip_file = zipfile.ZipFile(article_zip_file_path)
        file_name_list = zip_file.namelist()
        
        # Filter by filename
        filtered_file_list = filter_by_filename(file_name_list)
        
        # Count Progress
        total_progress = len(filtered_file_list)
        report_progress(0, total_progress)
        count = 0
        
        # Catch Error
        try:
            # Start Working (continue from the last break point)
            start_index = True if break_point is None else False
            
            # Iterate Each Article Through the filtered_file_list
            for article_info in filtered_file_list.items():
                article_id, existence = article_info
                
                # Track Progress
                count += 1
                report_progress(count, total_progress)
                
                # Set a starting article_id
                if not start_index:
                    if article_id == break_point:
                        start_index = True
                    continue

                # Match_Rates => [rates_for_n_1, rates_for_n_2, rates_for_n_3]
                match_rates = [None] * len(existence)

                # Iterate Through N-Gram Folders
                for i in range(len(existence)):
                    n_number = i + 1

                    # If the file exists
                    if existence[i]:
                        freq_list = get_freq_list(n_number, article_id, zip_file)

                        if len(freq_list) >= 1:
                            # [Culture_Rate, Demographic_Rate, Relational_Rate]
                            match_rates[i] = get_mapping_rate(freq_list)
                        else:
                            match_rates[i] = [None] * 3
                    else:
                        match_rates[i] = [None] * 3

                # Add data to dataline
                dataline = [
                    data_set_id, 
                    article_id,
                ]

                # Add N1_Culture; N1_Demographic; N1_Relational; ...
                for match_rate_list in match_rates:
                    for rate in match_rate_list:
                        dataline.append(rate)

                # Add Culture_Rate; Demographic_Rate; Relational_Rate;
                subject_rates = [sum([match_rate_list[i] if match_rate_list[i] is not None else 0 for match_rate_list in match_rates]) for i in range(len(dictionary_name_list))]
                dataline.extend(subject_rates)

                # Add Classification
                dataline.append(dictionary_name_list[subject_rates.index(max(subject_rates))])
                
                # Write the dataline into the database
                assert len(dataline) == len(columns)
                insert_value = "insert into map_result values " \
                    "('{set_id}', '{file_id}', " \
                    "{n1_culture}, {n1_demographic}, {n1_relational}, " \
                    "{n2_culture}, {n2_demographic}, {n2_relational}, " \
                    "{n3_culture}, {n3_demographic}, {n3_relational}, " \
                    "{culture_rate}, {demographic_rate}, {relational_rate}, '{classification}')".format(set_id=dataline[0], file_id=dataline[1], 
                                                                                                       n1_culture=dataline[2], n1_demographic=dataline[3], n1_relational=dataline[4], 
                                                                                                       n2_culture=dataline[5], n2_demographic=dataline[6], n2_relational=dataline[7],
                                                                                                       n3_culture=dataline[8], n3_demographic=dataline[9], n3_relational=dataline[10],
                                                                                                       culture_rate=dataline[11], demographic_rate=dataline[12], relational_rate=dataline[13], classification=dataline[14])
                cur.execute(insert_value)
                conn.commit()
                
                # # Append value to data
                # data.append(dataline)

        except Exception as e:
            print(e)
            cur.close()
            conn.close()
            pass
                
        # Report Progree Done        
        report_progress_done()   
        
        break
    
    # Close the database and cursor
    cur.close()
    conn.close()

#     """
#     Index
#     """
#     index = list(range(len(data)))

#     """
#     DataFrame
#     """
#     dataframe = pd.DataFrame(data, columns=columns, index=index)
    
#     return dataframe

## [Application] Map Each File's Content To The Dictionary To Calculate The Rate

In [None]:
create_mapping_operation(break_point="10.2307_40405624")

|----------------------------------------------------------------------------------------------------| 62/36267 0.17% 

In [None]:
# dataFrame_mapping = create_mapping_dataframe()

### TEST ###
# dataFrame_mapping.head()

## Run This If Empty Database

In [53]:
# Connect to the database "map_result.db"
conn = sqlite3.connect(join(db_root, db_name))
# Create Cursor object so that we can execute SQL commands
cur = conn.cursor()
# Create table
create_table = "create table books (title text, author text, lang text) "
create_table = 'create table if not exists map_result ' \
    '(set_id text, file_id text, ' \
    'n1_culture real, n1_demographic real, n1_relational real, ' \
    'n2_culture real, n2_demographic real, n2_relational real, ' \
    'n3_culture real, n3_demographic real, n3_relational real, ' \
    'culture_rate real, demographic_rate real, relational_rate real, classification text) '
cur.execute(create_table)

# # Insert value
# insert_value = "insert into map_result values " \
#     "('set_id_sample', 'file_id_sample', " \
#     "0.01, 0.01, 0.01, " \
#     "0.02, 0.02, 0.02, " \
#     "0.03, 0.03, 0.03, " \
#     "0.99, 0.99, 0.99, 'culture')"
# cur.execute(insert_value)

# Commit the changes and close
conn.commit()
cur.close()
conn.close()


In [83]:
# Connect to the database "map_result.db"
conn = sqlite3.connect(join(db_root, db_name))
# Create Cursor object so that we can execute SQL commands
cur = conn.cursor()
cur.execute('select * from map_result')
display(cur.fetchall()[-1])
cur.close()
conn.close()

[]
     001
     10.2307_40405624
     0.014072847682119206
     0.0024834437086092716
     0.015728476821192054
     0.0
     0.0
     0.0
     0.0
     0.0
     0.0
     0.014072847682119206
     0.0024834437086092716
     0.015728476821192054
     Relational
