# Mapping Dictionary Features To All Articles

## [Definition] Import Dependencies

In [1]:
import os
from os import listdir
from os.path import isfile, join

import re
import sys
import time
from collections import Counter

import pandas as pd
import numpy as np

import csv
import zipfile

import sqlite3
import pickle

import logging
import datetime
import sys

import multiprocessing

## [Method] Logger Setup

In [22]:
def setup_logger(name):
    now = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
    formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s',
                                  datefmt='%Y-%m-%d %H:%M:%S')
    handler = logging.FileHandler('log/{}.log'.format(now), mode='w')
    handler.setFormatter(formatter)

    screen_handler = logging.StreamHandler(stream=sys.stdout)
    screen_handler.setFormatter(formatter)

    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    logger.addHandler(handler)
    logger.addHandler(screen_handler)
    
    return logger

## [Method] Display Fuction

In [23]:
"""
Get name of an object
"""
def namestr(obj, namespace):
    return [name for name in namespace if namespace[name] is obj]

"""
Display with format
"""
def display(items, func=None, limit=None):
    # Print Variable Name
    print(namestr(items, globals()))
    # Print Content
    count = 0
    for item in items:
        # Consider Limit
        if limit is not None and count >= limit:
            return
        # Consider Exerted Function
        if func:
            item = func(item)
        # Print Each Item
        print("     {0}".format(item))
        count += 1

### Test ###
# test_dict = {"A": [1, 2, 3], "B": [4, 5, 6]}
# display(test_dict)

## [Method] Display Progress

In [56]:
def report_progress(progress, total, avg_time='', lbar_prefix = '', rbar_prefix=''):
    percent = round(progress / float(total) * 100, 2)
    buf = "{0}|{1}| {2}{3}/{4} {5} [{6}]% ".format(lbar_prefix, ('#' * round(percent)).ljust(100, '-'),
        rbar_prefix, progress, total, percent, avg_time)
    sys.stdout.write(buf)
    sys.stdout.write('\r')
    sys.stdout.flush()



## TEST ###
# total = 100
# report_progress(0, total)
# for progress in range(1, total + 1):
#     time.sleep(0.01)
#     report_progress(progress, total)

## [Definition] Define Paths
   * Dictionaries: ./Dictionaries [Culture; Demographics; Relational]
   * Articles: ../All_Articles [Part 001-098]

In [2]:
"""
Paths for Dictionaries
"""
dictionary_root = "./Dictionaries"
dictionary_path = {}
dictionary_name_list = [
    "Culture",
    "Demographic",
    "Relational",
]

for dictionary_name in dictionary_name_list:
    dictionary_path[dictionary_name] = join(dictionary_root, dictionary_name + ".csv")

"""
Paths for Articles
"""
zip_articles_root = "../Soc_MGT_OB_1980_2018"
extracted_articles_root = "../ExtractedZipFiles"
demo_files_root = "../ExtractedZipFiles/demo"

"""
Paths for Database
"""
db_root = "./"
db_name = "map_result.db"

"""
File ID Iteration
"""
standard_folder = "metadata"
ngram_types = ["ngram1", "ngram2", "ngram3"]

"""
Load file ID differences
"""
with open("./df_diffs_sum.gz", "rb") as df_diffs_file:
    df_diffs_sum = pickle.load(df_diffs_file)


### TEST ###
# display(dictionary_path.items())

## [Method] Article Zip File Validation By Filename

In [27]:
""" 
Assert the filename in format "receipt-id-989431-part-001"
where XXX stands for article set number
"""
def valid_file_set(filename):
    return re.match("^receipt-id-989431-part-(.+)$", filename)

### TEST ###
# print(valid_file_set("receipt-id-989431-part-001")[0])

receipt-id-989431-part-001


## [Application] Article Zip File Validation By Filename

In [28]:
"""
Get filtering results
"""
articles_file_set_list = sorted([filename for filename in os.listdir(extracted_articles_root) if valid_file_set(filename)])
display(articles_file_set_list, limit=5)


['articles_file_set_list']
     receipt-id-989431-part-014


## [Method] Parse Article Set ID

In [29]:
""" Define function getting article set ID by the zip-file-name
Pattern: receipt-id-989431-part-XXX.zip
"""
def parse_article_set_id(filename):
    id_number_lst = re.findall("receipt-id-989431-part-(.+)$", filename)
    if len(id_number_lst) == 1:
        return id_number_lst[0]
    print("Parse_ID Error: Filename does not match pattern. ")
    return None

### TEST ###
# print(parse_article_set_id("receipt-id-989431-part-014"))

014


## [Method] Parse Article Number ID

In [30]:
""" Define function getting article ID by the file-name
Pattern: journal-article-10.1086_210007-ngram1.txt
"""
def parse_article_id(filename):
    id_number_lst = re.findall("journal-article-(.+)-+", filename)
    if len(id_number_lst) == 1:
        return id_number_lst[0]
    id_number_lst = re.findall("journal-article-(.+)\.+", filename)
    if len(id_number_lst) == 1:
        return id_number_lst[0]
    print("Parse_ID Error: Filename does not match pattern. ")
    return None

### TEST ###
# print(parse_article_id("journal-article-10.1086_210007-ngram1.txt"))
# print(parse_article_id("journal-article-10.1086_210007.xml"))

10.1086_210007
10.1086_210007


## [Method] Read Dictionaries' Content
    * DataFrame: [Subject; N-Gram; Words]

In [31]:
def create_dictionary_dataframe():
    """
    Data
    """
    data = []

    # Iterate Through All Dictionaries
    for subject_path_pair in dictionary_path.items():
        # (Subject, Path) -> ('Culture', './Culture.csv')
        subject, path = subject_path_pair[0], subject_path_pair[1]
        # Iterate Through All Words In The Dictionary
        # Load The .CSV File
        with open(path, encoding='ISO-8859-1') as csv_file:
            # Define A Line In Data -> [subject, n-gram, words]
            dataline = []
            # We Do Not Split In Case When There're Multiple Words In A Row
            # Since We Store Words As One String In DataFrame
            rows = csv.reader(csv_file)
            for row in rows:
                n_number = len(row)
                if n_number <= 0:
                    continue
                words = " ".join(row)
                ngram_type = "ngram{n_number}".format(n_number=n_number)
                dataline = [subject, ngram_type, words]
                data.append(dataline)

    """
    Columns
    """
    columns = ["Subject", "N-Gram", "Words"]

    """
    Index
    """
    index = list(range(len(data)))

    """
    DataFrame
    """
    dataframe = pd.DataFrame(data, columns=columns, index=index)
    
    return dataframe

## [Application] Read Dictionaries' Content

In [32]:
dataFrame_dictionary = create_dictionary_dataframe()

### TEST ###
# dataFrame_dictionary.head()

Unnamed: 0,Subject,N-Gram,Words
0,Culture,ngram1,ambiguity
1,Culture,ngram1,ambiguous
2,Culture,ngram1,appropriate
3,Culture,ngram2,avoidance inspection
4,Culture,ngram1,bureaucratization


## [Method] Filter Filenames For Test Files 

In [33]:
"""
For every file in ngram1/ folder, check the filename validity, 
extract the article ID, then search if same ID exist in ngram2/3 folders
- Expected filename format: journal-article-10.2307_00000000-ngram1.txt
@return: {article_id : [T/F, T/F, T/F]}
"""
def filter_by_filename(files_list):
    filtered_list = {}
    for filename in files_list:
        assert isinstance(filename, str)
        if filename.startswith("metadata"):
            continue
        # Get n_number
        n_number = int(re.findall("^ngram(.)/", filename)[0])
        # Check if the filename starts with "journal-article"
        filename = filename[len("ngram" + str(n_number) + "/"):]
        if not filename.startswith("journal-article"):
            continue
        # Get article id
        article_id = parse_article_id(filename)
        if article_id in filtered_list.keys():
            filtered_list[article_id][n_number - 1] = True
        else:   
            # Initialize existence
            existence = [False] * 3
            existence[n_number - 1] = True
            filtered_list[article_id] = existence
            
    return filtered_list

## [Method] Check Word Validity For Test Files

In [34]:
# Functions checking word attributes (single-letter, starts/ends with numebr)
def is_single_letter(word):
    assert isinstance(word, str)
    return len(word) <= 1

def starts_with_number(word):
    assert isinstance(word, str)
    try:
        return word[0].isdigit()
    except:
        return False

def ends_with_number(word):
    assert isinstance(word, str)
    try:
        return word[len(word) - 1].isdigit()
    except:
        return False

# Summary of check functions
check_funcs = [
    is_single_letter, 
    starts_with_number, 
    ends_with_number,
]

## [Method] Read N-Gram File And Return Freq List

In [35]:
"""
@return: [["word1 word2 word3", 5], ["word4 word5 word6", 2], ...]
         freq_list containing all words (all-n-gram) with corresponding freq
"""
def get_freq_list(file_path):
    
    with open(file_path, mode="r", encoding="utf-8") as article_open:

        # Initiate freq_list -> [[words0, freq0], [words1, freq1]]
        freq_list = []

        # Read By Lines
        for line in article_open:
            
            # pair -> "["word1", "word2", "word3", "5"]
            pair = line.strip().split()
            assert len(pair) >= 2

            # Separate word/freq
            words, freq = pair[:-1], pair[-1]
            assert freq.isdigit()
            
            check_words = [check_func(word) for word in words for check_func in check_funcs]
            
            if any(check_words):
                continue

            # Words -> "word1 word2 word3"
            words = " ".join(words)

            # Append new pair to freq_list
            freq_list.append([words, freq])
    
    return freq_list

## [Method] Perform Mapping Process And Return Rate

In [36]:
"""
Mapping words in the freq_list to the dictionaries and get the match rates
@return: [Culture_Rate, Demographic_Rate, Relational_Rate]
"""
def get_mapping_rate(file_path, ngram_type):
    """
    :param file_path: complete directory path of the file
    :param ngram_type: "ngram1" /OR/ "ngram2" /OR/ "ngram3"
    :return: Mapping_Rates, Success => [Culture_Rate, Demographic_Rate, Relational_Rate], True
                                    => [None, None, None], False
    """
    
    # If filepath does not exist, return [None, None, None]
    if not isfile(file_path):
        return list([None, None, None]), False
    
    # Get frequent list
    freq_list = get_freq_list(file_path)
    
    # Initialize the match_counts => {"Culture" : 0, ...} 
    match_counts = {}
    for subject in dictionary_name_list:
        match_counts[subject] = 0
    
    # Iterate Through Each Word In freq_list 
    # -> [["word1 word2 word3", freq], ...]
    for words_freq_pair in freq_list:
        words, freq = words_freq_pair
        
        # Check Through Every Subject Dictionary
        for subject in dictionary_name_list:
            selected_dictionary = dataFrame_dictionary[(dataFrame_dictionary['Subject'] == subject) & 
                                          (dataFrame_dictionary['N-Gram'] == ngram_type)]
            if selected_dictionary['Words'].str.contains(words).any():
                match_counts[subject] += 1
    
    match_rates = [
        match_counts["Culture"] / len(freq_list),
        match_counts["Demographic"] / len(freq_list),
        match_counts["Relational"] / len(freq_list),
    ]

    return match_rates, True

## [Method] Map Each File's Content To The Dictionary To Calculate The Rate
    * DataFrame [Set_ID; File_ID; N1_Culture; N1_Demographic; N1_Relational; N2_Culture; N2_Demographic; N2_Relational; N3_Culture; N3_Demographic; N3_Relational; Culture_Rate; Demographic_Rate; Relational_Rate; Classification;]

In [37]:
def manage_file(file_id, file_set_id, file_set_path):

    dataline = []
    dataline.append(file_set_id)
    dataline.append(file_id)

    # Iterate through all folders
    # -> ["ngram1", "ngram2", "ngram3"]
    cdr_rates = list([0, 0, 0])
    for ngram_type in ngram_types:
        folder_path = join(file_set_path, ngram_type)

        file_name = "journal-article-{file_id}-{ngram_type}.txt".format(file_id=file_id, 
                                                                          ngram_type=ngram_type)
        file_path = join(folder_path, file_name)

        # culture_rate, demographic_rate, relational_rate = [0.15, 0.093, 0.125]
        cdr_ngram_rates, success = get_mapping_rate(file_path, ngram_type) 
        if success:
            cdr_rates = [sum(pair) for pair in zip(cdr_rates, cdr_ngram_rates)]
        else:
            logger.warning("[File_Not_Found] => [FileSetID: {file_set_id}], \
            [FileID: {file_id}], [NGramType: {ngram_type}]".format(file_set_id=file_set_id,
                                                                   file_id=file_id,
                                                                   ngram_type=ngram_type))

        dataline.extend(cdr_ngram_rates)
    # Add general prediction probabilities
    dataline.extend(cdr_rates)

    # Get prediction
    prediction = dictionary_name_list[cdr_rates.index(max(cdr_rates))]
    dataline.append(prediction)

    return dataline


In [58]:
"""
The function runs the mapping algorithm and stores every result into the database
@param break_point: the article_id of the last article the last time we finished running the function
"""
def create_mapping_operation(break_point=None):
    
    logger = setup_logger("Mapping")
    
    logger.info("Connecting to DB")
    
    # Connect to the database "map_result.db"
    conn = sqlite3.connect(join(db_root, db_name))
    # Create Cursor object so that we can execute SQL commands
    cur = conn.cursor()
    
    logger.info("Start Iterative Mapping")

    # In case of non-closing database cursor which left db open
    try:
        # Iterate Through All Article Sets 
        # -> ["receipt-id-989431-part-014", "receipt-id-989431-part-013", ...]
        for article_file_set_name in articles_file_set_list:
            
            pool = multiprocessing.Pool(14)

            # Extract Data Set ID
            # -> "014"
            file_set_id = parse_article_set_id(article_file_set_name)
            file_set_path = join(extracted_articles_root, article_file_set_name)
            
            logger.info("Computing File Set: [{file_set_id}]".format(file_set_id=file_set_id))

            # Get file IDs from "metadata" folder
            # -> ["10.1086_210007", "10.1086_1038856", ...]
            file_ids = [parse_article_id(filename) for filename in listdir(join(file_set_path, standard_folder))]
            
            index = 1
            if break_point is not None:
                index = file_ids.index(break_point)
                file_ids = file_ids[index+1 : ]
                
            logger.info("Multi-Processing Files ")
            
            pool = multiprocessing.Pool(8)
            results = [pool.apply_async(manage_file, (file_id, file_set_id, file_set_path)) for file_id in file_ids]
            
            logger.info("Async Running -- To Be Completed")
            
            start = time.time()
            count, total = index, len(file_ids)
            for result in results:
                dataline = result.get()

                # Write the dataline into the database
                insert_value = "insert into map_result values " \
                    "('{set_id}', '{file_id}', " \
                    "{n1_culture}, {n1_demographic}, {n1_relational}, " \
                    "{n2_culture}, {n2_demographic}, {n2_relational}, " \
                    "{n3_culture}, {n3_demographic}, {n3_relational}, " \
                    "{culture_rate}, {demographic_rate}, {relational_rate}, '{classification}')".format(set_id=dataline[0], file_id=dataline[1], 
                                                                                                       n1_culture=dataline[2], n1_demographic=dataline[3], n1_relational=dataline[4], 
                                                                                                       n2_culture=dataline[5], n2_demographic=dataline[6], n2_relational=dataline[7],
                                                                                                       n3_culture=dataline[8], n3_demographic=dataline[9], n3_relational=dataline[10],
                                                                                                       culture_rate=dataline[11], demographic_rate=dataline[12], relational_rate=dataline[13], classification=dataline[14])
                cur.execute(insert_value)
                conn.commit()
                
                # Update Progress
                count += 1
                avg_time = (time.time() - start) / count
                report_progress(count, total, avg_time)

                
    except Exception as e:
        print("Error Message: {}\n".format(e))
        cur.close()
        conn.close()
        return
        
    # Close the database and cursor
    cur.close()
    conn.close()
    
    return

## [Method] Get Breakpoint Article ID

In [59]:
def get_breakpoint_article_id():
    # Connect to the database "map_result.db"
    conn = sqlite3.connect(join(db_root, db_name))
    # Create Cursor object so that we can execute SQL commands
    cur = conn.cursor()
    # Select all data entries from the table 
    cur.execute('SELECT * FROM map_result')
    # Display all data collected
    database_collection = cur.fetchall()
    print("Total Count: {count}".format(count=len(database_collection)))
    # Close the cursor and the database
    cur.close()
    conn.close()
    
    if len(database_collection) > 0:
        display(database_collection[-1])
        breakpoint_id = database_collection[-1][1]
        return breakpoint_id
    return None
        


### TEST ###
# breakpoint_id = get_breakpoint_article_id()

## [Application] Map Each File's Content To The Dictionary To Calculate The Rate

In [60]:
create_mapping_operation(break_point=None)

2019-02-27 16:51:32 INFO     Connecting to DB
2019-02-27 16:51:32 INFO     Connecting to DB
2019-02-27 16:51:32 INFO     Connecting to DB
2019-02-27 16:51:32 INFO     Connecting to DB
2019-02-27 16:51:32 INFO     Connecting to DB
2019-02-27 16:51:32 INFO     Connecting to DB
2019-02-27 16:51:32 INFO     Connecting to DB
2019-02-27 16:51:32 INFO     Connecting to DB
2019-02-27 16:51:32 INFO     Start Iterative Mapping
2019-02-27 16:51:32 INFO     Start Iterative Mapping
2019-02-27 16:51:32 INFO     Start Iterative Mapping
2019-02-27 16:51:32 INFO     Start Iterative Mapping
2019-02-27 16:51:32 INFO     Start Iterative Mapping
2019-02-27 16:51:32 INFO     Start Iterative Mapping
2019-02-27 16:51:32 INFO     Start Iterative Mapping
2019-02-27 16:51:32 INFO     Start Iterative Mapping
2019-02-27 16:51:32 INFO     Computing File Set: [014]
2019-02-27 16:51:32 INFO     Computing File Set: [014]
2019-02-27 16:51:32 INFO     Computing File Set: [014]
2019-02-27 16:51:32 INFO     Computing File

Process ForkPoolWorker-378:
Process ForkPoolWorker-368:
Process ForkPoolWorker-373:
Process ForkPoolWorker-372:
Process ForkPoolWorker-377:
Process ForkPoolWorker-376:
Process ForkPoolWorker-367:
Process ForkPoolWorker-375:
Process ForkPoolWorker-379:
Process ForkPoolWorker-374:
Process ForkPoolWorker-369:
Process ForkPoolWorker-371:
Process ForkPoolWorker-370:
Process ForkPoolWorker-366:


KeyboardInterrupt: 

Process ForkPoolWorker-384:
Process ForkPoolWorker-381:
Process ForkPoolWorker-385:
Process ForkPoolWorker-386:
Process ForkPoolWorker-382:
Process ForkPoolWorker-387:
Process ForkPoolWorker-380:
Traceback (most recent call last):
  File "/global/software/sl-7.x86_64/modules/langs/python/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/global/software/sl-7.x86_64/modules/langs/python/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/global/software/sl-7.x86_64/modules/langs/python/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/global/software/sl-7.x86_64/modules/langs/python/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Traceback (most recent call last):
  File "/global/software/sl-7.

  File "/global/software/sl-7.x86_64/modules/langs/python/3.6/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/global/software/sl-7.x86_64/modules/langs/python/3.6/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
Traceback (most recent call last):
  File "/global/software/sl-7.x86_64/modules/langs/python/3.6/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/global/software/sl-7.x86_64/modules/langs/python/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/global/software/sl-7.x86_64/modules/langs/python/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/global/software/sl-7.x86_64/modules/langs/python/3.6/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/global/software/sl-7.x86_64/modules/langs/python/3.6/lib/python3.6/multiprocessi

  File "/global/software/sl-7.x86_64/modules/langs/python/3.6/lib/python3.6/site-packages/pandas/core/ops.py", line 887, in wrapper
    res = pd.Series(res, index=self.index, name=self.name, dtype='bool')
  File "/global/software/sl-7.x86_64/modules/langs/python/3.6/lib/python3.6/site-packages/pandas/core/ops.py", line 879, in wrapper
    res = na_op(values, other)
  File "/global/software/sl-7.x86_64/modules/langs/python/3.6/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "<ipython-input-37-44c5ad2e9a68>", line 18, in manage_file
    cdr_ngram_rates, success = get_mapping_rate(file_path, ngram_type)
  File "<ipython-input-36-ead3459d9c03>", line 33, in get_mapping_rate
    (dataFrame_dictionary['N-Gram'] == ngram_type)]
KeyboardInterrupt
KeyboardInterrupt
  File "<ipython-input-36-ead3459d9c03>", line 34, in get_mapping_rate
    if selected_dictionary['Words'].str.contains(words).any():
  File "/global/software/sl-7.x86_64/modules/langs/python/3

## [BackUp] Un-Parallelized Version

In [None]:
"""
The function runs the mapping algorithm and stores every result into the database
@param break_point: the article_id of the last article the last time we finished running the function
"""
def create_mapping_operation(break_point=None):
    
    logger = setup_logger("Mapping")
    
    logger.info("Connecting to DB")
    
    # Connect to the database "map_result.db"
    conn = sqlite3.connect(join(db_root, db_name))
    # Create Cursor object so that we can execute SQL commands
    cur = conn.cursor()
    
    logger.info("Start Iterative Mapping")

    # In case of non-closing database cursor which left db open
    try:
        # Iterate Through All Article Sets 
        # -> ["receipt-id-989431-part-014", "receipt-id-989431-part-013", ...]
        for article_file_set_name in articles_file_set_list:
            
            pool = multiprocessing.Pool(14)

            # Extract Data Set ID
            # -> "014"
            file_set_id = parse_article_set_id(article_file_set_name)
            file_set_path = join(extracted_articles_root, article_file_set_name)
            
            logger.info("Computing File Set: [{file_set_id}]".format(file_set_id=file_set_id))

            # Get file IDs from "metadata" folder
            # -> ["10.1086_210007", "10.1086_1038856", ...]
            file_ids = [parse_article_id(filename) for filename in listdir(join(file_set_path, standard_folder))]

            # Iterate through file IDs
            count, total = 1, len(file_ids)
            continue_run = False
            for file_id in file_ids:
                
                # For breakpoint continuing
                if not continue_run and break_point is not None:
                    if file_id == break_point:
                        continue_run = True
                    continue

                dataline = []
                dataline.append(file_set_id)
                dataline.append(file_id)

                # Iterate through all folders
                # -> ["ngram1", "ngram2", "ngram3"]
                cdr_rates = list([0, 0, 0])
                for ngram_type in ngram_types:
                    folder_path = join(file_set_path, ngram_type)

                    file_name = "journal-article-{file_id}-{ngram_type}.txt".format(file_id=file_id, 
                                                                                      ngram_type=ngram_type)
                    file_path = join(folder_path, file_name)

                    # culture_rate, demographic_rate, relational_rate = [0.15, 0.093, 0.125]
                    cdr_ngram_rates, success = get_mapping_rate(file_path, ngram_type) 
                    if success:
                        cdr_rates = [sum(pair) for pair in zip(cdr_rates, cdr_ngram_rates)]
                    else:
                        logger.warning("[File_Not_Found] => [FileSetID: {file_set_id}], \
                        [FileID: {file_id}], [NGramType: {ngram_type}]".format(file_set_id=file_set_id,
                                                                               file_id=file_id,
                                                                               ngram_type=ngram_type))

                    dataline.extend(cdr_ngram_rates)
                # Add general prediction probabilities
                dataline.extend(cdr_rates)

                # Get prediction
                prediction = dictionary_name_list[cdr_rates.index(max(cdr_rates))]
                dataline.append(prediction)

                # Write the dataline into the database
                insert_value = "insert into map_result values " \
                    "('{set_id}', '{file_id}', " \
                    "{n1_culture}, {n1_demographic}, {n1_relational}, " \
                    "{n2_culture}, {n2_demographic}, {n2_relational}, " \
                    "{n3_culture}, {n3_demographic}, {n3_relational}, " \
                    "{culture_rate}, {demographic_rate}, {relational_rate}, '{classification}')".format(set_id=dataline[0], file_id=dataline[1], 
                                                                                                       n1_culture=dataline[2], n1_demographic=dataline[3], n1_relational=dataline[4], 
                                                                                                       n2_culture=dataline[5], n2_demographic=dataline[6], n2_relational=dataline[7],
                                                                                                       n3_culture=dataline[8], n3_demographic=dataline[9], n3_relational=dataline[10],
                                                                                                       culture_rate=dataline[11], demographic_rate=dataline[12], relational_rate=dataline[13], classification=dataline[14])
                cur.execute(insert_value)
                conn.commit()
                
                # Update Progress
                count += 1
                report_progress(count, total)
                
    except Exception as e:
        print("Error Message: {}\n".format(e))
        cur.close()
        conn.close()
        return
        
    # Close the database and cursor
    cur.close()
    conn.close()
    
    return