In [1]:
import pandas as pd
import numpy as np
from langdetect import detect

import json
import matplotlib.pyplot as plt
import plotly.graph_objects as go

import seaborn as sns
import re
import math
from collections import defaultdict, Counter

import utils
# from graph import create_graph

In [2]:

# Load the labels-per-category mapping from its JSON file.
with open("../data/labels_per_category.json", "r") as f:
    # Parse the JSON contents of the file into a Python object.
    ocms = json.load(f)

# Params

In [3]:
# NOTE(review): no active code in the visible cells reads this flag — confirm it is still needed.
CHOSEN_CATEGORIES = True
# Inclusive upper bound on the OCM ids kept by split_data (`ocm <= LABELS_NEEDED_BEFORE`);
# also passed as `limit` to utils.get_ocm_counts.
LABELS_NEEDED_BEFORE = 626
# When True, split_data maps each OCM code to its parent category (get_parent_category_i)
# and utils.get_ocm_counts receives id_to_category instead of None.
ONLY_PARENT_CATEGORY = False
# NOTE(review): not referenced in the visible cells — presumably used further down; confirm.
N_TO_SELECT = 10

# Split based on OCMs

In [4]:
df_all = pd.read_csv('../data/en_data.csv', encoding='utf-8')

In [6]:
df = df_all[['sreid', 'srenext', 'textrecord', 'ocms', 'culture', 'place', 'pub.date', 'field.date', 'pub.lang']]

In [7]:
df.head(2)

Unnamed: 0,sreid,srenext,textrecord,ocms,culture,place,pub.date,field.date,pub.lang
0,aa01-000-000178,aa01-000-000186,"Before the 1900s, Koreans lived as subsistence...",241 433,Korea,Korea,,no date,English
1,aa01-001-000341,aa01-001-000357,{{245}} “Korea is richly endowed with fruits o...,245,Korea,Korea,,1890-1905,English


# TODO
- If there are no OCMs in the text, take the ones from the `ocms` column and apply them to the whole text instead.
- ~~detect EN only one time (use the saved one for now)~~
- plot other interesting columns. If none, remove the rest.

# Preprocessing

- OCM codes appear in `textrecord` as placeholders within `{{ }}`, e.g. `{{226}}`.
- Multiple codes may occur within one placeholder, e.g. `{{226 593}}`.
- If no OCMs are present in `textrecord`, get them from the `ocms` column.
- Anything within [ ] should be excluded, since these are pictures or links.


In [11]:
# CHECK OUT: [310, 340, 400, 520, 570, 580, 870] activities, building structures, machines, recreation, interpersonal relations, marriage, education.
# Hand-picked OCM ids; all are multiples of 10, so presumably parent-category ids — confirm.
chosen_categories = [140, 150, 160, 170, 180, 190, 200, 220, 240, 260, 360, 420, 430, 590, 620]#, 780, 820]

In [12]:
with open("../data/id_to_label.json", "r") as f:
    # Parse the JSON contents of the file into a dict.
    id_to_label = json.load(f)

# JSON object keys are always strings, so convert the ids back to int.
id_to_label = {int(i): l for i, l in id_to_label.items()}
# Reverse lookup: label -> id. If two ids share a label, the later one wins.
label_to_id = {v: k for k, v in id_to_label.items()}

In [13]:
id_to_category = utils.make_id_category(ocms, label_to_id)

In [14]:
ocms_counts = utils.get_ocm_counts(df, id_to_category if ONLY_PARENT_CATEGORY else None, 
                                   limit=LABELS_NEEDED_BEFORE)

In [15]:
valid_ocms = list(id_to_label.keys())

In [16]:
def top_n_count(ocm_id, n=400):
    """Report whether an OCM id is among the ``n`` most frequent ids.

    Reads the module-level ``ocms_counts`` (must provide ``most_common`` and
    item lookup, i.e. behave like a ``collections.Counter``).

    Parameters
    ----------
    ocm_id : hashable
        The OCM id to look up, in the same form as the keys of ``ocms_counts``.
    n : int, default 400
        How many of the most frequent ids to consider.

    Returns
    -------
    tuple
        ``(in_top_n, rank, count)`` where ``in_top_n`` is a bool, ``rank`` is the
        0-based position of ``ocm_id`` in the top-``n`` ranking (``None`` when the
        id is not in the top ``n``), and ``count`` is ``ocms_counts[ocm_id]``.
    """
    # Compute the ranking once; the original built it twice.
    ranked_ids = [label for label, _ in ocms_counts.most_common(n)]
    count = ocms_counts[ocm_id]

    # Previously `.index()` raised ValueError for ids outside the top n, so the
    # "not in top n" answer could never actually be returned.
    if ocm_id not in ranked_ids:
        return False, None, count
    return True, ranked_ids.index(ocm_id), count

In [19]:
# for ocm_id in chosen_categories:
#     print(f'"{id_to_label[ocm_id].capitalize()}" id({ocm_id}) count{top_n_count(ocm_id)}\n')

In [129]:
# df.iloc[212] # VERY WEIRD!!

In [12]:
df.iloc[31144]

textrecord    in the forest during the summer. During this s...
ocms                                221 233 241 243 244 246 262
culture                                            Montenegrins
place                                     Serbia and Montenegro
pub.date                                                   1983
field.date                                            1964-1966
pub.lang                                                English
Name: 31144, dtype: object

In [153]:
df.iloc[31145]

sreid                                         aw42-001-001104-0
section                                         aw42-001-000481
sectpar                                         aw42-001-000458
sectgpar                                                    NaN
title         The Santal: a tribe in search of a great tradi...
hdoc                                                   aw42-001
culture                                                  Santal
pub.date                                                   1965
field.date                                            1957-1958
byline                                          by Martin Orans
sreprev                                         aw42-001-001104
srenext                                         aw42-001-001120
parent                                          aw42-001-001104
division                                        aw42-001-000447
coverage                                          not specified
place                                   

In [13]:
# df[df['sreid'] == 'aa01-007-002174'].textrecord

KeyError: 'sreid'

In [21]:
# df[df['sreid'] == 'aa01-007-002174-0'].textrecord

379    [%  [caption:  TABLE 29. AVERAGE ACREAGE, PROD...
Name: textrecord, dtype: object

In [14]:
df.iloc[31146]

textrecord    {{231}}  {{233}}  {{221}}  {{423}} Mainly shee...
ocms                                            221 231 233 423
culture                                            Montenegrins
place                                     Serbia and Montenegro
pub.date                                                   1983
field.date                                            1964-1966
pub.lang                                                English
Name: 31146, dtype: object

In [11]:
id_to_label[626]

'social control'

In [103]:
test = df.iloc[135].textrecord  # No ocms in textrecord.
test[:4327]

'[%  [caption:   {{102}}  {{241}}  {{438}} THE LIMIT LINES OF IMPORTANT CROPS. (n. indicates northern and s. southern limit of the Crop named) [/caption]  [graphic: b7006006]  [/graphic] %]'

In [54]:
df.iloc[170].textrecord

'{{228}}  {{433}} waters are used for other than food purposes, while 91 per cent of the catch of ~~iwashi~~ in 1937 was processed further, chiefly into oil and ~~iwashi~~ cakes. ~~Iwashi~~ oil is used in the production of hard oils, glycerine, fatty acids, gunpowder, medicines, soap, candles, and margarine, while the cake is used for fertilizer. [^^   ~~Chosen Keizai Nempo,~~ 1939, p. 183 ~~et seq.~~  ^^]'

In [20]:
def get_parent_category_i(cat):
    """Round an OCM code down to the nearest multiple of ten.

    ``cat`` may be anything ``int()`` accepts (e.g. ``226`` or ``"226"``);
    the result is an int such as ``220``.
    """
    code = int(cat)
    tens = math.floor(code / 10)
    return 10 * tens

In [58]:
# One {{...}} placeholder holding one or more whitespace-separated OCM codes.
# The inner group is non-capturing so that re.split emits exactly ONE captured
# item per placeholder (a nested capturing group would emit the last code twice).
_OCM_MARKER_RE = re.compile(r'\{\{(\d+(?:\s+\d+)*)\}\}')


def _strip_markup(text):
    """Remove layout markup from a raw text record, keeping prose and {{...}} markers."""
    text = re.sub(r"\[%.*?%\]", "", text)  # drop whole [% ... %] blocks
    # Greedy on purpose: [// ... //] blocks can be nested, so remove from the
    # first opener to the last closer. They hold weird, unstructured info.
    text = re.sub(r"\[//.*//\]", "", text)
    # Only the [^^ and ^^] delimiters are removed; the text between them is kept.
    text = re.sub(r"\[\^\^|\^\^\]", "", text)
    text = re.sub(r"\[(c|\/c|r|\/r)\]", "", text)  # [c], [/c], [r], [/r] tags
    text = re.sub(r"\[table .*?\]", "", text)  # opening [table ...] tag
    text = re.sub(r"\[\/table]", "", text)  # closing [/table] tag
    # Only the literal "~~United~~" is removed; other ~~...~~ spans are kept.
    text = re.sub(r"\~~United\~~", "", text)
    return text.strip()


def _segment_by_labels(text):
    """Split cleaned text at {{...}} markers into (segments, labels_per_segment).

    Each non-empty text segment receives the codes of all markers that appeared
    since the previous segment, de-duplicated in order of first appearance.
    Codes that are not followed by any text are dropped.
    """
    segments, labels_per_segment = [], []
    pending, seen = [], set()
    # With a single capturing group, re.split alternates: even indices are text,
    # odd indices are the captured code strings. Relying on position (instead of
    # "does this piece look numeric?") keeps digit-only prose such as a year as text.
    for position, piece in enumerate(_OCM_MARKER_RE.split(text)):
        if position % 2:
            for code in piece.split():
                label = get_parent_category_i(code) if ONLY_PARENT_CATEGORY else int(code)
                if label not in seen:
                    seen.add(label)
                    pending.append(label)
            continue
        piece = piece.strip()
        if piece:
            segments.append(piece)
            labels_per_segment.append(pending)
            pending, seen = [], set()
    return segments, labels_per_segment


def split_data(row):
    """Explode one record into one output row per (text segment, OCM label) pair.

    Parameters
    ----------
    row : mapping
        Must provide ``'textrecord'`` (raw text with ``{{...}}`` OCM markers) and
        ``'ocms'`` (whitespace-separated codes for the whole record). Works with a
        pandas row passed by ``df.apply(..., axis=1)`` or a plain dict.

    Returns
    -------
    list of dict
        Each dict is a copy of ``row`` with ``'textrecord'`` replaced by the
        segment, ``'ocms'`` replaced by a single int label, and ``'count'``,
        ``'label_name'``, ``'parent_ocms'`` and ``'parent_label_name'`` added.
        Labels above ``LABELS_NEEDED_BEFORE`` or absent from ``valid_ocms`` are
        skipped. A non-string ``textrecord`` (e.g. NaN) yields an empty list.

    Notes
    -----
    Reads the module-level ``ocms_counts``, ``id_to_label``, ``valid_ocms``,
    ``LABELS_NEEDED_BEFORE`` and ``ONLY_PARENT_CATEGORY``. Malformed markers
    such as ``{{013}`` are not recognised and stay in the text.
    """
    raw_text = row['textrecord']
    if not isinstance(raw_text, str):
        return []

    segments, labels_per_segment = _segment_by_labels(_strip_markup(raw_text))

    # Fallback: if the first segment carries no in-text labels, give the whole
    # (re-joined) text the record-level codes from the 'ocms' column.
    if segments and not labels_per_segment[0]:
        record_codes = row['ocms']
        # A missing 'ocms' value is not a string (NaN); treat it as "no labels".
        # NOTE(review): these codes are not mapped to parent categories even when
        # ONLY_PARENT_CATEGORY is set — confirm whether that is intended.
        record_labels = [int(code) for code in record_codes.split()] if isinstance(record_codes, str) else []
        labels_per_segment = [record_labels]
        if len(segments) != 1:
            segments = [' '.join(segments)]

    rows = []
    for segment, labels in zip(segments, labels_per_segment):
        for ocm in labels:
            # Keep only in-range ids that are valid OCM codes.
            if ocm > LABELS_NEEDED_BEFORE or ocm not in valid_ocms:
                continue
            parent = get_parent_category_i(ocm)
            rows.append({**row,
                         'textrecord': segment,
                         'ocms': ocm, 'count': ocms_counts[ocm], 'label_name': id_to_label[ocm],
                         'parent_ocms': parent, 'parent_label_name': id_to_label[parent]})
    return rows


In [22]:
# 29: [//  [//  {{233}}  {{839}}  {{226}}  {{857}} Fast asleep in my grass roof, //]  [// The birds sing me awake. ... //]  [// Behind those plum flowers the raindrops shine, //]  [// The sun begins to sink. //]  [// Gar[unknown] on, bring fishing rods quick! //]  [// It gets late to fish.” //]  [//  //]  //]
# 170: .... [^^   ~~Chosen Keizai Nempo,~~ 1939, p. 183 ~~et seq.~~  ^^]
# 130: [r]  [c]   [/c]  [c]   [/c]  [c]   [/c]  [c]   ~~United~~  [/c]  [c]   [/c]  [c]   [/c]  [c]   ~~United~~  [/c]  [/r]
# 796:  ... {{013} ...


In [123]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


In [24]:
# Flatten the per-record lists returned by split_data into one flat list of row dicts.
all_data = []
for item in df.apply(split_data, axis=1):
    # extend() is a no-op for empty lists, so no emptiness check is needed.
    all_data.extend(item)

In [175]:
all_data

[]

In [25]:
splitted_df = pd.DataFrame(all_data)

In [26]:
print(f'Total of {splitted_df.shape[0] - df.shape[0]} was added due to splitting by a single label')

Total of 455660 was added due to splitting by a single label


In [27]:
splitted_df.to_csv('../data/splitted_data.csv', index=False)

In [202]:
splitted_df[splitted_df.sreid == 'ac07-001-010968']

Unnamed: 0,sreid,srenext,textrecord,ocms,culture,place,pub.date,field.date,pub.lang,count,label_name,parent_ocms,parent_label_name
12594,ac07-001-010968,ac07-001-010978,"In an agricultural community, the first son re...",226,Okinawans,"villages of Hanashiro, Minatogawa, and Matsuda...",1953,1951-1952,English,21697,fishing,220,food quest
12595,ac07-001-010968,ac07-001-010978,"In an agricultural community, the first son re...",593,Okinawans,"villages of Hanashiro, Minatogawa, and Matsuda...",1953,1951-1952,English,2558,family relationships,590,family
12596,ac07-001-010968,ac07-001-010978,have two members of the same immediate family ...,183,Okinawans,"villages of Hanashiro, Minatogawa, and Matsuda...",1953,1951-1952,English,576,norms,180,total culture


In [200]:
df.iloc[3257].textrecord

'{{226 593}} In an agricultural community, the first son remains in the home working the fields of his father. In fishing, through long custom, there is an avoidance of teamwork among close relatives. Members of the immediate family will not go out on the same fishing boat. A father will not take his son, an elder brother will not take a younger. Partners and working teams on boats are more distant relatives or friends. If a father and a son are on the same boat and it is lost, the present and the succeeding head both perish. To this incontrovertible possibility, there has been added the belief that if two persons of the same immediate family go to sea together on the same boat, there will be a disaster. And actual and near disasters are frequent enough to make both points of view understandable. The sons of fishermen, who are learning the ways of the fish and of the sea, go on the boats of others. Many younger fishermen regard the belief that it is inviting disaster to {{183}} have tw

In [166]:
# Basic statistics of the numerical columns
# print(df.describe())

# # Basic statistics of the non-numerical columns
# print('\nnon-numerical columns: \n', df.describe(include=['O']))

# # Count of unique values in the non-numerical columns
# print(df.nunique())

In [25]:

# # Convert the pub.date column to numeric
# df['pub.date'] = pd.to_numeric(df['pub.date'], errors='coerce')

# # Convert the field.date column to numeric
# df['field.date'] = pd.to_numeric(df['field.date'], errors='coerce')
