In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import json
from collections import defaultdict
# import plotly.graph_objects as go
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import nltk
from nltk.corpus import wordnet 
nltk.download('stopwords')
from nltk.corpus import stopwords
# import spacy

import time
import string


# from sklearn.

# MODELS
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import permutation_importance

# EVALUATION METRICS and TOOLS
from sklearn.metrics import balanced_accuracy_score, classification_report, f1_score, precision_score, recall_score


# OTHER TOOLS
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import  HalvingGridSearchCV, RepeatedStratifiedKFold, cross_val_score, train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Utilities
from utils import main as utils, plots, preprocessing

# To save results locally
import pickle

# Needed to retrieve the top features after classification.
import spacy
# Load the small English spaCy pipeline ("en_core_web_sm") with the 'parser' and
# 'ner' components disabled.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Remove every word listed in preprocessing.POSSIBLY_NEEDED_STOPWORDS from the
# pipeline's default stop-word set.
# NOTE(review): this runs after spacy.load(); confirm that stop-word detection on
# the already-loaded `nlp` object actually reflects the modified set.
nlp.Defaults.stop_words -= set(preprocessing.POSSIBLY_NEEDED_STOPWORDS)




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alhas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Required Variables and Helper Functions

In [2]:
# Short model names (presumably logistic regression, SVM and random forest).
# calculate_metrics() uses them to build the 'predicted_<name>' column names
# it reads from a DataFrame.
MODELS = ['LR', 'SVM', 'RF']
# NOTE(review): not referenced by any cell visible here — presumably the number
# of repetitions/runs used elsewhere; confirm.
ITERATIONS = 10

In [3]:
def save_results(results, file_path):
    """
    Save the results to a file using pickle serialization.

    The object is serialized in memory first and written to disk afterwards.
    If pickling fails (for example because the object is not picklable), the
    exception is raised before the target file is opened, so an existing
    results file at ``file_path`` is left untouched instead of being truncated.
    The trade-off is that the serialized bytes are held in memory once.

    Args:
        results (object): Results to be saved.
        file_path (str): File path to save the results.

    Raises:
        Exception: Whatever ``pickle.dumps`` raises for an unpicklable object.
        OSError: If the file cannot be opened or written.
    """
    # Serialize before touching the file system: opening with 'wb' truncates.
    payload = pickle.dumps(results)
    with open(file_path, 'wb') as f:
        f.write(payload)


def read_results(file_path):
    """
    Load a previously pickled results object from disk.

    Note:
        Unpickling can execute arbitrary code. Only call this on files that
        were written by this project (e.g. via ``save_results``), never on
        files from an untrusted source.

    Args:
        file_path (str): Path of the pickle file to load.

    Returns:
        object: The deserialized object stored in the file.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)


In [4]:
def calculate_metrics(df, models=None):
    """
    Calculate various metrics based on the given DataFrame:
        - Share of rows correctly labeled by each model.
        - Distribution of feature occurrences across OCM codes.
        - Top cultures where these features occur.

    The DataFrame must provide the columns 'ocms', 'culture' and one
    'predicted_<model>' column per entry in ``models``.

    Args:
        df (DataFrame): Input DataFrame.
        models (list, optional): List of model names. Defaults to the
            notebook-level ``MODELS`` list, looked up when the function is
            called.

    Returns:
        dict: In this key order:
            - 'predicted_<model>': fraction of rows where 'ocms' equals the
              model's prediction (NaN when ``df`` has no rows),
            - one key per OCM code: that code's share of the grouped rows,
            - 'feature_occurrence_per_ocm': row count per OCM code,
            - 'top_cultures': the five most frequent cultures and their counts.
    """
    if models is None:
        # Resolve at call time so that editing MODELS in the notebook takes
        # effect without re-running this definition cell.
        models = MODELS

    n_rows = len(df)
    # Computed once; the per-code ratios below all share the same denominator.
    ocm_counts = df.groupby('ocms').size()
    n_grouped = ocm_counts.sum()

    metrics = {}
    for model in models:
        column = f'predicted_{model}'
        n_correct = int((df['ocms'] == df[column]).sum())
        # An empty frame has no defined accuracy; report NaN instead of
        # raising ZeroDivisionError.
        metrics[column] = n_correct / n_rows if n_rows else float('nan')

    # With no rows the counts are empty, so no division by zero can occur here.
    for ocm, count in ocm_counts.to_dict().items():
        metrics[ocm] = count / n_grouped

    metrics['feature_occurrence_per_ocm'] = df['ocms'].value_counts().to_dict()
    metrics['top_cultures'] = df['culture'].value_counts().head(5).to_dict()
    return metrics


In [22]:
def filter_specific_rows(features_in_text, features, strict=False, within=True, seed=27, to_sample=5):
    """
    Filter specific rows from the input DataFrame based on specified features.

    A row is tested with ``f in row['features']`` for every ``f`` in
    ``features``. That is element membership if the entries are lists/sets and
    substring matching if they are strings; the entry type is not visible here.

    Args:
        features_in_text (DataFrame): Input DataFrame with the columns
            'features', 'culture' and 'textrecord' (plus whatever
            ``calculate_metrics`` needs).
        features (list): List of features to filter on.
        strict (bool): If True every feature must satisfy the test,
            otherwise a single one suffices.
        within (bool): If True the test is "feature is present in the row",
            otherwise "feature is absent from the row".
        seed (int): Random seed for sampling rows.
        to_sample (int): Maximum number of rows to sample; capped at the number
            of filtered rows that belong to the top five cultures.

    Returns:
        dict: The metrics from ``calculate_metrics`` for the filtered rows, plus
            - 'features': the ``features`` argument,
            - 'top_cultures': the five most frequent cultures and their counts,
            - 'samples': {'textrecord': [...], 'culture': [...]}, two parallel
              lists holding the sampled rows,
            - 'df': the filtered DataFrame.
    """
    combine = all if strict else any

    def keep_row(row_features):
        # Decide for one row's features whether the row passes the filter.
        if within:
            return combine(f in row_features for f in features)
        return combine(f not in row_features for f in features)

    # astype(bool) keeps the mask a boolean indexer even for an empty input,
    # where map() would otherwise hand back an object-dtype Series.
    mask = features_in_text['features'].map(keep_row).astype(bool)
    filtered = features_in_text[mask]

    top_5_cultures = filtered['culture'].value_counts().head(5)
    filtered_top_cultures = filtered[filtered['culture'].isin(top_5_cultures.index)]

    n_samples = min(to_sample, len(filtered_top_cultures))
    sampled = filtered_top_cultures.sample(n_samples, random_state=seed)

    return {
        **calculate_metrics(filtered),
        'features': features,
        'top_cultures': top_5_cultures.to_dict(),
        # orient='list' keeps one entry per sampled row; the default
        # dict-of-dicts form is keyed by index label and would silently drop
        # rows that share a label.
        'samples': sampled[['textrecord', 'culture']].to_dict(orient='list'),
        'df': filtered,
    }


# Analysis

### 224 and 226

In [23]:
# Load the pickled results stored in 224_226.pkl (path is relative to the
# notebook's working directory).
r = read_results('output/results/224_226.pkl')

In [24]:
# Take the 'df' entry stored under index/key 1 of the loaded results.
# NOTE(review): what index 1 denotes (run, fold, feature set, ...) is not visible here — confirm.
df_t = r[1]['df']

In [25]:
# Strict filter: keep only rows whose features contain both 'his' and 'game'.
# The returned dict (metrics, samples and the filtered frame) is the cell output.
filter_specific_rows(df_t, ['his', 'game'], strict=True)

{'predicted_LR': 1.0,
 'predicted_SVM': 1.0,
 'predicted_RF': 1.0,
 '224': 0.9743589743589743,
 '226': 0.02564102564102564,
 'feature_occurrence_per_ocm': {'224': 38, '226': 1},
 'top_cultures': {'Navajo': 5, 'San': 3, 'Miskito': 2, 'Kaska': 2, 'Gond': 2},
 'features': ['his', 'game'],
 'samples': {'textrecord': ['Next the leader instructed the party how to sleep. They were not allowed to lie flat on their backs because the soles of their feet would be toward the fire. The heat from the fire was thought to weaken the hunters and also make the game watchful. Everyone slept on his side with his knees flexed, ‘like the game.’ If a man wished to turn over he assumed a standing position, stretched his legs, and then lay down. ‘If he did not do this he would get sick.’ Sleeping in the correct manner was thought to make it possible for the hunter to approach the game without frightening it. The leader closed the instructions, saying, ‘We want to rise before dawn tomorrow so we can have a litt

In [26]:
# Re-run the strict ['his', 'game'] filter with up to 10 samples and print them readably.
_df = filter_specific_rows(df_t, ['his', 'game'], strict=True, to_sample=10)
samples = _df['samples']

print("Features: ", _df['features'])
print("Feature occurrences: ", _df['feature_occurrence_per_ocm'], '\n')

# 'culture' and 'textrecord' are parallel lists, so walk them together.
for culture, sample in zip(samples['culture'], samples['textrecord']):
    print("Culture: ", culture)
    print("Sentence: ", sample)
    print()

Features:  ['his', 'game']
Feature occurrences:  {'224': 38, '226': 1} 

Culture:  Navajo
Sentence:  Next the leader instructed the party how to sleep. They were not allowed to lie flat on their backs because the soles of their feet would be toward the fire. The heat from the fire was thought to weaken the hunters and also make the game watchful. Everyone slept on his side with his knees flexed, ‘like the game.’ If a man wished to turn over he assumed a standing position, stretched his legs, and then lay down. ‘If he did not do this he would get sick.’ Sleeping in the correct manner was thought to make it possible for the hunter to approach the game without frightening it. The leader closed the instructions, saying, ‘We want to rise before dawn tomorrow so we can have a little ceremony. Try to remember whatever you dream during the night.’ [109]

Culture:  San
Sentence:  To place his arrow effectively, a hunter must hit his prey in a fleshy part where the arrow may penetrate to a depth

In [27]:
# Switch to the 'df' entry stored under index/key 3 of the loaded results.
# NOTE(review): what index 3 denotes is not visible here — confirm.
df_t = r[3]['df']

In [28]:
# Rows whose features contain 'woman'; sample up to 10 of them. The returned dict is the cell output.
filter_specific_rows(df_t, ['woman'], strict=True, to_sample=10)

{'predicted_LR': 0.8938053097345132,
 'predicted_SVM': 0.8672566371681416,
 'predicted_RF': 0.8495575221238938,
 '224': 0.48672566371681414,
 '226': 0.5132743362831859,
 'feature_occurrence_per_ocm': {'226': 58, '224': 55},
 'top_cultures': {'Copper Inuit': 11,
  'Lau Fijians': 10,
  'Yahgan': 7,
  'Ingalik': 4,
  'Samoans': 4},
 'features': ['woman'],
 'samples': {'textrecord': ["{{224}} My own observations led me to a less favourable conclusion. Ikpakhuak was reputed to be one of the best bowmen in Dolphin and Union strait, and the maximum distance he could send an arrow was about 125 yards. Even at a fixed target his marksmanship was indifferent. During the summer of 1915 the natives set up a clod of earth about a foot square for a target. They went back forty paces and tried their skill, but only about one shot in twenty hit the mark. The men seemed to be no more accurate than the children, though, their bows being stronger, their arrows flew with more velocity. Two of the women jo

In [29]:
# Same 'woman' filter as above, printed sample by sample instead of as a raw dict.
_df = filter_specific_rows(df_t, ['woman'], strict=True, to_sample=10)
samples = _df['samples']

print("Features: ", _df['features'])
print("Feature occurrences: ", _df['feature_occurrence_per_ocm'], '\n')

# 'culture' and 'textrecord' are parallel lists, so walk them together.
for culture, sample in zip(samples['culture'], samples['textrecord']):
    print("Culture: ", culture)
    print("Sentence: ", sample)
    print()

Features:  ['woman']
Feature occurrences:  {'226': 58, '224': 55} 

Culture:  Copper Inuit
Sentence:  {{224}} My own observations led me to a less favourable conclusion. Ikpakhuak was reputed to be one of the best bowmen in Dolphin and Union strait, and the maximum distance he could send an arrow was about 125 yards. Even at a fixed target his marksmanship was indifferent. During the summer of 1915 the natives set up a clod of earth about a foot square for a target. They went back forty paces and tried their skill, but only about one shot in twenty hit the mark. The men seemed to be no more accurate than the children, though, their bows being stronger, their arrows flew with more velocity. Two of the women joined in the sport, using their husbands' or their children's bows; they acquitted themselves hardly less creditably than the others. I frequently watched the men shooting at ptarmigan and water-fowl, and without exception their marksmanship was poor. It was no better even with larg

In [30]:
# Strict filter: rows whose features contain both 'his' and 'they'; sample up to 10.
filter_specific_rows(df_t, ['his', 'they'], strict=True, to_sample=10)

{'predicted_LR': 0.9212598425196851,
 'predicted_SVM': 0.9212598425196851,
 'predicted_RF': 0.889763779527559,
 '224': 0.7559055118110236,
 '226': 0.2440944881889764,
 'feature_occurrence_per_ocm': {'224': 96, '226': 31},
 'top_cultures': {'Navajo': 11,
  'Copper Inuit': 10,
  'Ojibwa': 8,
  'Mescalero Apache': 5,
  'Tiv': 4},
 'features': ['his', 'they'],
 'samples': {'textrecord': ['The party asked the leader what they should do. The leader said to put aside all restrictions for that night and the next morning. In the morning some of the hunters were sent to gather yucca, others to build a sweat house, and still others to collect every variety of grass and brush on which the deer fed. Then rocks were heated and everyone took a sweat bath, while the leader sang. As soon as anyone came out of the sweat house he washed his hair in a solution of the yucca and the plants on which the deer fed. After this the hunters fumigated themselves and their weapons in the smoke of a fire made from g

In [31]:
# Same ['his', 'they'] filter as above, printed sample by sample.
_df = filter_specific_rows(df_t, ['his', 'they'], strict=True, to_sample=10)
samples = _df['samples']

print("Features: ", _df['features'])
print("Feature occurrences: ", _df['feature_occurrence_per_ocm'])

# 'culture' and 'textrecord' are parallel lists, so walk them together.
for culture, sample in zip(samples['culture'], samples['textrecord']):
    print("Culture: ", culture)
    print("Sentence: ", sample)
    print()

Features:  ['his', 'they']
Feature occurrences:  {'224': 96, '226': 31}
Culture:  Navajo
Sentence:  The party asked the leader what they should do. The leader said to put aside all restrictions for that night and the next morning. In the morning some of the hunters were sent to gather yucca, others to build a sweat house, and still others to collect every variety of grass and brush on which the deer fed. Then rocks were heated and everyone took a sweat bath, while the leader sang. As soon as anyone came out of the sweat house he washed his hair in a solution of the yucca and the plants on which the deer fed. After this the hunters fumigated themselves and their weapons in the smoke of a fire made from goldenrod and other bushes.

Culture:  Ojibwa
Sentence:  {{224}} buffalo, was not only exciting but dangerous. It often happened that the hunter found himself surrounded by the flying herd, and in a cloud of dust, so that neither man nor horse could see the ground before them. Under such 

In [32]:
# Switch to the 'df' entry stored under index/key 4 of the loaded results.
# NOTE(review): what index 4 denotes is not visible here — confirm.
df_t = r[4]['df']

In [33]:
# Strict filter: rows whose features contain both 'woman' and 'man'; sample up to 20.
filter_specific_rows(df_t, ['woman', 'man'], strict=True, to_sample=20)

{'predicted_LR': 0.8666666666666667,
 'predicted_SVM': 0.7666666666666667,
 'predicted_RF': 0.8333333333333334,
 '224': 0.5666666666666667,
 '226': 0.43333333333333335,
 'feature_occurrence_per_ocm': {'224': 34, '226': 26},
 'top_cultures': {'Copper Inuit': 6,
  'Lau Fijians': 4,
  'Mundurucu': 3,
  'Northern Paiute': 3,
  'Ingalik': 2},
 'features': ['woman', 'man'],
 'samples': {'textrecord': ["~~Manufacture and use~~ . This snare (tether snare) differs from the ptarmigan and grouse snare (which see) only in the fact that the noose line is made of sinew rather than of fish skin, and in the fact that it is set over the animal's hole rather than in a fence. It is most commonly used in the mountains in the spring and fall. It may be set, used, and owned by men or women, but more often the former, as women seldom go into the mountains. One man said that a feather should be tied to the snare for luck.",
   'The downstream scene is always pandemonium. Everybody is usually in the water tryi

In [34]:
# Same ['woman', 'man'] filter as above, printed sample by sample.
_df = filter_specific_rows(df_t, ['woman', 'man'], strict=True, to_sample=20)
samples = _df['samples']

print("Features: ", _df['features'])
print("Feature occurrences: ", _df['feature_occurrence_per_ocm'])

# 'culture' and 'textrecord' are parallel lists, so walk them together.
for culture, sample in zip(samples['culture'], samples['textrecord']):
    print("Culture: ", culture)
    print("Sentence: ", sample)
    print()

Features:  ['woman', 'man']
Feature occurrences:  {'224': 34, '226': 26}
Culture:  Ingalik
Sentence:  ~~Manufacture and use~~ . This snare (tether snare) differs from the ptarmigan and grouse snare (which see) only in the fact that the noose line is made of sinew rather than of fish skin, and in the fact that it is set over the animal's hole rather than in a fence. It is most commonly used in the mountains in the spring and fall. It may be set, used, and owned by men or women, but more often the former, as women seldom go into the mountains. One man said that a feather should be tied to the snare for luck.

Culture:  Mundurucu
Sentence:  The downstream scene is always pandemonium. Everybody is usually in the water trying to catch the fleeing fish, which are already semistupified by the drug. The women use handnets to scoop up the fish while the men impale them with fishing arrows or club them. At large

Culture:  Copper Inuit
Sentence:  women and children in camp the following morning 

### 224, 226 and rest of 220

In [35]:
# Load the pickled results stored in 224_220.pkl (path is relative to the
# notebook's working directory). This rebinds `r` from the previous section.
r = read_results('output/results/224_220.pkl')

In [36]:
# Take the 'df' entry stored under index/key 8 of the loaded results.
# NOTE(review): what index 8 denotes is not visible here — confirm.
df_t = r[8]['df']

In [37]:
# Rows whose features contain 'country'; sample up to 10. The returned dict is the cell output.
filter_specific_rows(df_t, ['country', ], strict=True, to_sample=10)

{'predicted_LR': 0.8536585365853658,
 'predicted_SVM': 0.8414634146341463,
 'predicted_RF': 0.8536585365853658,
 '220': 0.32926829268292684,
 '224': 0.6707317073170732,
 'feature_occurrence_per_ocm': {'224': 55, '220': 27},
 'top_cultures': {'Tarahumara': 7,
  'Ojibwa': 7,
  'Copper Inuit': 6,
  'Assiniboine': 4,
  'Pawnee': 3},
 'features': ['country'],
 'samples': {'textrecord': ['{{221}} “The Eskimo had all summer been making sledges, wooden snow-shovels, bows and spear handles, and other articles of wood. All these things and a good supply of caribou meat were stored at a spot which we called the ‘sled-making place,’ but which the Slaveys of Bear Lake, who know the country well and visit it in winter, call ‘Big Stick Island.’ This is a clump of large spruce trees on the southeast branch of the Dease River. The Eskimo were now waiting for the first snow of the year so they could hitch their dogs to the sleds they had made, load their provisions upon them, and move north toward the c

In [38]:
# 'country' filter again, this time with up to 20 samples, printed one by one.
_df = filter_specific_rows(df_t, ['country'], strict=True, to_sample=20)
samples = _df['samples']

print("Features: ", _df['features'])
print("Feature occurrences: ", _df['feature_occurrence_per_ocm'])

# 'culture' and 'textrecord' are parallel lists, so walk them together.
for culture, sample in zip(samples['culture'], samples['textrecord']):
    print("Culture: ", culture)
    print("Sentence: ", sample)
    print()

Features:  ['country']
Feature occurrences:  {'224': 55, '220': 27}
Culture:  Copper Inuit
Sentence:  {{221}} “The Eskimo had all summer been making sledges, wooden snow-shovels, bows and spear handles, and other articles of wood. All these things and a good supply of caribou meat were stored at a spot which we called the ‘sled-making place,’ but which the Slaveys of Bear Lake, who know the country well and visit it in winter, call ‘Big Stick Island.’ This is a clump of large spruce trees on the southeast branch of the Dease River. The Eskimo were now waiting for the first snow of the year so they could hitch their dogs to the sleds they had made, load their provisions upon them, and move north toward the coast where they expected to spend the winter sealing. But starvation began to threaten, so that finally, on September 25, the last party started toward the coast, carrying their sleds on their backs, for the first snow had not yet fallen.”

Culture:  Pawnee
Sentence:  {{224}} The sco

### 221–226 (excluding 225)

In [39]:
# Load the pickled results stored in 221..226_balanced.pkl (path is relative to
# the notebook's working directory). This rebinds `r` from the previous section.
r = read_results('output/results/221..226_balanced.pkl')

In [40]:
# Take the 'df' entry stored under index/key 2 of the loaded results.
# NOTE(review): what index 2 denotes is not visible here — confirm.
df_t = r[2]['df']

In [41]:
# Rows whose features contain 'camp'; sample up to 10. The returned dict is the cell output.
filter_specific_rows(df_t, ['camp',], strict=True, to_sample=10)

{'predicted_LR': 0.7686274509803922,
 'predicted_SVM': 0.7686274509803922,
 'predicted_RF': 0.6941176470588235,
 '221': 0.4588235294117647,
 '222': 0.21568627450980393,
 '223': 0.050980392156862744,
 '224': 0.24313725490196078,
 '226': 0.03137254901960784,
 'feature_occurrence_per_ocm': {'221': 117,
  '224': 62,
  '222': 55,
  '223': 13,
  '226': 8},
 'top_cultures': {'Navajo': 26,
  'San': 22,
  'Ojibwa': 16,
  'Hadza': 15,
  'Copper Inuit': 13},
 'features': ['camp'],
 'samples': {'textrecord': ["feet,” joining and departing camps as a way to settle disputes peacefully, to visit relatives and friends, and to seek better foraging opportunities. Woodburn (1979) emphasizes the minimal role of politics, a pervasive emphasis on actions that give immediate returns rather than delayed benefits, and the generally egalitarian nature of Hadza society (Woodburn 1982). Even amidst the rapid economic development occurring in Tanzania, Woodburn's descriptions still fit the Hadza we worked with ver

In [42]:
# Same 'camp' filter as above, printed sample by sample.
_df = filter_specific_rows(df_t, ['camp'], strict=True, to_sample=10)
samples = _df['samples']

print("Features: ", _df['features'])
print("Feature occurrences: ", _df['feature_occurrence_per_ocm'])

# 'culture' and 'textrecord' are parallel lists, so walk them together.
for culture, sample in zip(samples['culture'], samples['textrecord']):
    print("Culture: ", culture)
    print("Sentence: ", sample)
    print()

Features:  ['camp']
Feature occurrences:  {'221': 117, '224': 62, '222': 55, '223': 13, '226': 8}
Culture:  Hadza
Sentence:  feet,” joining and departing camps as a way to settle disputes peacefully, to visit relatives and friends, and to seek better foraging opportunities. Woodburn (1979) emphasizes the minimal role of politics, a pervasive emphasis on actions that give immediate returns rather than delayed benefits, and the generally egalitarian nature of Hadza society (Woodburn 1982). Even amidst the rapid economic development occurring in Tanzania, Woodburn's descriptions still fit the Hadza we worked with very well.

Culture:  Ojibwa
Sentence:  By “home base,” we mean a habitation site or area (Zone A, Fig. 6), up to a mile or so in extent (or possibly more), within which camps were occupied each year for periods of time that in total generally exceeded the time spent at any other single site; where occasionally cabins were built (but more typically moss-covered conical lodges) an

In [43]:
# Show one sampled text record in full: the third from the end of the 'camp' samples.
filter_specific_rows(df_t, ['camp',], strict=True, to_sample=10)['samples']['textrecord'][-3]

'Men do not rely on the women to supply them with all the vegetable food that they need. They wander off into the bush individually for a while almost every day to satisfy their hunger. They gather vegetable food only for their own needs and normally bring none back to camp.'