# batch runner of noun classification task
develop by Kow Kuroda (kow.kuroda@gmail.com)
created on 2024/12/15
modified on 2025/01/14

requirements
- Python 3.11 or later is require to run NN analysis

In [39]:
#!pip install -U scikit-learn
#!conda update conda -y

In [40]:
## general
from typing import TextIO # for type annotation of file object
## global
import utils

In [41]:
## functions for create_df
def process_ske_lines (lines, form_dict_raw, form_counter, l_splitter = ",", f_splitter = ",", tag_cleaner = r'[*"]', word_reg = r"\w+", check: bool = False):
    "process Sketch Engine sample data to get token/pos pairs"
    import re
    counter = 0
    for i, line in enumerate (lines):
        if check:
            print(f"line {i}")
        fields = line.split(l_splitter)
        if check:
            print(f"len(fields): {len(fields)}")
        if len (fields) < 6:
            continue
        ## main
        for field in fields:
            blocks = field.split (f_splitter)
            for block in blocks:
                if check:
                    print (f"block: {block}")
                try:
                    tokens = block.split ()
                    for token in tokens:
                        form, tag = token.split("/")
                        form      = form.strip()
                        ## process POS tag
                        tag       = tag.strip()
                        tag       = re.sub (tag_cleaner, '', tag)
                        if re.match (word_reg, form):
                            form_counter [form] += 1
                            ##
                            counter += 1
                            if check:
                                print (f"form {counter} <{form}> with tag <{tag}> registered")
                            if tag not in form_dict_raw [form]: 
                                form_dict_raw [form].append (tag)
                except ValueError:
                    pass

##
def parse_pos (tag: str, lang: str, i: int, check: bool = False):
    """analyze POS tags"""
    import re
    ## Czech
    if lang in [ 'Czech' ]:
        L = []; y = []
        for i, x in enumerate ([ x for x in re.split(r"", tag) if len(x) > 0 ]):
            if i % 2 == 1:
                y.append(x)
                L.append ("".join(y))
                y = []
            else:
                y.append(x)
    ## Irish, French
    elif lang in [ 'Irish', 'French' ]:
        L = [ x for x in re.split(r"", tag) if len(x) > 0 ]
            
    ## German
    elif lang in [ 'German' ]:
        L = [ x for x in tag.split(".") if len(x) > 0 ]
    
    ## Other
    else:
        L = [ x for x in tag.split(",") if len(x) > 0 ]
    ##
    if check:
        print(f"L: {L}")
    return utils.simplify (L, nested = False)


In [42]:
## POS mapping
#
Czech_pos_renamer = { 'k1' : "Noun", 'k2' : 'Adj', 'k3' : 'Pron', 'k4' : 'Number', 'k5' : 'Verb',  'k6' : 'Adv', 'k7' : 'Prep', 'k8' : 'Conj', 'k9' : 'Part', 'k0' : 'Inter', 'kA' : 'Abbrev', 'kI' : 'Punct',
					 'gF' : 'Fem', 'gM': 'Masc0', 'gI': 'Masc1', 'gN': 'Neut',
					 'nS' : 'Sg', 'nP' : 'Pl',
					 'c1': "Nom", 'c2': 'Gen', 'c3': 'Dat', 'c4': 'Acc', 'c5': 'Voc', 'c6': 'Loc', 'c7' : 'Instr' }
#
German_pos_renamer = { 'N': "Noun", 'PRO': 'Pron', 'V': 'Verb', 'ADJA': 'Adj', 'R': 'Adv', 'CONJ': 'Conj' }
#
Irish_pos_renamer = { 'N': "Noun", 'V': 'Verb', 'A': 'Adj', 'R': 'Adv', 'P': 'Pron', 'f': 'Fem', 'm': "Masc", 's': 'Sg', 'p': 'Pl', 'v': 'Voc', 'g': "Gen", 'd': 'Dat', 'c': 'Nom', '-': '-', 'e': 'Emp' }

def pos_mapper (pos_map: dict, x: str):
	try:
		return pos_map [x]
	except KeyError:
		return x

## French
def French_pos_analyzer (L: list, null: str = 'x'):
	T = []
	sig = L[0]
	if sig == 'N':
		for i, x in enumerate (L):
			if   i == 0:
				T.append ('Noun')
			## process substantive encoding
			elif i == 1:
				if   x == 'C':
					T.append ('common')
				elif x == 'P':
					T.append ('proper')
				else:
					pass
			elif i == 2:
				if   x == 'F':
					T.append ('Fem')
				elif x == 'M':
					T.append ('Masc')
				elif x == 'C':
					T.append ('Comm')
				elif x == 'N':
					T.append ('Neu')
				else:
					pass
			elif i == 3:
				if   x == 'S':
					T.append ('Sg')
				elif x == 'P':
					T.append ('Pl')
				elif x == 'N':
					T.append ('Inv')
				else:
					pass
			elif i == 4:
				pass
			elif i == 5:
				pass
			elif i == 6:
				if   x == 'A':
					T.append ('Aug')
				elif x == 'D':
					T.append ('Dim')
				else:
					pass
			else:
				pass

	elif sig == 'A':
		for i, x in enumerate (L):
			if i == 0:
				T.append ('Adj')
			else:
				pass
	elif sig == 'Adp':
		for i, x in enumerate (L):
			if i == 0:
				T.append ('Adj')
			else:
				pass
	elif sig == 'C':
		for i, x in enumerate (L):
			if i == 0:
				T.append ('Conj')
			else:
				pass
	elif sig == 'D':
		for i, x in enumerate (L):
			if i == 0:
				T.append ('Det')
			else:
				pass
	elif sig == 'I':
		for i, x in enumerate (L):
			if i == 0:
				T.append ('Int')
			else:
				pass
	elif sig == 'P':
		for i, x in enumerate (L):
			if i == 0:
				T.append ('Prep')
			else:
				pass
	elif sig == 'S':
		for i, x in enumerate (L):
			if i == 0:
				T.append ('Adp')
			else:
				pass
	elif sig == 'R':
		for i, x in enumerate (L):
			if i == 0:
				T.append ('Adv')
			else:
				pass
	elif sig == 'V':
		for i, x in enumerate (L):
			if i == 0:
				T.append ('Verb')
			else:
				pass
	elif sig == 'Z':
		for i, x in enumerate (L):
			if i == 0:
				T.append ('Num')
			else:
				pass
	else:
		pass

	##
	return T

##
def encode_attributes (D: list, A: list, check: bool = False) -> dict:
    import collections
    M = collections.defaultdict(bool)
    for a in A:
        if a in D:
            M[a] = 1
        else:
            M[a] = 0
    ##
    return M	

In [43]:
def create_df (target_lang, max_doc_length, min_doc_length, sample_n, uncapitalize: bool = True, check: bool = True):
    
    ## process files
    import glob
    import pprint
    data_dir = target_lang
    data_files = glob.glob(f"data/inflected/{data_dir}/*")
    data_files = sorted([ file for file in data_files if ".csv" in file ])
    pprint.pprint(data_files)
    
    ## process words in data
    import io, re
    import collections
    form_counter  = collections.defaultdict(int)
    form_dict_raw = collections.defaultdict(list)
    check = True
    for file in data_files:
        print(f"opening {file}")
        with io.open(file, encoding = 'utf-8_sig') as word:
            lines = word.readlines()
            process_ske_lines (lines, form_dict_raw = form_dict_raw, form_counter = form_counter)
    #form_dict_raw

    ## regularize Irish dict
    if target_lang == 'Irish':
        D = {}
        for k, vs in form_dict_raw.items():
            V = []
            for v in vs:
                W = []
                for x in v.split("|"):
                    W.append(x)
            V.extend(W)
            D[k] = V
        ##
        if check:
            for k, v in D.items():
                print(f"{k} : {v}")
        ##
        form_dict_raw = D
    
    ## filter out offensive words
    remove_pat = r'[.-]'
    form_dict_raw = { k : v for k, v in form_dict_raw.items() if not re.match(remove_pat, k) }
    form_dict_raw

    ## segement POS tags and define form_dict
    import collections
    form_dict = collections.defaultdict(list)
    check = False
    for form, tags in form_dict_raw.items():
        A = []
        for i, tag in enumerate(tags):
            if check:
                print(f"tag: {tag}")
            a = parse_pos (tag, target_lang, i)
            if a not in A:
                A.append (a)
        #
        form_dict[form] = simplify (A, nested = False)
    ##
    #form_dict

    ## convert POS tag
    if target_lang in [ 'Czech', 'Irish', 'French' ]:
        form_dict_new = {}
        for word, tags in form_dict.items():
            if check:
                print(f"word: {word}; tags: {tags}")
            T = []
            if target_lang in ['French']:
                for tag in tags:
                    if check:
                        print(f"tag: {tag}")
                    X = French_pos_analyzer (tag)
                    if check:
                        print (f"X: {X}")
                    ##
                    T.append (X)
            else:
                for tag in tags:
                    if check:
                        print(f"tag: {tag}")
                    X = []
                    for seg in tag:
                        if target_lang == 'Czech':
                            #pos_new = rename_Czech_pos (seg)
                            pos_new = pos_mapper (Czech_pos_renamer, seg)
                            
                        elif target_lang == 'German':
                            #pos_new = rename_German_pos (seg)
                            pos_new = pos_mapper (German_pos_renamer, seg)
                        
                        elif target_lang == 'Irish':
                            #pos_new = rename_Irish_pos (seg)
                            pos_new = pos_mapper (Irish_pos_renamer, seg)
                        #
                        X.append (pos_new)
                    ## Czech
                    if target_lang == 'Czech':
                        X = [ t for t in X if t in Czech_pos_renamer.values() ]
                    ## German
                    elif target_lang == 'German':
                        X = [ t for t in X if t in German_pos_renamer.values() ]
                    ## Irish
                    elif target_lang == 'Irish':
                        X = [ t for t in X if t in Irish_pos_renamer.values() ]
                    if check:
                        print (f"tag*: {X}")
                    ##
                    T.append(X)
            ##
            form_dict_new[word] = T
        ##
        form_dict = form_dict_new
    ##
    #form_dict
    
    ## define inflect_dict
    inflect_dict = {}
    if target_lang in [ 'German' ]:
        pos_list = [ 'N', 'A', 'V', 'Pro' ]
    elif target_lang in [ 'Irish' ]:
        pos_list = [ 'N', 'A', 'V', 'Pro' ]
        pos_list_x = pos_list + [ 'R', 'D', 'Q', 'S', 'T', 'C', 'M' ]
    else:
        pos_list = [ 'N', 'A', 'V', 'Pro' ]
    print(f"pos_list: {pos_list}")
    ##
    for pos in pos_list:
        print(f"processing: {pos}")
        X_form_dict = {}
        pos_pat = f'r"{pos}.*"'
        #X_form_dict = { k: v for k, v in form_dict.items() if isinstance(x, list) and re.match(eval(pos_pat), v[0][0]) }
        for k, v in form_dict.items():
            try:
                m = re.match(eval(pos_pat), v[0][0])
                X_form_dict[k] = v
            except IndexError:
                pass   
        inflect_dict[pos] = X_form_dict

    ## pos-wise dicts
    N_inflect_dict_all   = inflect_dict['N']
    A_inflect_dict_all   = inflect_dict['A']
    V_inflect_dict_all   = inflect_dict['V']
    Pro_inflect_dict_all = inflect_dict['Pro']
    #X_inflect_dict_all   = inflect_dict['X'] 

    ## create N_attribute_dict
    import collections
    N_inflect_dict   = collections.defaultdict(list)
    N_attribute_dict = collections.defaultdict(int)
    for k, vx in N_inflect_dict_all.items():
        for vs in vx:
            if check:
                print(f"vs: {vs}")
            ## filtering out irrelevant cases: non nouns and proper nouns
            if target_lang in ['German']:
                if vs[0] != 'N' or vs[1] != 'Reg':
                    continue
                else:
                    print(f"processing: {vs}")
                    for v in vs:
                        N_attribute_dict[v] += 1
                    N_inflect_dict[k] = vs
            elif target_lang in ['French']:
                if vs[0] != 'Noun' or vs[1] != 'common':
                    continue
                else:
                    print(f"processing: {vs}")
                    for v in vs:
                        N_attribute_dict[v] += 1
                    N_inflect_dict[k] = vs
            else:
                try:
                    if vs[0] != 'Noun':
                        continue
                    else:
                        print(f"processing: {vs}")
                        for v in vs:
                            N_attribute_dict[v] += 1
                        N_inflect_dict[k] = vs
                except IndexError:
                    pass
    ##
    #N_attribute_dict

    ## get all attributes
    N_attributes_all = list(N_attribute_dict.keys())
    ## select effective attributes
    if target_lang in [ 'Czech' ]:
        gender_index    = [4,6,9,1]
        plurality_index = [2,10]
        case_index      = [5,8,7,12,3,11,13] # Nominative, Accusative, Dative, Genitiv

    elif target_lang in [ 'French' ]:
        gender_index    = [4,2,6]
        plurality_index = [7,5,3]
        case_index      = []
        
    elif target_lang in [ 'German' ]:
        gender_index    = [8,4,10]
        plurality_index = [3,9]
        case_index      = [5,2,7,6] # Nominative, Accusative, Dative, Genitiv

    elif target_lang in [ 'Irish' ]:
        gender_index    = [7,4]
        plurality_index = [1,5]
        case_index      = [3,6,9,8] # Nom=Acc, Genitive, Dative, Vocative

    #
    effective_index = gender_index + plurality_index + case_index
    if check:
        print(f"effective_index: {effective_index}")
    ##
    N_attributes = [ N_attributes_all[i] for i in effective_index ]
    #N_attributes

    ## create encoded_N_inflect_dict
    encoded_N_inflect_dict = collections.defaultdict(list)
    if target_lang in ['German']:
        min_size = 5
    else:
        min_size = 8
    #
    for k, vx in N_inflect_dict.items():
        print(f"processing: {k}; {vx}")
        ## fail-safe operation
        if len(vx) > min_size:
            for vs in vx:
                encoding = encode_attributes (vs, N_attributes)
                if check:
                    print(f"encoding: {encoding}")
                if len([ v for v in encoding.values() if v == True ]) > 0:
                    encoded_N_inflect_dict[k].append(encoding)
                else:
                    print(f"{k} failed encoding: {encoding}")
        ## normal operation
        else:
            encoding = encode_attributes (vx, N_attributes)
            if check:
                    print(f"encoding: {encoding}")
            if len([ v for v in encoding.values() if v == True ]) > 0:
                encoded_N_inflect_dict[k].append(encoding)
            else:
                print(f"{k} failed encoding: {encoding}")
    ##
    #encoded_N_inflect_dict
    ##
    import pandas as pd
    full_df = pd.DataFrame()
    if target_lang in ['German']:
        min_size = 5
    else:
        min_size = 8
    #
    for k, vx in encoded_N_inflect_dict.items():
        if len(vs) > min_size:
            for vs in vx:
                dfx = pd.DataFrame(data = vs)
                full_df = pd.concat([full_df, dfx], ignore_index = True) # Crucially, ignore_index
        else:
            dfx = pd.DataFrame(data = vx)
            full_df = pd.concat([full_df, dfx], ignore_index = True)
    ##
    ## Czech case merger
    merge_cases = True
    if target_lang in ['Czech'] and merge_cases:
        full_df.insert(loc = 3, column = 'Masc', value = full_df['Masc0'] + full_df['Masc1'])

    ## add form column
    full_df['form'] = encoded_N_inflect_dict.keys()

    ## remove too long and too short words
    import unicodedata
    full_df['size'] = full_df['form'].apply(lambda x: len(unicodedata.normalize('NFC', x)))
    full_df = full_df[ full_df['size'] <= max_doc_length ]
    full_df = full_df[ full_df['size'] >= min_doc_length ]
    ## sampling
    df = full_df.sample (sample_n)
    
    ## uncapitalize
    if uncapitalize:
        df.loc[:,'form'] = df['form'].apply(lambda x: str(x).lower())
    ##
    return df, N_attributes

In [44]:
def add_boundary_symbols (df, mark_end: bool = True, mark_start: bool = True, check: bool = True):
    ## add boundary symbols
    if mark_end or mark_start:
        mark_boundaries  = True
    else:
        mark_boundaries  = False
    ###
    if mark_boundaries:
        if mark_end and mark_start:
            hash_status  = "-hash-at-both"

        elif mark_end and not mark_start:
            hash_status  = "-hash-at-end"

        else:
            hash_status  = "-hashed-at-start"
    else:
        hash_status      = "-no-hash"
    print(f"hash_status: {hash_status}")
    ##
    if mark_boundaries:
        ## avoid re-adding hashes
        hashed_test = df['form'].apply(lambda x: str(x)[0] == '#' and str(x)[-1] == "#")
        print(f"hashed_test: {all (hashed_test == True)}")
        if any (hashed_test == True):
            df.loc[:,'form'] = df['form'].apply(lambda x: x.strip('#'))
            if mark_end and mark_start:
                df.loc[:,'form'] = df['form'].apply(lambda x: f"#{str(x)}#")
            elif mark_end:
                df.loc[:,'form'] = df['form'].apply(lambda x: f"{str(x)}#")
            elif mark_start:
                df.loc[:,'form'] = df['form'].apply(lambda x: f"#{str(x)}")
            else:
                df.loc[:,'form'] = df['form'].apply(lambda x: x.strip('#'))
        else:
            if mark_end and mark_start:
                df.loc[:,'form'] = df['form'].apply(lambda x: f"#{x}#")
            elif mark_end and not mark_start:
                df.loc[:,'form'] = df['form'].apply(lambda x: f"{x}#")
            elif not mark_end and mark_start:
                df.loc[:,'form'] = df['form'].apply(lambda x: f"#{x}")
            else:
                pass
    else:
        df.loc[:,'form'] = df['form'].apply(lambda x: x.strip('#'))

    ## check
    #df['form']
    return df

In [45]:
def add_ngram_to_df (dfx, n_for_ngram: int, prefix: str = "", skippy: bool = False, skippy_means_extended: bool = False, seg_joint: str = "", gap_mark: str = "…", max_distance = None, ngram_is_inclusive: bool = True, check: bool = False):
    """
    generic function for adding n-gram column to df with a specified n for ngram
    """
    import numpy as np
    ## set source_var
    source_var = f"{prefix}1gram"
    print(f"===================")
    print(f"source_var: {source_var}")
    unigrams = list(dfx[source_var]) # Crucially
    
    ## set target_var
    if skippy:
        if n_for_ngram > 1:
            target_var = f"{prefix}skippy{n_for_ngram}gram"
        else:
            target_var = f"{prefix}{n_for_ngram}gram"
    else:
        target_var = f"{prefix}{n_for_ngram}gram"
    print(f"target_var: {target_var}")
    
    ## (skippy) n-gram の生成
    import gen_ngrams
    if skippy:
        if skippy_means_extended:
            ngrams_inner = [ gen_ngrams.gen_extended_skippy_ngrams(x, n = n_for_ngram, sep = seg_joint, missing_mark = gap_mark, max_distance = max_distance, check = False) for x in unigrams ]
        else:
            ngrams_inner = [ gen_ngrams.gen_skippy_ngrams(x, n = n_for_ngram, sep = seg_joint, missing_mark = gap_mark, max_distance = max_distance, check = False) for x in unigrams ]

    else:
        ngrams_inner = [ gen_ngrams.gen_ngrams(x, n = n_for_ngram, sep = seg_joint, check = False) for x in unigrams ]
    if check:
        print(f"ngrams: {ngrams_inner}")
    
    ## 包括的 ngramの生成
    if ngram_is_inclusive:
        if skippy and n_for_ngram > 2:
            supplement_var = f"{prefix}skippy{n_for_ngram - 1}gram"
        else:
            supplement_var = f"{prefix}{n_for_ngram - 1}gram"
        print(f"supplement_var: {supplement_var}")
        ##
        for i, g in enumerate(ngrams_inner):
            supplement = [ x for x in list(dfx[supplement_var])[i] if gen_ngrams.skippy_ngram_size(x) < n_for_ngram and x not in g ]
            if check:
                print(f"supplements: {supplement}")
            if len(supplement) > 0:
                g.extend(supplement)
    
    ## 変数の追加
    #dfx.loc[:,target_var] = ngrams_inner
    dfx[target_var] = ngrams_inner
    
    ## check result
    print(dfx[target_var])

In [46]:
def build_bots (df: object, sample_n: int, hashedness: str, term_type: str, max_gap_val: int = None, ngram_is_inclusive: bool = True, skippy_means_extended: bool = False, boundary_symbol: str = '#', gap_mark: str = "…", reduce_DTM: bool = True, conservative: bool = True, check: bool = True):

    print(f"ngram_is_inclusive: {ngram_is_inclusive}")
    print(f"reduce_DTM: {reduce_DTM}")

    import math, re
    min_term_freq         = round (math.sqrt(sample_n)/10) + 1
    print(f"min_term_freq: {min_term_freq}")

    import unicodedata
    if conservative:
        df['1gram'] = [ [ y for y in unicodedata.normalize('NFC', x) if len(y) > 0 ] for x in df['form'] ]
    else:
        ## The code above turned out to be ineffective since it separates diacritics
        df.loc[:,'1gram'] = [ [ *unicodedata.normalize('NFC', x) ] for x in df['form'] ]
    #
    #df['1gram']
    
    ## add boundaries
    if hashedness == 'hashed-at-both':
        df.loc[:,'1gram'] = df['1gram'].apply(lambda x: [boundary_symbol] + x + [boundary_symbol] )
        
    ##
    for i in range(2, 6):
        print(f"adding {i}-gram column")
        add_ngram_to_df (df, n_for_ngram = i, skippy = False, check = False)
        add_ngram_to_df (df, n_for_ngram = i, skippy = True, skippy_means_extended = skippy_means_extended, max_distance = max_gap_val, check = False)
    ##
    return df[term_type]

In [47]:
def build_dtm (bots, reduce_DTM: bool = True, use_numerical_encoding: bool = True, check: bool = False):
    ##
    df_original_index = bots.index
    ## get term list
    from collections import defaultdict
    term_dict = defaultdict(int)
    for bot in bots:
        for term in bot:
            term_dict[term] += 1
    terms = sorted (list(term_dict.keys()), key = lambda x: len(x), reverse = True)
    ## build DTM: takes a few minutes to generate skippy{4,5}gram
    import numpy as np
    import multiprocess as mp
    import os
    from itertools import product
    pool = mp.Pool(max(os.cpu_count(), 1))
    R = pool.starmap(lambda t, b: any(list(map(lambda x: t in x, b))), product (terms, bots))
    ## reshape R for DataFrame creation
    L = np.reshape(np.array(R), (len(terms), -1))
    
    ## create DataFrame
    import pandas as pd
    #dtm_df = pd.DataFrame(L, index = df_original_index).T # fails
    dtm_df = pd.DataFrame(L, index = terms).T # transposition needed
    ## convert values
    if use_numerical_encoding:
        dtm_df = dtm_df.apply(lambda x: x.map ({True: 1, False: 0}), axis = 1)
    ##
    ## recover original index
    dtm_df = dtm_df.set_index (df_original_index)
    
    ##
    return dtm_df

In [48]:
def reduce_DTM (dtm, min_term_freq):
    
    print(f"reducing DTM by filtering terms with frequency less than {min_term_freq}")
    dfx = dtm.copy()
    #
    size0 = dfx.shape[1]
    print(f"original column size: {size0}")

    dfx = dfx.loc[:, (dfx.sum(axis = 0) >= min_term_freq)]
    #
    size1 = dfx.shape[1]
    print(f"reduced column size: {size1}")
    print(f"{size0 - size1} columns are discarded")
    dtm_df = dfx

    return dtm_df

In [49]:
def run_DT_analysis (X_train: object, y_train_single: object, X_test: object, y_test_single: object, target_lang: str, target_attrib: str, use_supplement: bool, term_type: str, hashedness: str, output: TextIO, check: bool = False):
    
    from sklearn import tree
    max_depth = 20
    dt_model = tree.DecisionTreeClassifier(max_depth = max_depth, random_state = 0, criterion = 'gini')
    #dt_fitting = dt_model.fit(X_train, y_train)
    dt_fitting_single = dt_model.fit(X_train, y_train_single)

    from sklearn.model_selection import RandomizedSearchCV
    from scipy.stats import randint
    # use random search to find the best hyperparameters
    dt = tree.DecisionTreeClassifier ()
    dt_search_params = { 'max_features': randint(10, 120), 'max_depth': randint(2, 40)}
    dt_rand_search = RandomizedSearchCV (dt, param_distributions = dt_search_params, n_iter = 5, cv = 5)

    # fit the random search object to the data
    dt_rand_search.fit (X_train, y_train_single)

    # determine variables for the best model
    best_dt = dt_rand_search.best_estimator_
    print('DT best hyper-parameters:', dt_rand_search.best_params_)
    
    # extract best parameter values
    best_max_depth = dt_rand_search.best_params_['max_depth']
    best_n_estimators = dt_rand_search.best_params_['max_features']
    
    ## evaluate DT model
    import numpy as np
    from sklearn.metrics import confusion_matrix, classification_report
    from sklearn.metrics import ConfusionMatrixDisplay
    #dt_predict = dt_fitting.predict(X_test)
    best_dt_predict = best_dt.predict(X_test)
    
    ## Score report is compatible with multi-labels
    print(f"DT classification (max_depth: {best_max_depth}, n_estimators: {best_n_estimators}) of {target_attrib} [supplement: {use_supplement}] of {target_lang} using {term_type}{hashedness}", file = output)
    print(classification_report(y_test_single, best_dt_predict, zero_division = 0.0), file = output)

In [50]:
def run_RT_analysis (X_train: object, y_train_single: object, X_test: object, y_test_single: object, target_lang: str, target_attrib: str, use_supplement: bool, term_type: str, hashedness: str, output: TextIO, check: bool = False):

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RandomizedSearchCV
    from scipy.stats import randint
    # use random search to find the best hyperparameters
    rf = RandomForestClassifier ()
    rf_searchc_params = { 'n_estimators': randint(10, 120), 'max_depth': randint(2, 40)}
    rf_rand_search = RandomizedSearchCV (rf, param_distributions = rf_searchc_params, n_iter = 5, cv = 5)
    # fit the random search object to the data
    rf_rand_search.fit(X_train, y_train_single)
    # determine variables for the best model
    best_rf = rf_rand_search.best_estimator_
    print('RF best hyper-parameters:', rf_rand_search.best_params_)
    # extract best parameter values
    best_max_depth = rf_rand_search.best_params_['max_depth']
    best_n_estimators = rf_rand_search.best_params_['n_estimators']
    
    ## evaluate RF model
    import numpy as np
    from sklearn.metrics import confusion_matrix, classification_report
    from sklearn.metrics import ConfusionMatrixDisplay
    rf_predict = best_rf.predict(X_test)
    ##
    print (f"\nRF classification (max_depth: {best_max_depth}, n_estimators: {best_n_estimators}) of {target_attrib} [supplement: {use_supplement}]  using: {term_type}{hashedness}", file = output)
    print (classification_report(y_test_single, rf_predict, zero_division = 0), file = output)

In [51]:
def run_NN_analysis (X_train, y_train, X_test, y_test, target_lang: str, target_attrib: str, use_supplement: bool, term_type: str, hashedness: str, output: TextIO, use_Adam: bool = True, check: bool = False):

    ## build mNN model
    import numpy as np
    from keras.models import Sequential
    from keras.layers import Dense, Dropout, Activation
    from keras.optimizers import SGD, Adam
    ##
    mNN_model = Sequential()
    ## settings
    input_size  = X_train.shape[1]
    print(f"input_size: {input_size}")
    output_size = y_train.shape[1]
    print(f"output_size: {output_size}")

    import math
    k = 1.5
    base_n        = round (k * math.sqrt(input_size))
    dropout_rate  = 0.1
    ## activation for input and hidden layers
    activation_funcs         = [ 'sigmoid', 'tanh', 'relu', 'softmax' ]
    activation_func          = activation_funcs[1]
    print(f"activation_func: {activation_func}")

    ## activation for output layer
    output_activation_func   = activation_funcs[1]
    print(f"output_activation_func: {output_activation_func}")

    ## input layer
    mNN_model.add (Dense(base_n, activation = activation_func, input_dim = input_size))
    mNN_model.add (Dropout(dropout_rate))

    ## hidden layer
    n_hidden_layers = 3
    divider         = 3

    layer_ids = range (1, n_hidden_layers + 1)
    down_sized_layers = [2,4,6,8]
    for i in layer_ids:
        if i in down_sized_layers:
            n_units = int(round (base_n / divider, 0))
            print (f"adding {n_units} units at hidden layer {i}")
        else:
            n_units = base_n
            print (f"adding {n_units} units at hidden layer {i}")
        #
        mNN_model.add (Dense(input_dim = input_size, units = n_units))
        mNN_model.add (Activation(activation_func))
        mNN_model.add (Dropout(dropout_rate))

    ## output layer
    forced_choice = True
    if target_attrib in [ 'gender', 'plurality', 'case' ]:
        if forced_choice:
            output_activation_func = 'softmax'
        else:
            output_activation_func = activation_func
    print(f"output_activation_func is reset to: {output_activation_func}")
    ##
    mNN_model.add (Dense(output_size, activation = output_activation_func))

    ##
    #use_Adam = True
    lr_val = 0.01
    adam = Adam(learning_rate = lr_val)
    sgd  = SGD (learning_rate = lr_val, decay = 1e-6, momentum = 0.9, nesterov = True)
    if use_Adam:
        mNN_model.compile (loss = 'categorical_crossentropy', optimizer = adam, metrics = ['accuracy'])
    else:
        mNN_model.compile (loss = 'categorical_crossentropy', optimizer = sgd, metrics = ['accuracy'])
    
    print("compilation NN model done")

    ## train mNN model: requires Python 3.11 or later (3.10 hangs at 1/epochs)
    mNN_model.fit (X_train, y_train, epochs = 200, verbose = 0)
    
    ## generate mNN_predict
    mNN_predict = mNN_model.predict (X_test)

    ## value conversion on mNN predict
    if output_activation_func == 'softmax':
        #threshold = 1/len(target.columns)
        threshold = 1/len(y_train.columns)
        mNN_predict [ mNN_predict >= threshold ] = int(1)
        mNN_predict [ mNN_predict < threshold ]  = int(0)
    else:
        if  output_activation_func == 'sigmoid':
            threshold = 0.5
            mNN_predict [ mNN_predict >= threshold ] = int(1)
            mNN_predict [ mNN_predict < threshold ]  = int(0)
        elif output_activation_func == 'tanh':
            threshold = 0
            mNN_predict [ mNN_predict >= threshold ] = int(1)
            mNN_predict [ mNN_predict < threshold ]  = int(0)
    print(f"output_activation_func: {output_activation_func}")
    print(f"threshold set to: {threshold}")
    
    ## evaluate mNN
    from sklearn.metrics import classification_report
    print(f"output_activation_func: {output_activation_func}")
    if output_activation_func == 'softmax':
        ## convert distribution to determnisitc values using argmax
        mNN_predict_converted = [ x.argmax() for x in mNN_predict ]
        y_test_converted = [ x.argmax() for i, x in y_test.iterrows() ]
        ##
        print (f"MM classification of {target_attrib} [supplement: {use_supplement}] in {target_lang} using: {term_type}{hashedness}", file = output)
        print (classification_report(y_test_converted, mNN_predict_converted, zero_division = 0.0), file = output)
    else:
        for i in range(len(y_train)):
            test, predict = y_test.iloc[:,i], list(map(int, mNN_predict[:,i]))
            ##
            print(f"NN classification of {target_attrib} [supplement: {use_supplement}] in {target_lang} for {y_train[i]} using {term_type}{hashedness}", file = output)
            print(classification_report(test, predict, zero_division = 0.0), file = output)

In [52]:
def run_analyses (encoded, target, target_lang, target_attrib, use_supplement, term_type, hashedness, output, check: bool = False):
    #
    ## cross-validation
    test_size_rate  = 0.1
    print(f"test_size_rate [cross validation]: {test_size_rate}")
    
    ## define training and test sets
    from sklearn.model_selection import train_test_split
    test_size_rate = 0.1
    X_train, X_test, y_train, y_test = \
        train_test_split (encoded, target, test_size = test_size_rate, random_state = 0)
    
    ## create singlified versions of y_ variables
    import utils
    reload_module = False
    if reload_module:
        from importlib import reload
        reload (utils)
    ##
    labels = list (y_train.columns)
    print (f"labels to use: {labels}")
    failure_mark = 'xxx'
    y_train_single = utils.singlify_labels (y_train, labels, failure_mark = failure_mark, check = False)
    y_test_single  = utils.singlify_labels (y_test, labels, failure_mark = failure_mark, check = False)
    
    ## Decision Tree
    run_DT_analysis (X_train, y_train_single, X_test, y_test_single, target_lang, target_attrib, use_supplement, term_type, hashedness, output)
    ## Random Forest
    run_RT_analysis (X_train, y_train_single, X_test, y_test_single, target_lang, target_attrib, use_supplement, term_type, hashedness, output)
    ## Neural Network
    run_NN_analysis (X_train, y_train, X_test, y_test, target_lang, target_attrib, use_supplement, term_type, hashedness, output, use_Adam = True)

In [53]:
def classify_words (df, dtm, target_lang, N_attributes, target_attrib, use_supplement, merge_Czech_genders, ignore_vocative, term_type, hashedness, output, check: bool = False):
    ##
    print (f"target_attrib: {target_attrib}")
    ##
    if target_attrib == 'gender':
        ## Czech
        if target_lang in [ 'Czech' ]:
            if merge_Czech_genders:
                target_cols = [ 'Fem', 'Masc', 'Neut' ]
            else:
                target_cols = [ 'Fem', 'Masc0', 'Masc1', 'Neut' ]
        ## German
        elif target_lang in [ 'German' ]:
            target_cols    = [ 'Fem', 'Masc', 'Neut' ]
        ## French
        elif target_lang in [ 'French' ]:
            ignore_Comm = False
            if ignore_Comm:
                target_cols = [ 'Fem', 'Masc' ]
            else:
                target_cols = [ 'Fem', 'Masc', 'Comm' ]
        ## Others
        else:
            target_cols  = [ 'Fem', 'Masc' ]
    ##
    elif target_attrib == 'plurality':
        ## French
        if target_lang in [ 'French' ]:
            target_cols  = [ 'Sg', 'Pl', 'Inv' ]
        ## Others
        else:
            target_cols  = [ 'Sg', 'Pl' ]
    elif target_attrib == 'case':
        if target_lang in [ 'Czech' ]:
            if ignore_vocative:
                target_cols = [ 'Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc' ]
            else:
                target_cols = [ 'Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc', 'Voc' ]
        elif target_lang in [ 'German' ]:
            if ignore_vocative:
                target_cols = [ 'Nom', 'Acc', 'Dat', 'Gen' ]
            else:
                target_cols = [ 'Nom', 'Acc', 'Dat', 'Gen', 'Voc' ]
        elif target_lang in [ 'Irish' ]:
            if ignore_vocative:
                target_cols = [ 'Nom', 'Dat', 'Gen' ]
            else:
                target_cols = [ 'Nom', 'Dat', 'Gen', 'Voc' ]
        else:
            print(f"Attribute {target_attrib} not defined for {target_lang}")
            #raise UnboundLocalError
            return None
    ##
    target             = df[target_cols]
    if check:
        target

    ## remove offensive lines
    remove_offensive_rows = True
    original_len = len(target)
    original_len == len(dtm)
    offensive_indices = target[target.sum(axis = 1) == 0].index
    print("indices of offensive rows")
    offensive_indices
    
    #offensive_indices
    if remove_offensive_rows:
        target = target.drop (offensive_indices)
        dtm_original = dtm.copy()
        dtm = dtm_original.drop (offensive_indices) # dtm_df cannot be used here
        print (f"{len(offensive_indices)} rows are offensive and removed")
    
    ## add Inv if plurality is target
    import numpy as np
    if target_attrib in [ 'plurality' ]:
        try:
            target['Inv']
        except KeyError:
            target['Inv'] = [ 1 if x > 1 else 0 for x in (target['Sg'] + target['Pl']) ]
        ## change original values
        target.loc[:,'Sg'] = np.where(target['Inv'] == 1, 0, target.Sg)
        target.loc[:,'Pl'] = np.where(target['Inv'] == 1, 0, target.Pl)
        ##
        target_cols = target.columns
    ##
    if check:
        target

    ## define label_to_int
    label_to_int = { name: i for i, name in enumerate (sorted (target_cols)) }
    label_to_int
    
    ## define supplement
    supplement_cols  = [ x for x in N_attributes if not x in target_cols ]
    supplement       = df[supplement_cols]
    if check:
        supplement

    ## use supplement or not
    #use_supplement   = False
    if use_supplement:
        encoded = dtm.join (supplement)
    else:
        encoded = dtm
    ##
    if check:
        encoded
    ##
    run_analyses (encoded, target, target_lang, target_attrib, use_supplement, term_type, hashedness, output)

# main

In [54]:
## install keras, tensorflow if required
#!conda install keras tensorflow -y # ineffective
#!pip install -U keras tensorflow
#!conda update conda -y

In [55]:
## main
## parameters
max_doc_length       = 9 # longer doc takes longer time to process
min_doc_length       = 3
sample_n             = 3000
print(f"sample_n: {sample_n}")

import math
min_term_freq        = round (math.sqrt(sample_n)/10) + 1
print(f"min_term_freq: {min_term_freq}")

ngram_is_inclusive   = True
print(f"ngram_is_inclusive: {ngram_is_inclusive}")

max_gap_ratio        = 1.00
max_gap_val          = round (max_doc_length* max_gap_ratio)
print(f"max_gap_val: {max_gap_val}")

## language
target_langs = [ 'Czech', 'French', 'German', 'Irish' ]
merge_Czech_genders  = False
ignore_vocative      = True

## boundary marking
hashedness_values = [ '-no-hash', '-hash-at-both' ]

## term settings
term_types   = [ f"{skippiness}{n}gram" for skippiness in [ '', 'skippy' ] for n in range(2,5) ]

## skippy_ngram
skippy_means_extended = True

## attribute setting
target_attribs = [ 'gender', 'plurality', 'case' ]

##
testing = False
if testing:
    import random
    #target_langs = target_langs[:2]
    target_langs = random.sample(target_langs, 2)
    hashedness_values = hashedness_values[:1]
    term_types = term_types[:3]
    #target_attribs = target_attribs[:2]
    target_attribs = random.sample(target_attribs, 2)
##
reduce_target_langs = True
if reduce_target_langs:
    target_langs = [ 'Czech' ]
    #target_langs = [ 'French', 'German', 'Irish' ]
##
for target_lang in target_langs:
    print (f"Creating data for target_lang: {target_lang}")
    ## open a file to output
    import datetime
    now = datetime.datetime.now()
    time_signature = "-".join(map(str, [now.year, now.month, now.day, now.hour, now.minute]))
    if skippy_means_extended:
        file = f"./results/result-{target_lang}-sample{sample_n}-xsk-mgv{max_gap_val}-{time_signature}.txt"
    else:
        file = f"./results/result-{target_lang}-sample{sample_n}-sk-mgv{max_gap_val}-{time_signature}.txt"
    with open (file, 'w', encoding = 'utf-8') as output:
        for hashedness_value in hashedness_values:
            print(f"hashedness_value: {hashedness_value}")
            for term_type in term_types:
                print(f"term_type: {term_type}")
                data_df, N_attributes = create_df (target_lang, max_doc_length, min_doc_length, sample_n)
                print (data_df)
                print (N_attributes)
                ##
                print (f"building DTM from {term_type}, {hashedness_value}...")
                bots = build_bots (data_df, sample_n, hashedness_value, term_type, max_gap_val, term_type, skippy_means_extended = skippy_means_extended)
                print (bots)
                ##
                dtm = build_dtm (bots)
                if reduce_DTM:
                    dtm = reduce_DTM (dtm, min_term_freq)
                print (dtm)
                ##
                for target_attrib in target_attribs:
                    print(f"attribute: {target_attrib} under {term_type}{hashedness_value} for {target_lang}")
                    for use_supplement in [ True, False ]:
                        print(f"use_supplement: {use_supplement}")
                        classify_words (data_df, dtm, target_lang, N_attributes, target_attrib, merge_Czech_genders, ignore_vocative, use_supplement, term_type, hashedness_value, output)
                print("===================================")
##
print(f"all analyses done")

sample_n: 3000
min_term_freq: 6
ngram_is_inclusive: True
max_gap_val: 9
Creating data for target_lang: Czech
hashedness_value: -no-hash
term_type: 2gram
['Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv',
 'Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv',
 'Inflected/Czech/moře-concordance_preloaded_gutenberg20_cs_20241210060558.csv',
 'Inflected/Czech/muž-concordance_preloaded_gutenberg20_cs_20241210060526.csv',
 'Inflected/Czech/pan-concordance_preloaded_gutenberg20_cs_20241126115401.csv',
 'Inflected/Czech/pes-concordance_preloaded_gutenberg20_cs_20241210060453.csv',
 'Inflected/Czech/voda-concordance_preloaded_gutenberg20_cs_20241210060619.csv']
opening Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv
opening Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv
opening Inflected/Czech/moře-concordance_preloaded_gutenberg20_cs_20241210060558.csv
opening Inflected/C

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: gender
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Fem', 'Masc', 'Neut']
DT best hyper-parameters: {'max_depth': 26, 'max_features': 43}
RF best hyper-parameters: {'max_depth': 21, 'n_estimators': 45}
input_size: 457
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 32 units at hidden layer 1
adding 11 units at hidden layer 2
adding 32 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: plurality under 2gram-no-hash for Czech
use_supplement: True
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 19, 'max_features': 99}
RF best hyper-parameters: {'max_depth': 34, 'n_estimators': 20}
input_size: 457
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 32 units at hidden layer 1
adding 11 units at hidden layer 2
adding 32 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 13, 'max_features': 118}
RF best hyper-parameters: {'max_depth': 17, 'n_estimators': 76}
input_size: 457
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 32 units at hidden layer 1
adding 11 units at hidden layer 2
adding 32 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: case under 2gram-no-hash for Czech
use_supplement: True
target_attrib: case
indices of offensive rows
66 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc']
DT best hyper-parameters: {'max_depth': 15, 'max_features': 69}
RF best hyper-parameters: {'max_depth': 14, 'n_estimators': 98}
input_size: 457
output_size: 6
activation_func: tanh
output_activation_func: tanh
adding 32 units at hidden layer 1
adding 11 units at hidden layer 2
adding 32 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
output_activation_func: softmax
threshold set to: 0.16666666666666666
output_activation_func: softmax
use_supplement: False
target_attrib: case
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc', 'Voc']
DT best hyper-parameters: {'max_depth': 14, 'max_features': 102}
RF best hyper-parameters: {'max_depth': 14, 'n_estimators': 102}
input_size: 457
output_size: 7
activation_func: tanh
output_activation_func: tanh
adding 32 units at hidden layer 1
adding 11 units at hidden layer 2
adding 32 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
output_activation_func: softmax
threshold set to: 0.14285714285714285
output_activation_func: softmax
term_type: 3gram
['Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv',
 'Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv',
 'Inflected/Czech/moře-concordance_preloaded_gutenberg20_cs_20241210060558.csv',
 'Inflected/Czech/muž-concordance_preloaded_gutenberg20_cs_20241210060526.csv',
 'Inflected/Czech/pan-concordance_preloaded_gutenberg20_cs_20241126115401.csv',
 'Inflected/Czech/pes-concordance_preloaded_gutenberg20_cs_20241210060453.csv',
 'Inflected/Czech/voda-concordance_preloaded_gutenberg20_cs_20241210060619.csv']
opening Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv
opening Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv
opening Inflected/Czech/moře-concordance_preloaded_gutenberg20_c

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: gender
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Fem', 'Masc', 'Neut']
DT best hyper-parameters: {'max_depth': 20, 'max_features': 57}
RF best hyper-parameters: {'max_depth': 28, 'n_estimators': 112}
input_size: 1011
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 48 units at hidden layer 1
adding 16 units at hidden layer 2
adding 48 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: plurality under 3gram-no-hash for Czech
use_supplement: True
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 24, 'max_features': 111}
RF best hyper-parameters: {'max_depth': 29, 'n_estimators': 63}
input_size: 1011
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 48 units at hidden layer 1
adding 16 units at hidden layer 2
adding 48 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 13, 'max_features': 111}
RF best hyper-parameters: {'max_depth': 29, 'n_estimators': 76}
input_size: 1011
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 48 units at hidden layer 1
adding 16 units at hidden layer 2
adding 48 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: case under 3gram-no-hash for Czech
use_supplement: True
target_attrib: case
indices of offensive rows
71 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc']
DT best hyper-parameters: {'max_depth': 22, 'max_features': 61}
RF best hyper-parameters: {'max_depth': 31, 'n_estimators': 80}
input_size: 1011
output_size: 6
activation_func: tanh
output_activation_func: tanh
adding 48 units at hidden layer 1
adding 16 units at hidden layer 2
adding 48 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
output_activation_func: softmax
threshold set to: 0.16666666666666666
output_activation_func: softmax
use_supplement: False
target_attrib: case
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc', 'Voc']
DT best hyper-parameters: {'max_depth': 28, 'max_features': 73}
RF best hyper-parameters: {'max_depth': 24, 'n_estimators': 107}
input_size: 1011
output_size: 7
activation_func: tanh
output_activation_func: tanh
adding 48 units at hidden layer 1
adding 16 units at hidden layer 2
adding 48 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
output_activation_func: softmax
threshold set to: 0.14285714285714285
output_activation_func: softmax
term_type: 4gram
['Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv',
 'Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv',
 'Inflected/Czech/moře-concordance_preloaded_gutenberg20_cs_20241210060558.csv',
 'Inflected/Czech/muž-concordance_preloaded_gutenberg20_cs_20241210060526.csv',
 'Inflected/Czech/pan-concordance_preloaded_gutenberg20_cs_20241126115401.csv',
 'Inflected/Czech/pes-concordance_preloaded_gutenberg20_cs_20241210060453.csv',
 'Inflected/Czech/voda-concordance_preloaded_gutenberg20_cs_20241210060619.csv']
opening Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv
opening Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv
opening Inflected/Czech/moře-concordance_preloaded_gutenberg20_c

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: gender
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Fem', 'Masc', 'Neut']
DT best hyper-parameters: {'max_depth': 25, 'max_features': 82}
RF best hyper-parameters: {'max_depth': 39, 'n_estimators': 64}
input_size: 1150
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 51 units at hidden layer 1
adding 17 units at hidden layer 2
adding 51 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: plurality under 4gram-no-hash for Czech
use_supplement: True
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 35, 'max_features': 90}
RF best hyper-parameters: {'max_depth': 34, 'n_estimators': 43}
input_size: 1150
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 51 units at hidden layer 1
adding 17 units at hidden layer 2
adding 51 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 16, 'max_features': 115}
RF best hyper-parameters: {'max_depth': 30, 'n_estimators': 71}
input_size: 1150
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 51 units at hidden layer 1
adding 17 units at hidden layer 2
adding 51 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: case under 4gram-no-hash for Czech
use_supplement: True
target_attrib: case
indices of offensive rows
59 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc']
DT best hyper-parameters: {'max_depth': 17, 'max_features': 118}
RF best hyper-parameters: {'max_depth': 18, 'n_estimators': 98}
input_size: 1150
output_size: 6
activation_func: tanh
output_activation_func: tanh
adding 51 units at hidden layer 1
adding 17 units at hidden layer 2
adding 51 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
output_activation_func: softmax
threshold set to: 0.16666666666666666
output_activation_func: softmax
use_supplement: False
target_attrib: case
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc', 'Voc']
DT best hyper-parameters: {'max_depth': 20, 'max_features': 101}
RF best hyper-parameters: {'max_depth': 37, 'n_estimators': 60}
input_size: 1150
output_size: 7
activation_func: tanh
output_activation_func: tanh
adding 51 units at hidden layer 1
adding 17 units at hidden layer 2
adding 51 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
output_activation_func: softmax
threshold set to: 0.14285714285714285
output_activation_func: softmax
term_type: skippy2gram
['Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv',
 'Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv',
 'Inflected/Czech/moře-concordance_preloaded_gutenberg20_cs_20241210060558.csv',
 'Inflected/Czech/muž-concordance_preloaded_gutenberg20_cs_20241210060526.csv',
 'Inflected/Czech/pan-concordance_preloaded_gutenberg20_cs_20241126115401.csv',
 'Inflected/Czech/pes-concordance_preloaded_gutenberg20_cs_20241210060453.csv',
 'Inflected/Czech/voda-concordance_preloaded_gutenberg20_cs_20241210060619.csv']
opening Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv
opening Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv
opening Inflected/Czech/moře-concordance_preloaded_gutenbe

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: gender
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Fem', 'Masc', 'Neut']
DT best hyper-parameters: {'max_depth': 36, 'max_features': 118}
RF best hyper-parameters: {'max_depth': 39, 'n_estimators': 30}
input_size: 3071
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 83 units at hidden layer 1
adding 28 units at hidden layer 2
adding 83 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: plurality under skippy2gram-no-hash for Czech
use_supplement: True
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 19, 'max_features': 111}
RF best hyper-parameters: {'max_depth': 27, 'n_estimators': 36}
input_size: 3071
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 83 units at hidden layer 1
adding 28 units at hidden layer 2
adding 83 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 35, 'max_features': 72}
RF best hyper-parameters: {'max_depth': 35, 'n_estimators': 98}
input_size: 3071
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 83 units at hidden layer 1
adding 28 units at hidden layer 2
adding 83 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: case under skippy2gram-no-hash for Czech
use_supplement: True
target_attrib: case
indices of offensive rows
64 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc']
DT best hyper-parameters: {'max_depth': 22, 'max_features': 113}
RF best hyper-parameters: {'max_depth': 37, 'n_estimators': 107}
input_size: 3071
output_size: 6
activation_func: tanh
output_activation_func: tanh
adding 83 units at hidden layer 1
adding 28 units at hidden layer 2
adding 83 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.16666666666666666
output_activation_func: softmax
use_supplement: False
target_attrib: case
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc', 'Voc']
DT best hyper-parameters: {'max_depth': 33, 'max_features': 114}
RF best hyper-parameters: {'max_depth': 39, 'n_estimators': 66}
input_size: 3071
output_size: 7
activation_func: tanh
output_activation_func: tanh
adding 83 units at hidden layer 1
adding 28 units at hidden layer 2
adding 83 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.14285714285714285
output_activation_func: softmax
term_type: skippy3gram
['Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv',
 'Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv',
 'Inflected/Czech/moře-concordance_preloaded_gutenberg20_cs_20241210060558.csv',
 'Inflected/Czech/muž-concordance_preloaded_gutenberg20_cs_20241210060526.csv',
 'Inflected/Czech/pan-concordance_preloaded_gutenberg20_cs_20241126115401.csv',
 'Inflected/Czech/pes-concordance_preloaded_gutenberg20_cs_20241210060453.csv',
 'Inflected/Czech/voda-concordance_preloaded_gutenberg20_cs_20241210060619.csv']
opening Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv
opening Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv
opening Inflected/Czech/moře-concordance_preloaded_gutenbe

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: gender
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Fem', 'Masc', 'Neut']
DT best hyper-parameters: {'max_depth': 27, 'max_features': 39}
RF best hyper-parameters: {'max_depth': 31, 'n_estimators': 92}
input_size: 9877
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 149 units at hidden layer 1
adding 50 units at hidden layer 2
adding 149 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: plurality under skippy3gram-no-hash for Czech
use_supplement: True
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 14, 'max_features': 103}
RF best hyper-parameters: {'max_depth': 35, 'n_estimators': 63}
input_size: 9877
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 149 units at hidden layer 1
adding 50 units at hidden layer 2
adding 149 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 28, 'max_features': 91}
RF best hyper-parameters: {'max_depth': 28, 'n_estimators': 25}
input_size: 9877
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 149 units at hidden layer 1
adding 50 units at hidden layer 2
adding 149 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: case under skippy3gram-no-hash for Czech
use_supplement: True
target_attrib: case
indices of offensive rows
74 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc']
DT best hyper-parameters: {'max_depth': 24, 'max_features': 109}
RF best hyper-parameters: {'max_depth': 30, 'n_estimators': 108}
input_size: 9877
output_size: 6
activation_func: tanh
output_activation_func: tanh
adding 149 units at hidden layer 1
adding 50 units at hidden layer 2
adding 149 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
output_activation_func: softmax
threshold set to: 0.16666666666666666
output_activation_func: softmax
use_supplement: False
target_attrib: case
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc', 'Voc']
DT best hyper-parameters: {'max_depth': 30, 'max_features': 111}
RF best hyper-parameters: {'max_depth': 39, 'n_estimators': 102}
input_size: 9877
output_size: 7
activation_func: tanh
output_activation_func: tanh
adding 149 units at hidden layer 1
adding 50 units at hidden layer 2
adding 149 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
output_activation_func: softmax
threshold set to: 0.14285714285714285
output_activation_func: softmax
term_type: skippy4gram
['Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv',
 'Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv',
 'Inflected/Czech/moře-concordance_preloaded_gutenberg20_cs_20241210060558.csv',
 'Inflected/Czech/muž-concordance_preloaded_gutenberg20_cs_20241210060526.csv',
 'Inflected/Czech/pan-concordance_preloaded_gutenberg20_cs_20241126115401.csv',
 'Inflected/Czech/pes-concordance_preloaded_gutenberg20_cs_20241210060453.csv',
 'Inflected/Czech/voda-concordance_preloaded_gutenberg20_cs_20241210060619.csv']
opening Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv
opening Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv
opening Inflected/Czech/moře-concordance_preloaded_gutenber

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


adding 172 units at hidden layer 1
adding 57 units at hidden layer 2
adding 172 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done




[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: gender
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Fem', 'Masc', 'Neut']
DT best hyper-parameters: {'max_depth': 38, 'max_features': 108}
RF best hyper-parameters: {'max_depth': 26, 'n_estimators': 99}
input_size: 13110
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 172 units at hidden layer 1
adding 57 units at hidden layer 2
adding 172 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: plurality under skippy4gram-no-hash for Czech
use_supplement: True
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 37, 'max_features': 71}
RF best hyper-parameters: {'max_depth': 36, 'n_estimators': 21}
input_size: 13110
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 172 units at hidden layer 1
adding 57 units at hidden layer 2
adding 172 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 24, 'max_features': 97}
RF best hyper-parameters: {'max_depth': 32, 'n_estimators': 112}
input_size: 13110
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 172 units at hidden layer 1
adding 57 units at hidden layer 2
adding 172 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: case under skippy4gram-no-hash for Czech
use_supplement: True
target_attrib: case
indices of offensive rows
64 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc']
DT best hyper-parameters: {'max_depth': 25, 'max_features': 54}
RF best hyper-parameters: {'max_depth': 24, 'n_estimators': 42}
input_size: 13110
output_size: 6
activation_func: tanh
output_activation_func: tanh
adding 172 units at hidden layer 1
adding 57 units at hidden layer 2
adding 172 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
output_activation_func: softmax
threshold set to: 0.16666666666666666
output_activation_func: softmax
use_supplement: False
target_attrib: case
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc', 'Voc']
DT best hyper-parameters: {'max_depth': 33, 'max_features': 99}
RF best hyper-parameters: {'max_depth': 26, 'n_estimators': 46}
input_size: 13110
output_size: 7
activation_func: tanh
output_activation_func: tanh
adding 172 units at hidden layer 1
adding 57 units at hidden layer 2
adding 172 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
output_activation_func: softmax
threshold set to: 0.14285714285714285
output_activation_func: softmax
hashedness_value: -hash-at-both
term_type: 2gram
['Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv',
 'Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv',
 'Inflected/Czech/moře-concordance_preloaded_gutenberg20_cs_20241210060558.csv',
 'Inflected/Czech/muž-concordance_preloaded_gutenberg20_cs_20241210060526.csv',
 'Inflected/Czech/pan-concordance_preloaded_gutenberg20_cs_20241126115401.csv',
 'Inflected/Czech/pes-concordance_preloaded_gutenberg20_cs_20241210060453.csv',
 'Inflected/Czech/voda-concordance_preloaded_gutenberg20_cs_20241210060619.csv']
opening Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv
opening Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv
opening Inflected/Czech/moře-con

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: gender
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Fem', 'Masc', 'Neut']
DT best hyper-parameters: {'max_depth': 29, 'max_features': 62}
RF best hyper-parameters: {'max_depth': 35, 'n_estimators': 46}
input_size: 460
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 32 units at hidden layer 1
adding 11 units at hidden layer 2
adding 32 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: plurality under 2gram-hash-at-both for Czech
use_supplement: True
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 14, 'max_features': 49}
RF best hyper-parameters: {'max_depth': 38, 'n_estimators': 37}
input_size: 460
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 32 units at hidden layer 1
adding 11 units at hidden layer 2
adding 32 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 19, 'max_features': 49}
RF best hyper-parameters: {'max_depth': 32, 'n_estimators': 63}
input_size: 460
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 32 units at hidden layer 1
adding 11 units at hidden layer 2
adding 32 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: case under 2gram-hash-at-both for Czech
use_supplement: True
target_attrib: case
indices of offensive rows
65 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc']
DT best hyper-parameters: {'max_depth': 19, 'max_features': 102}
RF best hyper-parameters: {'max_depth': 17, 'n_estimators': 41}
input_size: 460
output_size: 6
activation_func: tanh
output_activation_func: tanh
adding 32 units at hidden layer 1
adding 11 units at hidden layer 2
adding 32 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.16666666666666666
output_activation_func: softmax
use_supplement: False
target_attrib: case
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc', 'Voc']
DT best hyper-parameters: {'max_depth': 36, 'max_features': 101}
RF best hyper-parameters: {'max_depth': 39, 'n_estimators': 68}
input_size: 460
output_size: 7
activation_func: tanh
output_activation_func: tanh
adding 32 units at hidden layer 1
adding 11 units at hidden layer 2
adding 32 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
output_activation_func: softmax
threshold set to: 0.14285714285714285
output_activation_func: softmax
term_type: 3gram
['Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv',
 'Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv',
 'Inflected/Czech/moře-concordance_preloaded_gutenberg20_cs_20241210060558.csv',
 'Inflected/Czech/muž-concordance_preloaded_gutenberg20_cs_20241210060526.csv',
 'Inflected/Czech/pan-concordance_preloaded_gutenberg20_cs_20241126115401.csv',
 'Inflected/Czech/pes-concordance_preloaded_gutenberg20_cs_20241210060453.csv',
 'Inflected/Czech/voda-concordance_preloaded_gutenberg20_cs_20241210060619.csv']
opening Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv
opening Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv
opening Inflected/Czech/moře-concordance_preloaded_gutenberg20_c

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: gender
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Fem', 'Masc', 'Neut']
DT best hyper-parameters: {'max_depth': 32, 'max_features': 53}
RF best hyper-parameters: {'max_depth': 30, 'n_estimators': 94}
input_size: 1024
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 48 units at hidden layer 1
adding 16 units at hidden layer 2
adding 48 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: plurality under 3gram-hash-at-both for Czech
use_supplement: True
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 23, 'max_features': 98}
RF best hyper-parameters: {'max_depth': 34, 'n_estimators': 92}
input_size: 1024
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 48 units at hidden layer 1
adding 16 units at hidden layer 2
adding 48 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 18, 'max_features': 56}
RF best hyper-parameters: {'max_depth': 38, 'n_estimators': 103}
input_size: 1024
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 48 units at hidden layer 1
adding 16 units at hidden layer 2
adding 48 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: case under 3gram-hash-at-both for Czech
use_supplement: True
target_attrib: case
indices of offensive rows
73 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc']
DT best hyper-parameters: {'max_depth': 11, 'max_features': 111}
RF best hyper-parameters: {'max_depth': 31, 'n_estimators': 71}
input_size: 1024
output_size: 6
activation_func: tanh
output_activation_func: tanh
adding 48 units at hidden layer 1
adding 16 units at hidden layer 2
adding 48 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
output_activation_func: softmax
threshold set to: 0.16666666666666666
output_activation_func: softmax
use_supplement: False
target_attrib: case
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc', 'Voc']
DT best hyper-parameters: {'max_depth': 16, 'max_features': 117}
RF best hyper-parameters: {'max_depth': 28, 'n_estimators': 118}
input_size: 1024
output_size: 7
activation_func: tanh
output_activation_func: tanh
adding 48 units at hidden layer 1
adding 16 units at hidden layer 2
adding 48 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
output_activation_func: softmax
threshold set to: 0.14285714285714285
output_activation_func: softmax
term_type: 4gram
['Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv',
 'Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv',
 'Inflected/Czech/moře-concordance_preloaded_gutenberg20_cs_20241210060558.csv',
 'Inflected/Czech/muž-concordance_preloaded_gutenberg20_cs_20241210060526.csv',
 'Inflected/Czech/pan-concordance_preloaded_gutenberg20_cs_20241126115401.csv',
 'Inflected/Czech/pes-concordance_preloaded_gutenberg20_cs_20241210060453.csv',
 'Inflected/Czech/voda-concordance_preloaded_gutenberg20_cs_20241210060619.csv']
opening Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv
opening Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv
opening Inflected/Czech/moře-concordance_preloaded_gutenberg20_c

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: gender
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Fem', 'Masc', 'Neut']
DT best hyper-parameters: {'max_depth': 31, 'max_features': 92}
RF best hyper-parameters: {'max_depth': 22, 'n_estimators': 91}
input_size: 1166
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 51 units at hidden layer 1
adding 17 units at hidden layer 2
adding 51 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: plurality under 4gram-hash-at-both for Czech
use_supplement: True
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 7, 'max_features': 35}
RF best hyper-parameters: {'max_depth': 39, 'n_estimators': 119}
input_size: 1166
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 51 units at hidden layer 1
adding 17 units at hidden layer 2
adding 51 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 28, 'max_features': 103}
RF best hyper-parameters: {'max_depth': 39, 'n_estimators': 94}
input_size: 1166
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 51 units at hidden layer 1
adding 17 units at hidden layer 2
adding 51 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: case under 4gram-hash-at-both for Czech
use_supplement: True
target_attrib: case
indices of offensive rows
66 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc']
DT best hyper-parameters: {'max_depth': 18, 'max_features': 102}
RF best hyper-parameters: {'max_depth': 23, 'n_estimators': 65}
input_size: 1166
output_size: 6
activation_func: tanh
output_activation_func: tanh
adding 51 units at hidden layer 1
adding 17 units at hidden layer 2
adding 51 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.16666666666666666
output_activation_func: softmax
use_supplement: False
target_attrib: case
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc', 'Voc']
DT best hyper-parameters: {'max_depth': 36, 'max_features': 52}
RF best hyper-parameters: {'max_depth': 23, 'n_estimators': 51}
input_size: 1166
output_size: 7
activation_func: tanh
output_activation_func: tanh
adding 51 units at hidden layer 1
adding 17 units at hidden layer 2
adding 51 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
output_activation_func: softmax
threshold set to: 0.14285714285714285
output_activation_func: softmax
term_type: skippy2gram
['Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv',
 'Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv',
 'Inflected/Czech/moře-concordance_preloaded_gutenberg20_cs_20241210060558.csv',
 'Inflected/Czech/muž-concordance_preloaded_gutenberg20_cs_20241210060526.csv',
 'Inflected/Czech/pan-concordance_preloaded_gutenberg20_cs_20241126115401.csv',
 'Inflected/Czech/pes-concordance_preloaded_gutenberg20_cs_20241210060453.csv',
 'Inflected/Czech/voda-concordance_preloaded_gutenberg20_cs_20241210060619.csv']
opening Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv
opening Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv
opening Inflected/Czech/moře-concordance_preloaded_gutenber

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: gender
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Fem', 'Masc', 'Neut']
DT best hyper-parameters: {'max_depth': 31, 'max_features': 83}
RF best hyper-parameters: {'max_depth': 25, 'n_estimators': 46}
input_size: 3123
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 84 units at hidden layer 1
adding 28 units at hidden layer 2
adding 84 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: plurality under skippy2gram-hash-at-both for Czech
use_supplement: True
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 12, 'max_features': 72}
RF best hyper-parameters: {'max_depth': 36, 'n_estimators': 105}
input_size: 3123
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 84 units at hidden layer 1
adding 28 units at hidden layer 2
adding 84 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 21, 'max_features': 93}
RF best hyper-parameters: {'max_depth': 31, 'n_estimators': 59}
input_size: 3123
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 84 units at hidden layer 1
adding 28 units at hidden layer 2
adding 84 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: case under skippy2gram-hash-at-both for Czech
use_supplement: True
target_attrib: case
indices of offensive rows
64 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc']
DT best hyper-parameters: {'max_depth': 22, 'max_features': 58}
RF best hyper-parameters: {'max_depth': 23, 'n_estimators': 77}
input_size: 3123
output_size: 6
activation_func: tanh
output_activation_func: tanh
adding 84 units at hidden layer 1
adding 28 units at hidden layer 2
adding 84 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
output_activation_func: softmax
threshold set to: 0.16666666666666666
output_activation_func: softmax
use_supplement: False
target_attrib: case
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc', 'Voc']
DT best hyper-parameters: {'max_depth': 31, 'max_features': 70}
RF best hyper-parameters: {'max_depth': 19, 'n_estimators': 83}
input_size: 3123
output_size: 7
activation_func: tanh
output_activation_func: tanh
adding 84 units at hidden layer 1
adding 28 units at hidden layer 2
adding 84 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
output_activation_func: softmax
threshold set to: 0.14285714285714285
output_activation_func: softmax
term_type: skippy3gram
['Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv',
 'Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv',
 'Inflected/Czech/moře-concordance_preloaded_gutenberg20_cs_20241210060558.csv',
 'Inflected/Czech/muž-concordance_preloaded_gutenberg20_cs_20241210060526.csv',
 'Inflected/Czech/pan-concordance_preloaded_gutenberg20_cs_20241126115401.csv',
 'Inflected/Czech/pes-concordance_preloaded_gutenberg20_cs_20241210060453.csv',
 'Inflected/Czech/voda-concordance_preloaded_gutenberg20_cs_20241210060619.csv']
opening Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv
opening Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv
opening Inflected/Czech/moře-concordance_preloaded_gutenbe

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


adding 149 units at hidden layer 1
adding 50 units at hidden layer 2
adding 149 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done




[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: gender
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Fem', 'Masc', 'Neut']
DT best hyper-parameters: {'max_depth': 35, 'max_features': 92}
RF best hyper-parameters: {'max_depth': 34, 'n_estimators': 31}
input_size: 9851
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 149 units at hidden layer 1
adding 50 units at hidden layer 2
adding 149 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: plurality under skippy3gram-hash-at-both for Czech
use_supplement: True
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 38, 'max_features': 65}
RF best hyper-parameters: {'max_depth': 29, 'n_estimators': 118}
input_size: 9851
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 149 units at hidden layer 1
adding 50 units at hidden layer 2
adding 149 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 32, 'max_features': 106}
RF best hyper-parameters: {'max_depth': 35, 'n_estimators': 43}
input_size: 9851
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 149 units at hidden layer 1
adding 50 units at hidden layer 2
adding 149 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: case under skippy3gram-hash-at-both for Czech
use_supplement: True
target_attrib: case
indices of offensive rows
73 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc']
DT best hyper-parameters: {'max_depth': 20, 'max_features': 63}
RF best hyper-parameters: {'max_depth': 37, 'n_estimators': 23}
input_size: 9851
output_size: 6
activation_func: tanh
output_activation_func: tanh
adding 149 units at hidden layer 1
adding 50 units at hidden layer 2
adding 149 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
output_activation_func: softmax
threshold set to: 0.16666666666666666
output_activation_func: softmax
use_supplement: False
target_attrib: case
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc', 'Voc']
DT best hyper-parameters: {'max_depth': 25, 'max_features': 63}
RF best hyper-parameters: {'max_depth': 33, 'n_estimators': 72}
input_size: 9851
output_size: 7
activation_func: tanh
output_activation_func: tanh
adding 149 units at hidden layer 1
adding 50 units at hidden layer 2
adding 149 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
output_activation_func: softmax
threshold set to: 0.14285714285714285
output_activation_func: softmax
term_type: skippy4gram
['Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv',
 'Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv',
 'Inflected/Czech/moře-concordance_preloaded_gutenberg20_cs_20241210060558.csv',
 'Inflected/Czech/muž-concordance_preloaded_gutenberg20_cs_20241210060526.csv',
 'Inflected/Czech/pan-concordance_preloaded_gutenberg20_cs_20241126115401.csv',
 'Inflected/Czech/pes-concordance_preloaded_gutenberg20_cs_20241210060453.csv',
 'Inflected/Czech/voda-concordance_preloaded_gutenberg20_cs_20241210060619.csv']
opening Inflected/Czech/kniha-concordance_preloaded_gutenberg20_cs_20241210060141.csv
opening Inflected/Czech/kočka-concordance_preloaded_gutenberg20_cs_20241210060419.csv
opening Inflected/Czech/moře-concordance_preloaded_gutenber

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


adding 171 units at hidden layer 1
adding 57 units at hidden layer 2
adding 171 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done




[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: gender
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Fem', 'Masc', 'Neut']
DT best hyper-parameters: {'max_depth': 20, 'max_features': 93}
RF best hyper-parameters: {'max_depth': 21, 'n_estimators': 41}
input_size: 12923
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 171 units at hidden layer 1
adding 57 units at hidden layer 2
adding 171 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: plurality under skippy4gram-hash-at-both for Czech
use_supplement: True
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 30, 'max_features': 110}
RF best hyper-parameters: {'max_depth': 26, 'n_estimators': 101}
input_size: 12923
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 171 units at hidden layer 1
adding 57 units at hidden layer 2
adding 171 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
use_supplement: False
target_attrib: plurality
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Sg', 'Pl', 'Inv']
DT best hyper-parameters: {'max_depth': 26, 'max_features': 63}
RF best hyper-parameters: {'max_depth': 34, 'n_estimators': 27}
input_size: 12923
output_size: 3
activation_func: tanh
output_activation_func: tanh
adding 171 units at hidden layer 1
adding 57 units at hidden layer 2
adding 171 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
output_activation_func: softmax
threshold set to: 0.3333333333333333
output_activation_func: softmax
attribute: case under skippy4gram-hash-at-both for Czech
use_supplement: True
target_attrib: case
indices of offensive rows
73 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc']
DT best hyper-parameters: {'max_depth': 17, 'max_features': 51}
RF best hyper-parameters: {'max_depth': 24, 'n_estimators': 29}
input_size: 12923
output_size: 6
activation_func: tanh
output_activation_func: tanh
adding 171 units at hidden layer 1
adding 57 units at hidden layer 2
adding 171 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
output_activation_func: softmax
threshold set to: 0.16666666666666666
output_activation_func: softmax
use_supplement: False
target_attrib: case
indices of offensive rows
0 rows are offensive and removed
test_size_rate [cross validation]: 0.1
labels to use: ['Nom', 'Acc', 'Dat', 'Gen', 'Instr', 'Loc', 'Voc']
DT best hyper-parameters: {'max_depth': 39, 'max_features': 81}
RF best hyper-parameters: {'max_depth': 28, 'n_estimators': 100}
input_size: 12923
output_size: 7
activation_func: tanh
output_activation_func: tanh
adding 171 units at hidden layer 1
adding 57 units at hidden layer 2
adding 171 units at hidden layer 3
output_activation_func is reset to: softmax
compilation NN model done


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
output_activation_func: softmax
threshold set to: 0.14285714285714285
output_activation_func: softmax
all analyses done
