In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from Tree import Tree
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
# Load the raw dog registry, keep only the columns analysed below and give
# them English names.
df = pd.read_csv('dogs.csv')

# Original (German) column name -> English name used in the rest of the notebook.
mapper = {
    'ALTER': 'owner age',
    'STADTKREIS': 'district',
    'STADTQUARTIER': 'quartier',
    'GESCHLECHT': 'owner sex',
    'RASSE1': 'breed',
    'GEBURTSJAHR_HUND': 'dog age',
    'GESCHLECHT_HUND': 'dog sex',
    'HUNDEFARBE': 'fur',
}

# Columns that are not used as features or targets.
unused_columns = [
    'HALTER_ID',
    'RASSE1_MISCHLING',
    'RASSE2',
    'RASSE2_MISCHLING',
    'RASSENTYP',
]

# Order matters: the unused columns are removed *before* dropna() so that
# missing values in them do not cause rows to be discarded.
df = df.drop(columns=unused_columns).dropna().rename(columns=mapper)

# The column holds the dog's birth year; turn it into an age relative to 2017.
df['dog age'] = 2017 - df['dog age']


def map_age(x):
    """Turn an owner-age value into an integer: first character as tens digit, plus 5.

    Presumably the raw values are bracket strings such as '61-70' (-> 65);
    only the first character is read -- TODO confirm against the raw data.
    """
    return int(x[0]) * 10 + 5


def map_to_str(x):
    """Format a numeric code as a string label prefixed with 'd' (e.g. 9 -> 'd9')."""
    return 'd' + str(int(x))


df['owner age'] = df['owner age'].map(map_age)
# District / quartier codes become string labels rather than numbers.
df['district'] = df['district'].map(map_to_str)
df['quartier'] = df['quartier'].map(map_to_str)
df.head()

Unnamed: 0,owner age,owner sex,district,quartier,breed,dog age,dog sex,fur
0,65,m,d9,d92,Welsh Terrier,6,w,schwarz/braun
1,65,m,d3,d31,Berner Sennenhund,8,m,tricolor
2,65,w,d2,d23,Mischling gross,4,w,schwarz
3,45,m,d6,d63,Labrador Retriever,5,w,braun
4,65,w,d7,d71,Mittelschnauzer,7,w,schwarz


In [6]:
# For every column in turn, fit a Tree that predicts that column ('target')
# from the remaining columns on an 80% training split, then measure accuracy
# on the held-out 20% -- overall and per individual target value.
#
# Results:
#   trees[col]            -> fitted Tree for that target column
#   records[col][target]  -> {"num": test rows with that value, "accuracy": share predicted correctly}

SPLIT_SEED = 0          # fixed seed so the train/test split is the same on every re-run
MIN_TEST_SAMPLES = 5    # per-target accuracy is only reported with at least this many test rows

# When one of these columns is the target, the paired column is withheld from
# the features. Presumably this avoids leakage: in df.head() the quartier
# label starts with the district label (d9/d92, d3/d31) -- confirm.
WITHHELD_FEATURE = {'district': 'quartier', 'quartier': 'district'}

train = df.sample(frac=0.8, random_state=SPLIT_SEED)
test = df.drop(train.index)
records = {}
trees = {}
for col in df.columns:
    # Distinct values of the column, ordered by frequency in the full data.
    targets = df[col].value_counts().keys()
    num_targets = len(targets)
    train_copy = train.rename(columns={col: 'target'})
    test_copy = test.rename(columns={col: 'target'})
    if col in WITHHELD_FEATURE:
        train_copy = train_copy.drop(columns=WITHHELD_FEATURE[col])
        test_copy = test_copy.drop(columns=WITHHELD_FEATURE[col])

    tree = Tree(train_copy)
    predictions = test_copy.apply(tree.classify, axis=1)
    correct = predictions == test_copy.target
    # mean() of a boolean Series is the share of True values; unlike
    # value_counts()[True] it does not raise when nothing was predicted correctly.
    accuracy = float(correct.mean())
    print(f"{col}: {accuracy} on {num_targets} targets")
    records[col] = {}
    trees[col] = tree

    for target in targets:
        test_target = test_copy[test_copy.target == target]
        if len(test_target) < MIN_TEST_SAMPLES:
            # Skip only this value. Counts in the test split are not
            # guaranteed to follow the full-data frequency order, so later
            # values may still have enough rows.
            continue
        predictions_target = predictions[test_target.index]
        correct = predictions_target == test_target.target
        accuracy = float(correct.mean())
        print(f"    {target}: {accuracy}")
        records[col][target] = {
            "num": len(test_target),
            "accuracy": accuracy,
        }

        

owner age: 0.2124388539482879 on 9 targets
    55: 0.290625
    45: 0.2271186440677966
    35: 0.22556390977443608
    65: 0.16666666666666666
    25: 0.2127659574468085
    75: 0.10135135135135136
    85: 0.10526315789473684
    15: 0
owner sex: 0.6443046820405312 on 2 targets
    w: 0.7675568743818002
    m: 0.3476190476190476
district: 0.13277428371767994 on 12 targets
    d11: 0.23387096774193547
    d7: 0.2843601895734597
    d9: 0.08536585365853659
    d10: 0.0873015873015873
    d2: 0.08461538461538462
    d3: 0.06818181818181818
    d12: 0.08
    d6: 0.10101010101010101
    d8: 0.02564102564102564
    d4: 0.05
    d5: 0.07692307692307693
    d1: 0
quartier: 0.05520614954577219 on 36 targets
    d92: 0.06481481481481481
    d101: 0.06521739130434782
    d111: 0.09090909090909091
    d119: 0.06315789473684211
    d74: 0.1625
    d115: 0
    d91: 0.017857142857142856
    d21: 0.029411764705882353
    d61: 0.03773584905660377
    d72: 0.14814814814814814
    d31: 0.0377358490566037

In [7]:
import json

# Persist the per-column / per-target accuracy records as JSON so they can be
# inspected outside this notebook.
with open('dogs_results.json', 'w') as f:
    f.write(json.dumps(records))