In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from Tree import Tree
import matplotlib.pyplot as plt
from tqdm import tqdm
from RandomPredictor import RandomPredictor

In [3]:
df = pd.read_csv('dogs.csv')
mapper = {
    'ALTER': 'owner age',
    'STADTKREIS': 'district',
    'STADTQUARTIER': 'quartier',
    'GESCHLECHT': 'owner sex',
    'RASSE1': 'breed',
    'GEBURTSJAHR_HUND': 'dog age',
    'GESCHLECHT_HUND': 'dog sex',
    'HUNDEFARBE': 'fur'
}
df.drop('HALTER_ID', axis=1, inplace=True)
df.drop('RASSE1_MISCHLING', axis=1, inplace=True)
df.drop('RASSE2', axis=1, inplace=True)
df.drop('RASSE2_MISCHLING', axis=1, inplace=True)
df.drop('RASSENTYP', axis=1, inplace=True)
df.dropna(inplace=True)
df.rename(mapper, axis=1, inplace=True)
df['dog age'] = 2017 - df['dog age']
map_age = lambda x: int(x[0])*10+5
df['owner age'] = df['owner age'].apply(map_age)
map_to_str = lambda x: 'd'+str(int(x))
df.district = df.district.apply(map_to_str)
df.quartier = df.quartier.apply(map_to_str)
df.head()

Unnamed: 0,owner age,owner sex,district,quartier,breed,dog age,dog sex,fur
0,65,m,d9,d92,Welsh Terrier,6,w,schwarz/braun
1,65,m,d3,d31,Berner Sennenhund,8,m,tricolor
2,65,w,d2,d23,Mischling gross,4,w,schwarz
3,45,m,d6,d63,Labrador Retriever,5,w,braun
4,65,w,d7,d71,Mittelschnauzer,7,w,schwarz


In [5]:
train = df.sample(frac=0.8)
test = df.drop(train.index)
records = {}
trees = {}
for col in df.columns:
    targets = df[col].value_counts().keys()
    num_targets = len(targets)
    train_copy = train.rename(columns={col:'target'})
    test_copy  =  test.rename(columns={col:'target'})
    if col == 'district':
        train_copy.drop('quartier', axis=1, inplace=True)
        test_copy.drop('quartier', axis=1, inplace=True)
    if col == 'quartier':
        train_copy.drop('district', axis=1, inplace=True)
        test_copy.drop('district', axis=1, inplace=True)
        
    # TREE CLASSIFICATION
    tree = Tree(train_copy)
    rand = RandomPredictor(train_copy)
    t_predictions = test_copy.apply(tree.classify, axis=1)
    r_predictions = test_copy.apply(rand.classify, axis=1)
    t_correct = t_predictions == test_copy.target
    r_correct = r_predictions == test_copy.target
    
    t_accuracy = t_correct.value_counts()[True]/len(t_correct)
    r_accuracy = r_correct.value_counts()[True]/len(r_correct)
    print(f"{col}: {t_accuracy} vs {r_accuracy} on {num_targets} targets")
    records[col] = {}
    trees[col] = tree
    for target in targets:
        test_target = test_copy[test_copy.target==target]
        if len(test_target) < 5:
            break
        t_predictions_target = t_predictions[test_target.index]
        t_correct = t_predictions_target == test_target.target
        r_predictions_target = r_predictions[test_target.index]
        r_correct = r_predictions_target == test_target.target
        tvc = t_correct.value_counts()
        rvc = r_correct.value_counts()
        if True in tvc:
            t_accuracy = tvc[True]/len(t_correct)
        else:
            t_accuracy = 0
        if True in rvc:
            r_accuracy = rvc[True]/len(r_correct)
        else:
            r_accuracy = 0
        print(f"    {target}: {t_accuracy} vs {r_accuracy}")
        records[col][target] = {
            "num": len(test_target),
            "t_accuracy": t_accuracy,
            "r_accuracy": r_accuracy
        }

        

owner age: 0.20055904961565338 vs 0.1663172606568833 on 9 targets
    55: 0.29333333333333333 vs 0.20666666666666667
    45: 0.20065789473684212 vs 0.20723684210526316
    35: 0.25 vs 0.20149253731343283
    65: 0.13658536585365855 vs 0.15121951219512195
    25: 0.18633540372670807 vs 0.11180124223602485
    75: 0.08029197080291971 vs 0.072992700729927
    85: 0.05405405405405406 vs 0
    15: 0 vs 0
owner sex: 0.6142557651991615 vs 0.5709294199860238 on 2 targets
    w: 0.7457795431976166 vs 0.6752730883813307
    m: 0.3018867924528302 vs 0.3231132075471698
district: 0.1278825995807128 vs 0.10412299091544375 on 12 targets
    d11: 0.21862348178137653 vs 0.1700404858299595
    d7: 0.21782178217821782 vs 0.13366336633663367
    d9: 0.12962962962962962 vs 0.12345679012345678
    d10: 0.08759124087591241 vs 0.08029197080291971
    d2: 0.10434782608695652 vs 0.09565217391304348
    d3: 0.03816793893129771 vs 0.0916030534351145
    d12: 0.07547169811320754 vs 0.05660377358490566
    d6: 0.07

In [6]:
import json
with open('dogs_results.json', 'w') as f:
    json.dump(records, f)