# Top10
#### This notebook processes the top10 representations of the dataset as provided.
#### Tests several classifiers with accuracy over dev data.

In [1]:
import pandas as pd

# Reads train dataset
names = ["uid"] + [i for i in range(30)] + ["age"]

filename = "COMP30027_2018S1_proj2-data/train_top10.csv"
train = pd.read_csv(filename, names=names)
train

Unnamed: 0,uid,0,1,2,3,4,5,6,7,8,...,21,22,23,24,25,26,27,28,29,age
0,11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,24-26
1,12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24-26
2,13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,24-26
3,14,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,24-26
4,15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24-26
5,16,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,24-26
6,17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,24-26
7,18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,24-26
8,19,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,24-26
9,110,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,24-26


In [2]:
# Reads dev dataset
filename = "COMP30027_2018S1_proj2-data/dev_top10.csv"
dev = pd.read_csv(filename, names=names) # Same names as above
dev

Unnamed: 0,uid,0,1,2,3,4,5,6,7,8,...,21,22,23,24,25,26,27,28,29,age
0,21,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,24-26
1,22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,6,0,0,24-26
2,23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24-26
3,24,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,24-26
4,25,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24-26
5,26,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,24-26
6,27,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,1,24-26
7,28,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,24-26
8,29,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,24-26
9,210,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24-26


In [3]:
def get_bucket(age):
    """Accepts an age range string and returns its age bucket as an integer between 0 and 4 (inclusive)"""
    if age == "14-16":
        return 0
    if age == "24-26":
        return 1
    if age == "34-36":
        return 2
    if age == "44-46":
        return 3
    if age == "?":
        return 4

In [4]:
# Replaces string with integer age bucket in both datasets
s = set()
for i in range(len(train)):
    bucket = get_bucket(train.get_value(i, "age"))
    s.add(bucket)
    train.set_value(i, "age", bucket)
print(s) # Display buckets assigned

s = set()
for i in range(len(dev)):
    bucket = get_bucket(dev.get_value(i, "age"))
    s.add(bucket)
    dev.set_value(i, "age", bucket)
print(s)

{0, 1, 2, 3}
{0, 1, 2, 3, 4}


In [5]:
# Splits datasets into X and y to use with sklearn classifiers
x_labels = [i for i in range(30)]
X_train = train[x_labels]
y_train = train["age"]
X_dev = dev[x_labels]
y_dev = dev["age"]

In [15]:
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import BayesianRidge, Ridge

In [20]:
def print_accuracy(y, predictions, rd=False):
    assert(len(y)==len(predictions))
    correct = 0
    for i in range(len(y)):
        if y[i] == (predictions[i].round() if rd else predictions[i]):
            correct += 1
    print("correct: {}/{}".format(correct, len(y)))
    print("accuracy: {:.2%}".format(correct/len(y)))

In [21]:
# Zero-r baseline classifier
zero_r = DummyClassifier(strategy="most_frequent")
zero_r.fit(X_train, y_train)
predictions = zero_r.predict(X_dev)
print_accuracy(y_dev, predictions)

correct: 17298/45332
accuracy: 38.16%


In [25]:
# Decision Tree Classifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
predictions = dt.predict(X_dev)
print_accuracy()

ValueError: Unknown label type: 'unknown'

In [24]:
# Ridge Classifier
r = Ridge()
r.fit(X_train, y_train)
predictions = r.predict(X_dev)
print_accuracy(y_dev, predictions, rd=True)

correct: 18138/45332
accuracy: 40.01%


In [31]:
from sklearn.utils.multiclass import check_classification_targets as cct
cct(y_train)

ValueError: Unknown label type: 'unknown'