# Top10
#### This notebook processes the top10 representations of the dataset as provided.
#### It then trains several classifiers and reports their accuracy on the dev data.

In [1]:
import pandas as pd

# Loads the training split with explicit column names:
# "uid", then the 30 word columns labelled 0-29, then "age".
names = ["uid"] + list(range(30)) + ["age"]

filename = "COMP30027_2018S1_proj2-data/train_top10.csv"
# header=None spells out what pandas already does when names is supplied:
# the first line of the file is read as data, not as a header.
train = pd.read_csv(filename, header=None, names=names)
train

Unnamed: 0,uid,0,1,2,3,4,5,6,7,8,...,21,22,23,24,25,26,27,28,29,age
0,11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,24-26
1,12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24-26
2,13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,24-26
3,14,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,24-26
4,15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24-26
5,16,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,24-26
6,17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,24-26
7,18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,24-26
8,19,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,24-26
9,110,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,24-26


In [2]:
# Loads the dev split, reusing the column names built for the train split
filename = "COMP30027_2018S1_proj2-data/dev_top10.csv"
# header=None is the behaviour pandas applies anyway when names is supplied
dev = pd.read_csv(filename, header=None, names=names)
dev

Unnamed: 0,uid,0,1,2,3,4,5,6,7,8,...,21,22,23,24,25,26,27,28,29,age
0,21,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,24-26
1,22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,6,0,0,24-26
2,23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24-26
3,24,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,24-26
4,25,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24-26
5,26,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,24-26
6,27,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,1,24-26
7,28,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,24-26
8,29,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,24-26
9,210,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24-26


In [3]:
def get_bucket(age):
    """Maps an age-range label to its integer age bucket.

    "14-16" -> 0, "24-26" -> 1, "34-36" -> 2, "44-46" -> 3 and "?" -> 4.
    Any other value matches none of the labels, in which case None is returned.
    """
    # The position of a label in this tuple is its bucket number
    labels = ("14-16", "24-26", "34-36", "44-46", "?")
    for bucket, label in enumerate(labels):
        if age == label:
            return bucket
    return None

In [4]:
# Replaces the age-range string with its integer age bucket in both datasets.
# DataFrame.get_value / set_value were removed in pandas 1.0, so the column is
# converted in one pass with Series.map instead of a row-by-row loop.
# A label get_bucket does not recognise becomes a missing value and would
# show up as such in the printed set.
train["age"] = train["age"].map(get_bucket)
s = set(train["age"].tolist())
print(s) # Display buckets assigned

dev["age"] = dev["age"].map(get_bucket)
s = set(dev["age"].tolist())
print(s)

{0, 1, 2, 3}
{0, 1, 2, 3, 4}


In [5]:
# Separates each dataset into a feature frame (X) and a label list (y) for the sklearn classifiers
x_labels = list(range(30))  # column labels of the 30 word features

X_train, X_dev = train[x_labels], dev[x_labels]
# Labels are passed to the classifiers as plain lists rather than pandas Series
y_train, y_dev = list(train["age"]), list(dev["age"])

In [7]:
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier

from sklearn.metrics import confusion_matrix

In [8]:
def print_accuracy(y, predictions):
    """Prints accuracy statistics for class predictions made, given actual (y) classes.

    Args:
        y: sized iterable of the actual class labels.
        predictions: sized iterable of predicted labels, in the same order as y.

    Raises:
        AssertionError: if y and predictions differ in length.

    Prints the number of correct predictions out of the total, then the
    accuracy as a percentage. For empty input the accuracy is undefined, so
    it is reported as "n/a" instead of dividing by zero.
    """
    total = len(y)
    assert total == len(predictions), "y and predictions must have the same length"
    # zip pairs the values by position without indexing, so inputs that do not
    # support 0-based integer lookups (e.g. a Series with a custom index) work too.
    correct = sum(1 for actual, predicted in zip(y, predictions) if actual == predicted)
    print("correct: {}/{}".format(correct, total))
    if total == 0:
        print("accuracy: n/a (no samples)")
        return
    print("accuracy: {:.2%}".format(correct / total))

In [9]:
# Zero-R baseline: always predicts the most frequent class seen in training
zero_r = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)  # fit() returns the estimator itself
zero_r_predictions = zero_r.predict(X_dev)
print_accuracy(y_dev, zero_r_predictions)

correct: 17298/45332
accuracy: 38.16%


In [10]:
# Decision Tree Classifier
# random_state fixes the tree's internal randomness (features are permuted at
# each split, which decides ties between equally good splits), so re-running
# this cell reproduces the same accuracy instead of drifting by a few posts.
dt = DecisionTreeClassifier(criterion='entropy', random_state=0)  # Entropy performs slightly better
dt.fit(X_train, y_train)
dt_predictions = dt.predict(X_dev)
print_accuracy(y_dev, dt_predictions)

correct: 19645/45332
accuracy: 43.34%


In [11]:
# Ridge classifier with default settings, scored on the dev data
r = RidgeClassifier().fit(X_train, y_train)  # fit() returns the estimator itself
r_predictions = r.predict(X_dev)
print_accuracy(y_dev, r_predictions)

correct: 18944/45332
accuracy: 41.79%


In [12]:
# SGD Classifier
# SGD shuffles the training data while fitting; random_state pins that
# shuffling so the reported accuracy is reproducible across runs.
s = SGDClassifier(random_state=0)
s.fit(X_train, y_train)
s_predictions = s.predict(X_dev)
print_accuracy(y_dev, s_predictions)

correct: 18956/45332
accuracy: 41.82%


In [13]:
# Confusion matrix of the decision tree on the dev data, over buckets 0-4
# (rows: actual bucket, columns: predicted bucket)
print(confusion_matrix(y_true=y_dev, y_pred=dt_predictions, labels=list(range(5))))

[[ 4372  8707    14     7     0]
 [ 1983 15266    40     9     0]
 [  132  2445     7     0     0]
 [   33   517     1     0     0]
 [ 2039  9730    25     5     0]]


In [14]:
# Counts the number of training posts per author age bucket
BUCKETS = 4  # buckets 0-3; the "?" bucket (4) does not occur in the training data
# One vectorised comparison per bucket replaces the row-by-row .at lookups.
# Labels outside 0..BUCKETS-1 are simply not counted (the loop raised on them).
count = [int((train["age"] == bucket).sum()) for bucket in range(BUCKETS)]
print(count)

[98454, 141104, 30347, 6510]


In [18]:
# Discards posts so as to have an equal number of posts for each age bucket:
# every bucket is cut down to the size of the smallest one.
limit = min(count)
# groupby(...).head(limit) keeps the first `limit` rows of each bucket, in their
# original order and with their original index labels. These are the same rows
# the row-by-row loop selected, without building one Series per row, and the
# column dtypes of `train` are preserved.
new_train = train.groupby("age").head(limit)

In [19]:
# Splits the balanced training set into features and labels
X_new_train = new_train[x_labels]
y_new_train = list(new_train["age"])  # labels as a plain list, like y_train

# Old decision tree (trained on the full training set), shown again for comparison
print_accuracy(y_dev, dt_predictions)

# New decision tree trained on the balanced training set.
# random_state fixes the tree's internal randomness so that the comparison
# with the old tree is reproducible from run to run.
dt_new = DecisionTreeClassifier(criterion='entropy', random_state=0)
dt_new.fit(X_new_train, y_new_train)
dt_new_predictions = dt_new.predict(X_dev)
print_accuracy(y_dev, dt_new_predictions)

correct: 19646/45332
accuracy: 43.34%
correct: 12463/45332
accuracy: 27.49%
