# COMP30027 Machine Learning Project 2

## Imports

In [1]:
import math
import numpy as np
import pandas as pd

from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

## Load Datasets

In [2]:
train_raw = pd.read_csv("./Data/train_raw.csv", header=None, na_values='?', keep_default_na=False, usecols=[0, 2, 6])
train_top10 = pd.read_csv("./Data/train_top10.csv", header=None, na_values='?', keep_default_na=False)

dev_raw = pd.read_csv("./Data/dev_raw.csv", header=None, na_values='?', keep_default_na=False, usecols=[0, 2, 6])
dev_top10 = pd.read_csv("./Data/dev_top10.csv", header=None, na_values='?', keep_default_na=False)

test_raw = pd.read_csv("./Data/test_raw.csv", header=None, na_values='?', keep_default_na=False, usecols=[0, 2, 6])
test_top10 = pd.read_csv("./Data/test_top10.csv", header=None, na_values='?', keep_default_na=False)

## Train (Initial Test)
### k-NN

In [3]:
cf1 = KNeighborsClassifier(int(math.sqrt(train_top10.size)), "distance", n_jobs=-1)
cf1.fit(train_top10.values[:, 1:31], train_top10.values[:, 31])
t = cf1.predict(dev_top10.values[:, 1:31]) == dev_top10.values[:, 31]
t = np.bincount(t)
s = t[1] / (t[0] + t[1])
print(f"score: {s}")

score: 0.43027000794141


### Gaussian Naive Bayes

In [4]:
cf2 = GaussianNB()
cf2.fit(train_top10.values[:, 1:31], train_top10.values[:, 31])
t = cf2.predict(dev_top10.values[:, 1:31]) == dev_top10.values[:, 31]
t = np.bincount(t)
s = t[1] / (t[0] + t[1])
print(f"score: {s}")

score: 0.41601958881143564


### Random Decision Forest Classifier

In [5]:
cf3 = RandomForestClassifier()
cf3.fit(train_top10.values[:, 1:31], train_top10.values[:, 31])
t = cf3.predict(dev_top10.values[:, 1:31]) == dev_top10.values[:, 31]
t = np.bincount(t)
s = t[1] / (t[0] + t[1])
print(f"score: {s}")

score: 0.43384364246007234


### Multilayer Perceptron Classifier

In [6]:
cf4 = MLPClassifier()
cf4.fit(train_top10.values[:, 1:31], train_top10.values[:, 31])
t = cf4.predict(dev_top10.values[:, 1:31]) == dev_top10.values[:, 31]
t = np.bincount(t)
s = t[1] / (t[0] + t[1])
print(f"score: {s}")

score: 0.434924556604606


### Adaptive Boost Classifier with Decision Tree _(default)_

In [7]:
cf5 = AdaBoostClassifier(n_estimators=100)
cf5.fit(train_top10.values[:, 1:31], train_top10.values[:, 31])
t = cf5.predict(dev_top10.values[:, 1:31]) == dev_top10.values[:, 31]
t = np.bincount(t)
s = t[1] / (t[0] + t[1])
print(f"score: {s}")

score: 0.433667166681373


## Train (Combine UID)

First combine all UIDs

In [8]:
def unique(top10, raw):
    b = top10.copy()
    b[0] = raw[0]
    a = b.drop_duplicates(0)
    a.update(b.iloc[:, :31].groupby(0).mean())
    return a

a = unique(train_top10, train_raw)
b = unique(dev_top10, dev_raw)
c = unique(test_top10, test_raw)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[col] = expressions.where(mask, this, that)


### k-NN

In [9]:
cf1 = KNeighborsClassifier(int(math.sqrt(train_top10.size)), "distance", n_jobs=-1)
cf1.fit(a.values[:, 1:31], a.values[:, 31])

mapping = pd.Series(cf1.predict(b.values[:, 1:31]), b.iloc[:, 0])
predict_dev_top10 = dev_top10.copy()
predict_dev_top10[0] = dev_raw[0]

p = predict_dev_top10[[0, 31]]
def rc(row):
    row[31] = mapping[row[0]]
    return row
p = p.apply(rc, 1, raw=True)
p[0] = dev_top10[0]
predict_dev_top10.update(p)
re = (predict_dev_top10[31] == dev_top10[31]).value_counts(True)
print(re)

False    0.581267
True     0.418733
Name: 31, dtype: float64


### Gradient Boosting Classifier

In [10]:
cf2 = GradientBoostingClassifier(n_estimators=1000)
cf2.fit(a.values[:, 1:31], a.values[:, 31])

mapping = pd.Series(cf2.predict(b.values[:, 1:31]), b.iloc[:, 0])
predict_dev_top10 = dev_top10.copy()
predict_dev_top10[0] = dev_raw[0]

p = predict_dev_top10[[0, 31]]
def rc(row):
    row[31] = mapping[row[0]]
    return row
p = p.apply(rc, 1, raw=True)
p[0] = dev_top10[0]
predict_dev_top10.update(p)
re = (predict_dev_top10[31] == dev_top10[31]).value_counts(True)
print(re)

False    0.58601
True     0.41399
Name: 31, dtype: float64


### Multilayer Perceptron Classifier

In [11]:
cf3 = MLPClassifier()
cf3.fit(a.values[:, 1:31], a.values[:, 31])

mapping = pd.Series(cf3.predict(b.values[:, 1:31]), b.iloc[:, 0])
predict_dev_top10 = dev_top10.copy()
predict_dev_top10[0] = dev_raw[0]

p = predict_dev_top10[[0, 31]]
def rc(row):
    row[31] = mapping[row[0]]
    return row
p = p.apply(rc, 1, raw=True)
p[0] = dev_top10[0]
predict_dev_top10.update(p)
re = (predict_dev_top10[31] == dev_top10[31]).value_counts(True)
print(re)

False    0.578267
True     0.421733
Name: 31, dtype: float64


### Adaptive Boost Classifier with Decision Tree _(default)_

In [12]:
cf4 = AdaBoostClassifier(n_estimators=100)
cf4.fit(a.values[:, 1:31], a.values[:, 31])

mapping = pd.Series(cf4.predict(b.values[:, 1:31]), b.iloc[:, 0])
predict_dev_top10 = dev_top10.copy()
predict_dev_top10[0] = dev_raw[0]

p = predict_dev_top10[[0, 31]]
def rc(row):
    row[31] = mapping[row[0]]
    return row
p = p.apply(rc, 1, raw=True)
p[0] = dev_top10[0]
predict_dev_top10.update(p)
re = (predict_dev_top10[31] == dev_top10[31]).value_counts(True)
print(re)

False    0.588083
True     0.411917
Name: 31, dtype: float64


### Logistic Regression

In [13]:
cf5 = LogisticRegression()
cf5.fit(a.values[:, 1:31], a.values[:, 31])

mapping = pd.Series(cf5.predict(b.values[:, 1:31]), b.iloc[:, 0])
predict_dev_top10 = dev_top10.copy()
predict_dev_top10[0] = dev_raw[0]

p = predict_dev_top10[[0, 31]]
def rc(row):
    row[31] = mapping[row[0]]
    return row
p = p.apply(rc, 1, raw=True)
p[0] = dev_top10[0]
predict_dev_top10.update(p)
re = (predict_dev_top10[31] == dev_top10[31]).value_counts(True)
print(re)

False    0.573458
True     0.426542
Name: 31, dtype: float64


### Ensembled _(Stacking)_

In [14]:
enc = LabelEncoder()
enc.fit(a.values[:, 31])

na = pd.DataFrame()
na[0] = a[0]
na[1] = enc.transform(cf1.predict(a.values[:, 1:31]))
na[2] = enc.transform(cf3.predict(a.values[:, 1:31]))
na[3] = enc.transform(cf5.predict(a.values[:, 1:31]))
na[4] = a[31]
nb = pd.DataFrame()
nb[0] = b[0]
nb[1] = enc.transform(cf1.predict(b.values[:, 1:31]))
nb[2] = enc.transform(cf3.predict(b.values[:, 1:31]))
nb[3] = enc.transform(cf5.predict(b.values[:, 1:31]))
nb[4] = b[31]
nc = pd.DataFrame()
nc[0] = c[0]
nc[1] = enc.transform(cf1.predict(c.values[:, 1:31]))
nc[2] = enc.transform(cf3.predict(c.values[:, 1:31]))
nc[3] = enc.transform(cf5.predict(c.values[:, 1:31]))
nc[4] = c[31]

ecf = AdaBoostClassifier(n_estimators=100)
ecf.fit(na.values[:, 1:4], na.values[:, 4])

mapping = pd.Series(ecf.predict(nb.values[:, 1:4]), nb.iloc[:, 0])
t = dev_top10.copy()
t[0] = dev_raw[0]

p = t[[0, 31]]
def rc(row):
    row[31] = mapping[row[0]]
    return row
p = p.apply(rc, 1, raw=True)
p[0] = dev_top10[0]
t.update(p)
re = (t[31] == dev_top10[31]).value_counts(True)
print(re)

mapping = pd.Series(ecf.predict(nc.values[:, 1:4]), nc.iloc[:, 0])
t = test_top10.copy()
t[0] = test_raw[0]

p = t[[0, 31]]
def rc(row):
    row[31] = mapping[row[0]]
    return row
p = p.apply(rc, 1, raw=True)
p[0] = test_top10[0]
t.update(p)
re = t[[0, 31]]
re.columns = ["Id", "Prediction"]
re.to_csv("./Data/temp.csv", index=False)

False    0.581267
True     0.418733
Name: 31, dtype: float64


### Group by Age Groups

In [15]:
gd = train_top10.copy()
gd[0] = train_raw[0]
ga = gd.iloc[:, :31].groupby(0).sum()
ga = ga.div(ga.sum(axis=1), 0).fillna(0)
gad = gd.drop_duplicates(0)[[0, 31]]
tcf = MultinomialNB()
tcf.fit(ga.values, gad[31].values)

d = dev_top10.copy()
d[0] = dev_raw[0]
a = d.iloc[:, :31].groupby(0).sum()
a = a.div(a.sum(axis=1), 0).fillna(0)
ad = d.drop_duplicates(0)[[0, 31]]
mapping = pd.Series(tcf.predict(a.values), ad[0])
predict_dev_top10 = dev_top10.copy()
predict_dev_top10[0] = dev_raw[0]

p = predict_dev_top10[[0, 31]]
def rc(row):
    row[31] = mapping[row[0]]
    return row
p = p.apply(rc, 1, raw=True)
p[0] = dev_top10[0]
predict_dev_top10.update(p)
re = (predict_dev_top10[31] == dev_top10[31]).value_counts(True)
print(re)

True     0.521949
False    0.478051
Name: 31, dtype: float64


## Raw (TF-IDF Vectoriser)

In [16]:
tv = TfidfVectorizer(sublinear_tf=True)
rcf = SGDClassifier(random_state=41, max_iter=1000, tol=None, n_jobs=-1)

groups = {
    14: "14-16", 15: "14-16", 16: "14-16",
    24: "24-26", 25: "24-26", 26: "24-26",
    34: "34-36", 35: "34-36", 36: "34-36",
    44: "44-46", 45: "44-46", 46: "44-46"
}

### Not Combined

In [17]:
tfid = tv.fit_transform(train_raw[6].values)

rcf.fit(tfid, train_raw[2].apply(lambda x: groups.get(x, '?')).values)
print((rcf.predict(tfid) == dev_raw[31]).mean())

dfid = tv.transform(dev_raw[6].values)
print((pd.Series(rcf.predict(dfid), dev_top10.index).apply(lambda x: groups.get(x, '?')) == dev_top10[31]).mean())

0.7144800390716857
0.48246271949174974


In [None]:
tfid = tv.fit_transform(train_raw[6].values)

rcf.fit(tfid, train_raw[2].apply(.values)
print((rcf.predict(tfid) == train_raw[2]).mean())

dfid = tv.transform(dev_raw[6].values)
print((pd.Series(rcf.predict(dfid), dev_top10.index).apply(lambda x: groups.get(x, '?')) == dev_top10[31]).mean())

### Combined

In [18]:
ut = train_raw.drop_duplicates(0)
t = train_raw.groupby(0)[6].apply(lambda x: ' '.join(x))
tfid = tv.fit_transform(t)

rcf.fit(tfid, ut[2])
print((rcf.predict(tfid) == ut[2]).mean())

ud = dev_raw[[0, 2, 6]].drop_duplicates(0)
d = dev_raw.groupby(0)[6].apply(lambda x: ' '.join(x))
dfid = tv.transform(d)

p = pd.Series(rcf.predict(dfid), ud[0]).apply(lambda x: groups.get(x, '?'))

d = dev_raw[[0, 2]]
def rc(row):
    row[2] = p[row[0]]
    return row
d = d.apply(rc, 1)
print((d[2] == dev_top10[31]).mean())
ud = test_raw[[0, 2, 6]].drop_duplicates(0)
d = test_raw.groupby(0)[6].apply(lambda x: ' '.join(x))
dfid = tv.transform(d)

p = pd.Series(rcf.predict(dfid), ud[0]).apply(lambda x: groups.get(x, '?'))

d = test_raw[[0, 2]]
def rc(row):
    row[2] = p[row[0]]
    return row
d = d.apply(rc, 1)
d[0] = test_top10[0]
d.columns = ["Id", "Prediction"]
d.to_csv("./Data/temp.csv", index=False)

0.9943016759776536
0.6191211506220771


In [19]:
ut = train_raw.drop_duplicates(0)
t = train_raw.groupby(0)[6].apply(lambda x: ' '.join(x))
tfid = tv.fit_transform(t)

ut[2] = ut[2].apply(lambda x: groups.get(x, '?'))
rcf.fit(tfid, ut[2])
print((rcf.predict(tfid) == ut[2]).mean())

ud = dev_raw[[0, 2, 6]].drop_duplicates(0)
d = dev_raw.groupby(0)[6].apply(lambda x: ' '.join(x))
dfid = tv.transform(d)

p = pd.Series(rcf.predict(dfid), ud[0])

d = dev_raw[[0, 2]]
def rc(row):
    row[2] = p[row[0]]
    return row
d = d.apply(rc, 1)
print((d[2] == dev_top10[31]).mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


0.981340782122905
0.6412688608488485
