In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Render floats with two decimals wherever pandas displays a frame or series.
pd.set_option('display.float_format', '{:.2f}'.format)

# Load the raw training table; a literal '?' in any field is read as missing (NaN).
all_train_data = pd.read_csv(
    'Dataset/TrainOnMe.csv',
    sep=',',
    na_values='?',
)

In [None]:
num_cols = ['x1', 'x2', 'x3', 'x4', 'x7', 'x8', 'x9', 'x10']
cat_cols = ['x5', 'x6']

# data preprocessing function
def data_prepocessing(df):
    """Clean a raw table and encode its categorical columns as integers.

    Steps, in order:
      1. drop the first column (presumably a row id - confirm against the CSV),
      2. drop every row that contains a missing value,
      3. cast the columns listed in ``num_cols`` to float,
      4. keep only rows with ``x7 > -10`` and ``x8 < 10``,
      5. encode ``x5`` (booleans, or the strings 'True'/'False') as 1/0,
      6. encode the grades in ``x6`` ('A'..'F', 'Fx') as 0..6.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame holding the cleaned rows (original index labels kept).

    Raises
    ------
    ValueError
        If a column in ``num_cols`` holds a value that cannot be converted to float.
    """
    # `astype` returns a brand-new frame, so nothing below writes into a slice of the
    # caller's data (the original assigned columns on the result of dropna()).
    df = df.iloc[:, 1:].dropna().astype({col: 'float' for col in num_cols})

    # Boolean-mask selection yields a slice; copy it so the column assignments below are
    # plain writes to an owned frame rather than chained assignments.
    df = df[(df['x7'] > -10) & (df['x8'] < 10)].copy()

    if df['x5'].dtype == 'bool':
        # A real boolean column converts directly: True -> 1, False -> 0.
        df['x5'] = df['x5'].astype(int)
    elif df['x5'].dtype == 'O':
        df['x5'] = df['x5'].replace({'True': 1, 'False': 0})
    df['x6'] = df['x6'].replace({'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'Fx': 6})

    return df

In [None]:
# preprocess data and extract features and labels
# The cleaned table gets its own name instead of overwriting `all_train_data`:
# data_prepocessing drops the first column on every call, so rebinding the input made this
# cell strip one more column (eventually `y`) each time it was re-run.
train_data = data_prepocessing(all_train_data)

num_features = train_data[num_cols]
cat_features = train_data[cat_cols]
labels = train_data.y

train_data.describe()

In [None]:
# check number of samples in each class (labels, absolute counts, share of all samples)
classes, cnts = np.unique(labels, return_counts=True)
print(f'classes:\n  {classes}')
print(f'samples:\n  {cnts}')
print(f'%:\n  {cnts/len(labels)}')

In [None]:
# convert to numerical labels
# The explicit cast pins an integer dtype instead of relying on `replace` to downcast the
# object column implicitly (newer pandas releases deprecate that downcast), and it fails
# loudly if a label outside the three known names is still present.
labels = labels.replace({'Atsuto': 0, 'Bob': 1, 'Jörg': 2}).astype(int)

In [None]:
# function to plot histogram of features
def features_hist(df, n_bins=50, cols=None):
    """Draw one histogram figure per feature column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to plot.
    n_bins : int, optional
        Number of histogram bins (default 50, the value previously hard-coded).
    cols : iterable of str, optional
        Columns to plot; defaults to the notebook-level ``num_cols`` list.

    Returns
    -------
    None
        Figures are shown as a side effect.
    """
    if cols is None:
        cols = num_cols
    for col in cols:
        # Use the explicit Figure/Axes pair; the original bound the (fig, ax) tuple to
        # `ax` and then overwrote it with the return value of plt.hist.
        fig, ax = plt.subplots(figsize=(6, 3))
        ax.hist(df[col], bins=n_bins)
        ax.set_title("Histogram of " + col, fontsize=12)
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

In [None]:
# transform data to Gaussian-like
from sklearn.preprocessing import PowerTransformer

def feature_transformer(df, pt=None):
    """Apply a power transform to every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric features to transform.
    pt : fitted transformer, optional
        An already fitted object exposing ``transform`` (for example a
        ``PowerTransformer`` fitted on the training features). When given, it is
        applied as-is so that new data is transformed with the parameters learned
        earlier. When omitted, a new ``PowerTransformer(standardize=False)`` is fitted
        on ``df`` itself, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Transformed values with the same column names and index as ``df``.
    """
    if pt is None:
        pt = PowerTransformer(standardize=False)
        values = pt.fit_transform(df)
    else:
        values = pt.transform(df)

    # The transformer returns a bare array; restore the column names and row labels.
    return pd.DataFrame(values, columns=df.columns, index=df.index)

In [None]:
# plot histogram of each numeric feature before any transformation
features_hist(num_features)

In [None]:
# transform features: power-transform the numeric training columns (categorical ones are left out)
num_features_transformed = feature_transformer(num_features)

In [None]:
# plot histogram of the same features after the power transform, for comparison with the raw plots
features_hist(num_features_transformed)

In [None]:
# check correlation of features: pairwise Pearson correlation between the transformed numeric columns
cor = num_features_transformed.corr(method='pearson')
# display the correlation matrix as the cell output
cor

In [None]:
# plot correlation
fig, ax = plt.subplots(figsize=(8,6))
# Use the builtin `bool` as dtype: the `np.bool` alias raises AttributeError on
# NumPy 1.24-1.26 (and warns on 1.20-1.23), while `bool` works on every version.
# The mask is all False, so no cell of the matrix is hidden.
sns.heatmap(cor, mask=np.zeros_like(cor, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)
plt.show()

In [None]:
# drop redundant features (currently disabled; `features_scaled` below is not defined in the cells shown)
# drop_cols = ['x3']
# features_scaled = features_scaled.drop(columns=drop_cols)

In [None]:
from sklearn.model_selection import train_test_split

# split train and test dataset
features = pd.concat([num_features_transformed, cat_features], axis=1)
# Seed the shuffle so the 70/30 split, and every score computed from it, is the same on
# each run; 0 matches the random_state used for the classifiers in this notebook.
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=0
)
features.describe()

In [None]:
# check numbers of each class among the training labels
classes, cnts = np.unique(y_train, return_counts=True)
print(f'samples in training data:\n  {cnts}')

In [None]:
# function to calculate accuracy

from sklearn.metrics import accuracy_score

def accuracy_calculator(classifier, x=None, y=None):
    """Print overall accuracy and per-class accuracy of a fitted classifier.

    Parameters
    ----------
    classifier : object
        Fitted estimator exposing ``predict``.
    x : array-like, optional
        Features to predict on; defaults to the notebook-level ``x_test``.
    y : array-like, optional
        True labels for ``x``; defaults to the notebook-level ``y_test``.

    Returns
    -------
    None
        Results are printed: the overall accuracy, then one value per class found in
        ``y`` (in sorted label order), each being the fraction of that class's samples
        that were predicted correctly. Both are rounded to four decimals.
    """
    if x is None:
        x = x_test
    if y is None:
        y = y_test

    pred = np.asarray(classifier.predict(x))
    lb = np.asarray(y)

    accuracy = round(accuracy_score(lb, pred), 4)
    print('test accuracy: {}'.format(accuracy))

    acc_per_class = []
    # Iterate over the labels actually present instead of a hard-coded range(3): every
    # class then has at least one sample, so the division below cannot hit zero.
    for cls in np.unique(lb):
        pred_cls = pred[lb == cls]
        n_wrong = int(np.count_nonzero(pred_cls != cls))
        acc_per_class.append(round(1 - n_wrong / pred_cls.shape[0], 4))

    print('accuracy of classes:\n  {}'.format(acc_per_class))

In [None]:
# classifier 1: random forest
# use GridSearchCV to tune parameters

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf_clf = RandomForestClassifier()

# Hyper-parameter grid. `min_samples_split` starts at 2: scikit-learn rejects the integer
# value 1 (a node cannot be split with fewer than two samples), so every fit using it failed.
parameters = {
    'n_estimators'      : [100],
    'max_depth'         : [None],
    'min_samples_split' : [2, 3, 4, 5],
    'min_samples_leaf'  : [1, 2, 3, 4, 5],
    'random_state'      : [0],
}

# 5-fold cross-validated search over the grid, using all available cores.
clf = GridSearchCV(estimator=rf_clf, param_grid=parameters, cv=5, n_jobs=-1)

clf.fit(x_train, y_train)

# Keep the estimator built with the best parameter combination found by the search.
best_rf = clf.best_estimator_

In [None]:
# classifier 2: extremely randomized tree
# use GridSearchCV to tune parameters

from sklearn.ensemble import ExtraTreesClassifier

ef_clf = ExtraTreesClassifier()

# Hyper-parameter grid. The integer value 1 is not a valid `min_samples_split` in
# scikit-learn (the minimum is 2), so it is left out to avoid failing fits.
parameters = {
    'n_estimators'      : [100],
    'max_depth'         : [None],
    'min_samples_split' : [2, 3, 4, 5],
    'min_samples_leaf'  : [1, 2, 3, 4, 5],
    'random_state'      : [0],
}

# 5-fold cross-validated search over the grid, using all available cores.
clf = GridSearchCV(estimator=ef_clf, param_grid=parameters, cv=5, n_jobs=-1)

clf.fit(x_train, y_train)

# Keep the estimator built with the best parameter combination found by the search.
best_ef = clf.best_estimator_

In [None]:
# classifier 3: gradient tree boosting
# use GridSearchCV to tune parameters

from sklearn.ensemble import GradientBoostingClassifier

gb_clf = GradientBoostingClassifier()

# Hyper-parameter grid. `min_samples_split` must be at least 2 when given as an integer;
# the former candidate 1 is rejected by scikit-learn and has been removed.
parameters = {
    'learning_rate'     : [0.1],
    'n_estimators'      : [100],
    'max_depth'         : [3],
    'min_samples_split' : [2, 3],
    'min_samples_leaf'  : [1, 2, 3],
    'random_state'      : [0],
}

# 5-fold cross-validated search over the grid, using all available cores.
clf = GridSearchCV(estimator=gb_clf, param_grid=parameters, cv=5, n_jobs=-1)

clf.fit(x_train, y_train)

# Keep the estimator built with the best parameter combination found by the search.
best_gb = clf.best_estimator_

In [None]:
# apply voting classifer with
# classifier 1: random forest
# classifier 2: extremely randomized tree
# classifier 3: gradient tree boosting

from sklearn.ensemble import VotingClassifier

# Combine the three tuned estimators into one ensemble with equal weights.
# voting='soft' selects scikit-learn's probability-based voting mode (as opposed to 'hard').
voting = VotingClassifier(
    estimators=[('rf', best_rf), ('ef', best_ef), ('gb', best_gb)],
    voting='soft',
    weights=[1, 1, 1],
    n_jobs=-1
)

# Fit the ensemble on the training split; the fitted estimator is also the cell output.
voting.fit(x_train, y_train)

In [None]:
# accuracy of voting on the held-out test split
# (uncomment the lines below to also report each tuned base classifier on its own)
# accuracy_calculator(best_rf)
# accuracy_calculator(best_ef)
# accuracy_calculator(best_gb)
accuracy_calculator(voting)

In [None]:
# process evaluation data
# NOTE(review): unlike the training read, no `na_values='?'` is passed here and the path has
# no 'Dataset/' prefix - confirm both differences are intentional.
all_evalu_data = pd.read_csv('EvaluateOnMe.csv', sep=',')

In [None]:
# Apply the same cleaning and encoding used for the training table.
# NOTE(review): data_prepocessing drops rows (rows with missing values and rows outside the
# x7/x8 range filter), so fewer predictions than evaluation rows may be produced - confirm.
# NOTE(review): this cell rebinds its own input and data_prepocessing drops the first column on
# every call, so running the cell twice strips a second column; re-run the loading cell first.
all_evalu_data = data_prepocessing(all_evalu_data)
all_evalu_data.describe()

In [None]:
# Split the evaluation table into numeric and categorical parts, mirroring the training cells.
num_data = all_evalu_data[num_cols]
cat_data = all_evalu_data[cat_cols]
# NOTE(review): called like this, feature_transformer fits a new PowerTransformer on the frame
# it receives, so the evaluation features are transformed with parameters estimated from the
# evaluation data rather than those learned from the training data - confirm this is intended.
num_data_transformed = feature_transformer(num_data)
# Visual check of the transformed evaluation features.
features_hist(num_data_transformed)

In [None]:
# Reassemble the evaluation feature matrix in the same column order used for training
# (transformed numeric columns first, then the categorical ones).
data = pd.concat([num_data_transformed, cat_data], axis=1)
data.describe()

In [None]:
# make predictions with the fitted voting ensemble
predictions = voting.predict(data)
# preview the first ten predicted class codes
predictions[:10]

In [None]:
# convert labels
# Inverse of the name -> code encoding applied to the training labels.
label_names = {0: 'Atsuto', 1: 'Bob', 2: 'Jörg'}
# A direct lookup raises KeyError on an unexpected code. The previous chain of independent
# `if`s skipped such values silently, which would leave `results` shorter than `predictions`
# and shift every following line of the output file.
results = [label_names[p] for p in predictions]

results[:10]

In [None]:
# write to txt file, one predicted name per line
# Pin the encoding: 'Jörg' is non-ASCII, and without an explicit encoding open() uses the
# platform's default, so the bytes written could differ from one machine to another.
with open('106716.txt', 'w', encoding='utf-8') as f:
    f.writelines("%s\n" % item for item in results)

In [None]:
# numbers in each class among the predicted names (labels, counts, share of all predictions)
classes, cnts = np.unique(results, return_counts=True)
print(f'all classes:\n  {classes}')
print(f'counts of classes:\n  {cnts}')
print(f'%:\n  {cnts/len(results)}')