# Import the necessary libraries to prepare the data
You need numpy and pandas to manipulate the data. If you haven't already, use pip to install these libraries.

In [11]:
import numpy as np
import pandas as pd

df = pd.read_csv('arrhythmia.data', header = None)
y = df.iloc[:, -1] #The last column is the ground-truth label vector
X = df.iloc[:,:-1] #The first to second-last columns are the features

# Impute the missing data in the dataset
As the arrhythmia dataset has missing values, you need to use Imputer from the sklearn library to assign a suitable value to it. Usually, we assign the mean value of the feature to the missing value which is reasonable.

In [12]:
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
X = imp.fit_transform(X)

# Normalizing the dataset
As the values of the features can range from negative values like -16 to highly positive values like 371, it is best to normalize the data to avoid certain features overly influencing the results.

In [13]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

# Splitting the dataset to training and validation datasets
Assign 30% of the data randomly to the validation set for cross validation. We use ```random_state = 1``` for consistency in case we need to debug.

In [14]:
from sklearn.cross_validation import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, random_state = 1)

# Predicting with XGBClassifier
We test the results with an untuned XGBClassifier.

In [15]:
#Load the classifier
import xgboost as xgb
model = xgb.XGBClassifier(objective="multi:softprob", nthread=-1)

#Fit the classifier to the training data
model.fit(X_train, y_train)

#Predicting the results
y_train_xgb = model.predict(X_train)
y_pred_xgb = model.predict(X_val)
print 'XGB Train Score:', np.mean(y_train == y_train_xgb)
print 'XGB Val Score:', np.mean(y_val == y_pred_xgb)

XGB Train Score: 1.0
XGB Val Score: 0.764705882353


# Predicting with Random Forest Classifier
Using the XGBClassifier, we can see the model has completely overfit the training data. This shows there is a lot of potential for improvement in the XGBClassifier using hyperparameter tuning. We shall now see how a Random Forest Classifier performs in comparison to XGBClassifier

In [16]:
#Load the classifier
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_jobs = -1, random_state = 0)

#Fit the classifier to the training data.
forest.fit(X_train, y_train)

#Predicting the results
y_train_forest = forest.predict(X_train)
y_pred_forest = forest.predict(X_val)
print 'Random Forest Train Score:', np.mean(y_train == y_train_forest)
print 'Random Forest Val Score:', np.mean(y_val == y_pred_forest)


Random Forest Train Score: 0.981012658228
Random Forest Val Score: 0.713235294118


# Predicting with Logistic Regression
We see there is a high variance in the Random Forest model as well, since the training score is significantly higher than the validation score. Nevertheless, a pure XGBClassifier still performs better than a pure Random Forest classifier! What about Logistc Regression, then?

In [17]:
#Load the classifier
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

#Fit the classifier to the training data
lr.fit(X_train, y_train)

#Predicting the results
y_train_lr = lr.predict(X_train)
y_pred_lr = lr.predict(X_val)
print 'Logistic Regression Train Score:', np.mean(y_train == y_train_lr)
print 'Logistic Regression Val Score:', np.mean(y_val == y_pred_lr)

Logistic Regression Train Score: 0.990506329114
Logistic Regression Val Score: 0.705882352941


# Random Search and Hyperparameter Tuning (XGBClassifier)
We see that the Logistic Regression model performs rather well as well, although not as well as the other 2 classifiers. All 3 models have overfit the training data. We can prevent the overfitting by performing cross validation alongside with a random search for the hyperparameters to fine-tune the models. This can be done through the RandomSearchCV function from the sklearn library. 

In specific, we would like to focus on 3 main aspects of an XGBClassifier for tuning:
1. Depth of the trees
2. Subsampling ratio to control the variance.
3. Learning rate of the classifier

We will perform a randomized search with cross validation to tune the XGBClassfier model. Note that this will take quite some time, depending on your computer. My machine took 19.4 minutes to finish computing.

In [18]:
#Set the random seed for consistency of results
np.random.seed(0)

#Load the classifier
import xgboost as xgb
model = xgb.XGBClassifier(nthread=-1)

#Set up a new classifier that will randomly search through the hyperparameters
from sklearn.grid_search import RandomizedSearchCV
from numpy.random import randint, random_sample
clf = RandomizedSearchCV(
    model,
    {
        'max_depth':randint(1,10,3),
        'learning_rate': random_sample(3),
        'subsample': random_sample(2),
        'n_estimators': 10
    },
    cv=2,
    verbose=2,
    n_jobs=-1
)
print "Done"
#Fit the classifier to the training data, which will randomly search for hyperparameters
clf.fit(X_train, y_train)

print "DONE"


Done
Fitting 2 folds for each of 10 candidates, totalling 20 fits


TypeError: object of type 'int' has no len()

In [None]:
#Predicting the results
y_train_xgb = clf.predict(X_train)
y_pred_xgb = clf.predict(X_val)
print 'XGB Train Score:', np.mean(y_train == y_train_xgb)
print 'XGB Val Score:', np.mean(y_val == y_pred_xgb)