In [1]:
!pip install liac-arff



In [2]:
# import libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import requests
import arff

# import data
training_arff = requests.get('https://utexas.box.com/shared/static/7dqt596djfiz75bfzw5jz4bgk78cmkar.arff')
finn_arff = arff.load(training_arff.text)

# data cleaning
col_val = [attribute[0] for attribute in finn_arff['attributes']]
finn_df = pd.DataFrame(finn_arff['data'], columns = col_val)
X_unbalanced = finn_df.drop(columns = ['TARGET'])
y_unbalanced = finn_df.TARGET

In [3]:
# Import the libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample


rstate=31
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_unbalanced, y_unbalanced, test_size=0.3333, random_state=rstate)

# Downsample majority class, where target == 0
X_resampled, y_resampled = resample(X_train[y_train == '0'], y_train[y_train == '0'], n_samples=X_train[y_train == '1'].shape[0], random_state=rstate)

# Combine minority class with downsampled majority class
X_balanced = np.concatenate((X_train[y_train == '1'], X_resampled))
y_balanced = np.concatenate((y_train[y_train == '1'], y_resampled))


# Using simply CT 
model = DecisionTreeClassifier(criterion = 'entropy', min_samples_leaf=1, random_state=rstate)
model.fit(X_unbalanced, y_unbalanced)
print('Using simply CT on unbalanced:', cross_val_score(model, X_unbalanced, y_unbalanced, cv=3,scoring='roc_auc').mean())
model.fit(X_balanced, y_balanced)
# Need [:,1] because we only want the probability of being positive
print ("Using simply CT on balanced:", roc_auc_score(y_test, model.predict_proba(X_test)[:,1]))


# Using AdaBoost
abc = AdaBoostClassifier(n_estimators=12,learning_rate=1, random_state=rstate)
abcModel = abc.fit(X_balanced, y_balanced)
# Need [:,1] because we only want the probability of being positive
print ('Using AdaBoost Classifier on balanced:', roc_auc_score(y_test, abc.predict_proba(X_test)[:,1]))


# The problem with this method is that we are not using cross-validation but only splitting the data into training and testing sets.
# We can do cross-validation by dividing the data into three folds, and each time we will down size the majority class and train the model on the balanced data. 
# The first fold will be used for testing and the other two for training. We repeat until all folds are used for testing.

Using simply CT on unbalanced: 0.5425094794912851
Using simply CT on balanced: 0.6701287358346545
Using AdaBoost Classifier on balanced: 0.8110284187657801


