In [None]:
import numpy as np
import matplotlib.pyplot as plt

# ----- make nice figures -----
# Global matplotlib styling shared by every figure in this notebook.
import matplotlib as mpl
# Render figures at 150 dots per inch.
mpl.rcParams['figure.dpi']= 150

from cycler import cycler
# Hex colour palette. Entries 0 and 1 are the colours looked up when a 0/1
# class label is used as an index into this list; note that the first and
# last entries repeat those same two colours.
COLORS = ['#242482', '#F00D2C', '#242482', '#0071BE', '#4E8F00', '#553C67', '#DA5319', '#F00D2C']
# Install the palette as the default colour cycle for all axes.
default_cycler = cycler(color=COLORS)
plt.rc('axes', prop_cycle=default_cycler) 
# -----------------------------

In [None]:
# load data from text
data = np.loadtxt('data/lec25.txt')

num_points = data.shape[0]

# Seed NumPy's global random generator so that the shuffle below (and hence
# the train/validation/test split and every metric derived from it) is the
# same on each Restart & Run All. Later cells that draw from np.random
# continue from this seeded state.
np.random.seed(0)

# Shuffle the data to remove bias with respect to order
I_perm = np.random.permutation(len(data))
data = data[I_perm, :]

# Extract input and outputs: the first two columns are the features
x = data[:, 0:2]

# The class data: the third column holds the class label
y = data[:, 2]

In [None]:
# Translate every class label into its hex colour string from COLORS
# (labels are cast to int so they can be used as list indices).
# Only used for colouring scatter plots.
y_color = [COLORS[int(label)] for label in y[:num_points]]

In [None]:
# Raise the rendering resolution for this and all later figures.
mpl.rcParams['figure.dpi'] = 200

# Scatter the two feature columns against each other, coloured by class.
feature_1, feature_2 = x[:, 0], x[:, 1]
plt.scatter(feature_1, feature_2, c=y_color)
plt.xlabel('x1')
plt.ylabel('x2')

In [None]:
# Count how many samples carry each integer class label.
class_counts = np.bincount(y.astype(int))

# Report the raw counts, then the same counts as a percentage of all samples.
print("Counts:", class_counts, sep="\n")
print("\nPercentages:", 100.0*class_counts/len(y), sep="\n")

In [None]:
from matplotlib.colors import ListedColormap

# Predict every point in the plane - you can ignore this part if you want
def plot_plane(model):
    """Colour the feature plane by the class `model` predicts and overlay the data.

    A 200 x 200 grid is laid over the bounding box of the notebook-level
    array `x` (padded by 0.1 on every side), each grid point is classified
    with `model.predict`, and the predictions are drawn as a coloured
    background. The data points in `x` are then drawn on top, coloured by
    the notebook-level list `y_color`.

    Parameters
    ----------
    model : object
        Any already-fitted classifier exposing `predict(points)` that
        returns one 0/1 label per row of `points`.

    Returns
    -------
    None
        Draws on the current matplotlib figure.
    """
    pad = 0.1
    n_grid = 200

    # Form all combinations of points in the plane
    x1_plot = np.linspace(np.min(x[:, 0]) - pad, np.max(x[:, 0]) + pad, n_grid)
    x2_plot = np.linspace(np.min(x[:, 1]) - pad, np.max(x[:, 1]) + pad, n_grid)
    xx1, xx2 = np.meshgrid(x1_plot, x2_plot)
    # Flatten xx1 and xx2 to a list of (x1, x2) points, one per row
    x_plot = np.column_stack((xx1.ravel(), xx2.ravel()))

    # classify each point
    y_plot = model.predict(x_plot)

    # shape into matrix so we can color the plane
    y_plot = y_plot.reshape(xx1.shape)

    # Plot classification at each point as a colored region.
    # Use exactly the two class colours and pin the colour range to [0, 1]:
    # with automatic scaling, a model that predicts a single class everywhere
    # would always be drawn in the first colour, even if that class is 1.
    plt.pcolormesh(xx1, xx2, y_plot, cmap=ListedColormap(COLORS[:2]), vmin=0, vmax=1)

    # Plot the original data
    plt.scatter(x[:, 0], x[:, 1], marker='^', edgecolors='k', linewidth=0.75, c=y_color)
    plt.xlabel('X1')
    plt.ylabel('X2')
    
def report_performance(model, x_train, y_train, x_eval=None, y_eval=None):
    """Fit `model`, plot its decision regions and print evaluation metrics.

    Parameters
    ----------
    model : object
        Classifier exposing `fit(X, y)` and `predict(X)`.
    x_train, y_train : array-like
        Training inputs and labels passed to `model.fit`.
    x_eval, y_eval : array-like, optional
        Inputs and true labels to score the fitted model on. When omitted,
        the notebook-level validation split `x_valid` / `y_valid` is used,
        which is the behaviour of the original two-argument call.

    Returns
    -------
    None
        The decision-region plot is drawn via `plot_plane` and accuracy,
        precision, recall and F1 are printed. The metric functions are
        looked up in the notebook namespace when this function is called.
    """
    # Fall back to the notebook's validation split when no set is given.
    if x_eval is None:
        x_eval = x_valid
    if y_eval is None:
        y_eval = y_valid

    model.fit(x_train, y_train)

    y_eval_model = model.predict(x_eval)
    plot_plane(model)
    print(" Accuracy = " + str(accuracy_score(y_eval, y_eval_model)))
    print("Precision = " + str(precision_score(y_eval, y_eval_model)))
    print("   Recall = " + str(recall_score(y_eval, y_eval_model)))
    print("       F1 = " + str(f1_score(y_eval, y_eval_model)))

# Results ignoring class imbalance

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Sizes of the splits: 80% of 80% of the points train the models and
# 20% of 80% validate them; everything after that is the test set.
num_train = int(num_points*0.8*0.8)
num_valid = int(num_points*0.8*0.2)

# Cut the (already shuffled) inputs and labels at the same two positions.
x_train, x_valid, x_test = np.split(x, [num_train, num_train + num_valid])
y_train, y_valid, y_test = np.split(y, [num_train, num_train + num_valid])

# Normalize data
# IMPORTANT: the mean and standard deviation come from the training inputs
# only, and those same two vectors are applied to every other split.
mu_x = x_train.mean(axis=0)
sig_x = x_train.std(axis=0)

# The full input array x is standardised as well, with the same parameters.
x_train, x_valid, x_test, x = [
    (inputs - mu_x)/sig_x for inputs in (x_train, x_valid, x_test, x)
]

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline k-nearest-neighbours classifier (5 neighbours, Euclidean distance)
# fitted on the training split as-is.
model = KNeighborsClassifier(
    metric='euclidean',
    n_neighbors=5,
)
report_performance(model, x_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline logistic regression with the library's default settings,
# fitted on the training split as-is.
model = LogisticRegression()
report_performance(model, x_train, y_train)

In [None]:
from sklearn.svm import SVC
# Baseline support vector classifier with an RBF kernel,
# fitted on the training split as-is.
model = SVC(kernel='rbf')
report_performance(model, x_train, y_train)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Baseline decision tree using the entropy split criterion,
# fitted on the training split as-is.
model = DecisionTreeClassifier(criterion='entropy')
report_performance(model, x_train, y_train)

# Reweigh data.

In [None]:
# RBF-kernel SVC with hand-picked class weights: class 0 is given 30% of
# num_train and class 1 the remaining 70%.
model = SVC(
    class_weight={0: 0.3*num_train, 1: 0.7*num_train},
    kernel='rbf',
)
report_performance(model, x_train, y_train)

In [None]:
# Sweep the share of the total weight given to class 0 from 10% to 100% in
# ten evenly spaced steps; class 1 always receives the remainder.
# NOTE(review): the final value, w = 1.0, gives class 1 a weight of zero -
# confirm that this degenerate endpoint is intended.
class0_weights = np.linspace(0.1, 1, 10)

for w in class0_weights:
    model = SVC(kernel='rbf', class_weight={0: w*num_train, 1: (1-w)*num_train})
    # Format to one decimal so floating-point noise from linspace
    # (e.g. 0.30000000000000004) does not leak into the printout or title.
    print("WEIGHT = " + format(w, '.1f'))
    report_performance(model, x_train, y_train)
    plt.title("Class 0 weight = " + format(100.0*w, '.1f') + "%")
    plt.show()

You can optimize this, or use the "balanced" weighting as a good rule of thumb. This weights each point inversely proportional to the size of the class the point belongs to.

In [None]:
# RBF-kernel SVC with scikit-learn's built-in 'balanced' class weighting.
model = SVC(kernel='rbf', class_weight='balanced')
report_performance(model, x_train, y_train)

# class_weight_ holds the fitted per-class multipliers, which do not sum
# to 1. Divide by their total so the title shows class 0's share of the
# weight, on the same 0-100% scale as the manual weight sweep.
plt.title(
    "Class 0 weight = "
    + format(100.0*model.class_weight_[0]/np.sum(model.class_weight_), '.1f')
    + "%"
)
plt.show()

The same goes for the logistic regression model and classification trees.

In [None]:
# Logistic regression with 'balanced' class weighting,
# fitted on the original (imbalanced) training split.
model = LogisticRegression(class_weight = 'balanced')
report_performance(model, x_train, y_train)

In [None]:
# Entropy-criterion decision tree with 'balanced' class weighting,
# fitted on the original (imbalanced) training split.
model = DecisionTreeClassifier(
    class_weight='balanced',
    criterion='entropy',
)
report_performance(model, x_train, y_train)

There's no change in the classification tree performance.

# Over and undersampling

In [None]:
# Separate training data into classes

# Boolean masks selecting the rows of each class
I_0 = y_train == 0
I_1 = y_train == 1

x_train_0 = x_train[I_0, :]
x_train_1 = x_train[I_1, :]

y_train_0 = y_train[I_0] #(all 0s)
y_train_1 = y_train[I_1] #(all 1s)

# Number of training samples in each class
n0 = len(y_train_0)
n1 = len(y_train_1)

print("Size of class 0 in training data: " + str(n0))
# Fixed label: this line reports class 1 (it previously also said "class 0").
print("Size of class 1 in training data: " + str(n1))

## Undersampling

In [None]:
# Undersample class 0: draw n1 of its n0 row indices at random, without
# replacement, so it ends up the same size as class 1
# (the original author's note records 88 -> 31 for their run).
I_train_0 = np.random.choice(n0, size=n1, replace=False)
x_train_0_undersample = x_train_0[I_train_0]
y_train_0_undersample = y_train_0[I_train_0]  # all 0s

# Reassemble the training set: undersampled class 0 first, then all of class 1.
x_train_undersample = np.vstack((x_train_0_undersample, x_train_1))
y_train_undersample = np.hstack((y_train_0_undersample, y_train_1))
print(x_train_undersample.shape)
print(y_train_undersample.shape)

In [None]:
# Train and validate some models on the undersampled training set,
# starting with k-nearest neighbours (5 neighbours, Euclidean distance).
model = KNeighborsClassifier(
    metric='euclidean',
    n_neighbors=5,
)
report_performance(model, x_train_undersample, y_train_undersample)

In [None]:
# Default logistic regression fitted on the undersampled training set.
model = LogisticRegression()
report_performance(model, x_train_undersample, y_train_undersample)

In [None]:
# RBF-kernel SVC fitted on the undersampled training set.
model = SVC(kernel='rbf')
report_performance(model, x_train_undersample, y_train_undersample)

In [None]:
# Entropy-criterion decision tree fitted on the undersampled training set.
model = DecisionTreeClassifier(criterion='entropy')
report_performance(model, x_train_undersample, y_train_undersample)

## Oversampling

In [None]:
# Oversample class 1: draw n0 row indices out of its n1 rows at random,
# with replacement, so the same row can be picked more than once
# (the original author's note records 31 -> 88 for their run).

I_train_1 = np.random.choice(n1, size=n0, replace=True)
print(I_train_1)

In [None]:
# Pick the (possibly repeated) class 1 rows selected by the sampled indices.
x_train_1_oversample = x_train_1[I_train_1]
y_train_1_oversample = y_train_1[I_train_1]  # all 1s

# Reassemble the training set: the oversampled class 1 data first,
# then all of the class 0 data.
x_train_oversample = np.vstack((x_train_1_oversample, x_train_0))
y_train_oversample = np.hstack((y_train_1_oversample, y_train_0))
print(x_train_oversample.shape)
print(y_train_oversample.shape)

In [None]:
# Train some models on the oversampled training set.
# Each entry pairs a figure title with an unfitted classifier; the loop fits
# them in order, reports validation metrics and shows the decision regions.
# After the loop `model` is the last classifier in the list.
oversample_models = [
    ('KNN', KNeighborsClassifier(n_neighbors=5, metric='euclidean')),
    ('Logistic Regression', LogisticRegression()),
    ('SVM', SVC(kernel='rbf')),
    # Title spelled with a space, matching the same figure in the SMOTE section.
    ('Classification tree', DecisionTreeClassifier(criterion='entropy')),
]

for title, model in oversample_models:
    report_performance(model, x_train_oversample, y_train_oversample)
    plt.title(title)
    plt.show()

# Synthetic Data

We'll use the SMOTE oversampling technique built into `imbalanced-learn`.

In [None]:
from imblearn.over_sampling import SMOTE

# Resample the training split with SMOTE using its default settings.
sm = SMOTE()
x_train_smote, y_train_smote = sm.fit_resample(x_train, y_train)

# Compare the shape of the inputs before and after resampling.
print(x_train.shape)
print(x_train_smote.shape)

# how is the imbalance? Count the rows carrying each label after resampling.
print(f"Resampled 0 class size = {np.sum(y_train_smote == 0)}")
print(f"Resampled 1 class size = {np.sum(y_train_smote == 1)}")

In [None]:
# Train some models on the SMOTE-resampled training set.
# Each entry pairs a figure title with an unfitted classifier; they are
# fitted, scored and plotted one after another in the order listed.
smote_models = [
    ('KNN', KNeighborsClassifier(n_neighbors=5, metric='euclidean')),
    ('Logistic Regression', LogisticRegression()),
    ('SVM', SVC(kernel='rbf')),
    ('Classification tree', DecisionTreeClassifier(criterion='entropy')),
]

for title, model in smote_models:
    report_performance(model, x_train_smote, y_train_smote)
    plt.title(title)
    plt.show()

Other oversampling / undersampling techniques are implemented in `imbalanced-learn`:
    
https://imbalanced-learn.readthedocs.io/en/stable/user_guide.html
        