In [2]:
import os

n_cpu = os.cpu_count() - 1

In [3]:
import pandas as pd
import numpy as np

In [4]:
data = pd.read_csv("poker-8-9_vs_5.csv")
data = data.astype(float)
data.shape

(2075, 11)

In [5]:
data.describe()

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,Class
count,2075.0,2075.0,2075.0,2075.0,2075.0,2075.0,2075.0,2075.0,2075.0,2075.0,2075.0
mean,2.528675,7.078554,2.528675,7.097831,2.528675,7.027952,2.528675,6.980723,2.528675,6.901687,0.012048
std,1.102956,3.792102,1.102956,3.767159,1.102956,3.800115,1.102956,3.853465,1.102956,3.752848,0.109127
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,2.0,4.0,2.0,4.0,2.0,4.0,2.0,4.0,2.0,4.0,0.0
50%,3.0,7.0,3.0,7.0,3.0,7.0,3.0,7.0,3.0,7.0,0.0
75%,3.0,10.0,3.0,10.0,3.0,10.0,3.0,10.0,3.0,10.0,0.0
max,4.0,13.0,4.0,13.0,4.0,13.0,4.0,13.0,4.0,13.0,1.0


In [6]:
# to figure out which one is minority class
data['Class'].value_counts()

0.0    2050
1.0      25
Name: Class, dtype: int64

In [7]:
# normalising dataset
def scale(data):
    scaled_data = (data - data.min()) / (data.max() - data.min())
    return scaled_data

In [8]:
norm_data = scale(data)
X = norm_data.drop(columns='Class')
y = data['Class']

In [9]:
majority_examples = data.loc[data['Class'] == 0]
minority_examples = data.loc[data['Class'] == 1]

# not scaled
X_majority, y_majority = majority_examples.drop(columns='Class'), majority_examples['Class']
X_minority, y_minority = minority_examples.drop(columns='Class'), minority_examples['Class']

In [10]:
majority_index = data[ data['Class'] == 0].index
minority_index = data[ data['Class'] == 1].index

In [11]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

In [12]:
# have to add f1_score for the previous one
# before the peturbing and generating synthetic examples

In [13]:
POPULATION_SIZE = 20
MAX_ITERATION = 100

In [14]:
def peturb3(row):
    row_features = (row[:-1]).copy()
    chosen = []
    for n in range(3):                                        # peturbing 3 attributes
        attribute = np.random.randint(0, X.shape[1])
        while (chosen.count(attribute) != 0):
            attribute = np.random.randint(0, X.shape[1])
        chosen.append(attribute)
        
        peturb_factor = np.random.choice([0.9, 1.1])
        row_features[attribute] *= peturb_factor

        if row_features[attribute] > 1:
            row_features[attribute] = 1
        elif row_features[attribute] < 0:
            row_features[attribute] = 0
        
    return row_features

In [15]:
def peturb(row):
    row_features = (row[:-1]).copy()

    attribute = np.random.randint(0, X.shape[1])
        
    peturb_factor = np.random.choice([0.9, 1.1])
    row_features[attribute] *= peturb_factor

    if row_features[attribute] > 1:
        row_features[attribute] = 1
    elif row_features[attribute] < 0:
        row_features[attribute] = 0
    
    return row_features

In [16]:
classifier = RandomForestClassifier(n_jobs=n_cpu, random_state=42)

gen_minority_examples = []
for i in majority_index:
    X_current = X.drop(i, axis=0)
    y_current = y.drop(i, axis=0)

    current_model = classifier.fit(X_current.values, y_current)

    example = (data.iloc[i]).copy()
    population = []
    for n in range(POPULATION_SIZE):
        peturb_example = peturb3(example)
        population.append(peturb_example)

    for n in range(MAX_ITERATION):
        probabilities = []
        for member in population:
            prob = current_model.predict_proba([member])
            probabilities.append(prob[0][1])
        
        best_prob = max(probabilities)
        best_member = population[probabilities.index(best_prob)]

        for j in range(POPULATION_SIZE):
            population[j] = population[j] + np.random.uniform(0,1)*(population[j] - best_member)
            for k in range(X.shape[1]):
                if population[j][k] > 1:
                    population[j][k] = 1
                elif population[j][k] < 0:
                    population[j][k] = 0

    for member in population:
        prob = current_model.predict_proba([member])
        probabilities.append(prob[0][1])    
    best_prob = max(probabilities)
    best_member = population[probabilities.index(best_prob)]

    print(best_prob)
    if best_prob >= 0.6:
        gen_minority_examples.append(best_member)


# ran it for 450 minutes
# generated 583 examples
# ^ the above result is without the 0.6 condition, and peturbing one attribute

0.57
0.57
0.6
0.53
0.62
0.57
0.56
0.55
0.62
0.62


KeyboardInterrupt: 

In [None]:
examples = pd.DataFrame(gen_minority_examples)
examples['Class'] = 1.0
new_dataset = norm_data.append(examples)

  new_dataset = norm_data.append(examples)


In [None]:
X_new, y_new = new_dataset.drop(columns='Class'), new_dataset['Class']

from sklearn.model_selection import train_test_split
X_new_train, X_new_test, y_new_train, y_new_test = train_test_split(X_new, y_new, test_size=0.2, random_state=42)


classifier.fit(X_new_train, y_new_train)
classifier.score(X_new_test, y_new_test)


# using 583 generated examples
# from a score of 0.980722891566265 (using no synthetic examples)
# got a score of 0.9943609022556391
# not f1 scores

NameError: name 'new_dataset' is not defined

In [None]:
from sklearn.metrics import f1_score

y_pred = classifier.predict(X_new)
f1_score(y_new, y_pred, average='micro')