In [None]:
import hashlib

import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import train_test_split

In [None]:
# Load the Drive helper and mount
#from google.colab import drive
#drive.mount('/content/drive')

#cheaters = np.load("drive/MyDrive/Comp 451 Final Project/data/cheaters.npy")
#clean = np.load("drive/MyDrive/Comp 451 Final Project/data/legit.npy")

In [None]:
# Read both classes from disk.
cheaters = np.load("data/cheaters.npy").repeat(5, axis=0)  # upsample: each cheater sample appears 5 times in a row
clean = np.load("data/legit.npy")

# NOTE(review): the duplicates above exist before any train/val/test split is
# made, so identical cheater samples can end up on both sides of a plain random
# split -- confirm the downstream split keeps copies together.

In [None]:
#Create labels for both: 1.0 = cheater, 0.0 = clean
# Sizes come from the arrays themselves (not a hard-coded count) so the labels
# stay aligned with the data if the dataset or the upsampling factor changes.
# float32 because the labels are later fed to BCEWithLogitsLoss as float targets.
cheaters_labels = np.ones(len(cheaters), dtype=np.float32)
clean_labels = np.zeros(len(clean), dtype=np.float32)

In [None]:
# Stack the two classes along the sample axis, cheaters first and clean second.
# Data and labels use the same order so row i of x matches entry i of y.
y = np.concatenate([cheaters_labels, clean_labels], axis=0)
x = np.concatenate([cheaters, clean], axis=0)

In [None]:
#Create training, validation, and testing sets (roughly 60% / 20% / 20%)
#
# The cheater class is upsampled by repeating samples *before* this split. A
# plain random split would scatter identical copies across train/val/test and
# inflate every held-out metric, so byte-identical samples are grouped and each
# group is kept inside a single split.

def _duplicate_group_ids(samples):
    """Return one integer id per sample; byte-identical samples share an id.

    A SHA-1 digest of the raw bytes is used instead of the built-in ``hash()``
    because the latter is salted per process for bytes, which would make the
    group ids (and therefore the split) differ between runs.
    """
    digests = [
        hashlib.sha1(np.ascontiguousarray(sample).tobytes()).hexdigest()
        for sample in samples
    ]
    _, group_ids = np.unique(digests, return_inverse=True)
    return group_ids


def _holdout_split(labels, groups, n_folds, seed):
    """Return ``(keep_idx, holdout_idx)`` with roughly 1/n_folds held out.

    Class proportions are preserved as closely as the groups allow, and no
    group is divided between the two sides.
    """
    splitter = StratifiedGroupKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    # Only labels and groups drive the split, so a cheap placeholder stands in
    # for X instead of handing over the large sample array.
    return next(splitter.split(np.zeros(len(labels)), labels, groups))


groups = _duplicate_group_ids(x)

#~20% for test set
trainval_idx, test_idx = _holdout_split(y, groups, n_folds=5, seed=17)
x_test, y_test = x[test_idx], y[test_idx]

#~20% for validation set, ~60% for training (one quarter of the remaining 80%)
train_rel, val_rel = _holdout_split(y[trainval_idx], groups[trainval_idx], n_folds=4, seed=17)
train_idx, val_idx = trainval_idx[train_rel], trainval_idx[val_rel]
x_train, y_train = x[train_idx], y[train_idx]
x_val, y_val = x[val_idx], y[val_idx]

In [None]:
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Map-style dataset that pairs each sample in ``x`` with its label in ``y``.

    Both inputs are converted to tensors once, at construction time, so
    indexing later is a plain tensor lookup.
    """

    def __init__(self, x, y):
        # torch.tensor() copies its argument, so the dataset does not alias
        # the arrays passed in.
        self.x, self.y = torch.tensor(x), torch.tensor(y)

    def __len__(self):
        # The label tensor defines how many items the dataset exposes.
        n_items = len(self.y)
        return n_items

    def __getitem__(self, i):
        # Returns a (sample, label) tuple for position i.
        sample = self.x[i]
        target = self.y[i]
        return sample, target


In [None]:
# Hyperparameters -- the knobs a reader is most likely to tune.

batch_size = 32        # samples per mini-batch in every DataLoader
learning_rate = 1e-4   # initial step size handed to the optimizer
num_epochs = 10        # full passes over the training set

In [None]:
# Wrap each split in a CustomDataset (train, validation, test -- in that order).
train_dataset, validation_dataset, test_dataset = (
    CustomDataset(samples, targets)
    for samples, targets in ((x_train, y_train), (x_val, y_val), (x_test, y_test))
)

# Only the training loader reshuffles; the evaluation loaders keep dataset
# order so their outputs line up with y_val / y_test.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader, test_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=False)
    for split in (validation_dataset, test_dataset)
)


In [None]:
#CNN - Feature Extractor

class Net(nn.Module):
    """3-D CNN returning both a 512-wide feature vector and a single score.

    Layout: two 'same'-padded 3x3x3 convolutions (1 -> 16 -> 64 channels), one
    2x max-pool, then three fully connected layers (1024 -> 512 -> 1). The
    output of the 512-unit layer is exposed as ``features`` so it can be reused
    outside the network; the final score is the raw (un-activated) output of
    the last linear layer.
    """

    def __init__(self):
        super().__init__()

        # Per the original note, the input after adding a channel is (1, 30, 192, 5).
        self.conv1 = nn.Conv3d(in_channels=1, out_channels=16, kernel_size=3, padding='same')
        self.conv2 = nn.Conv3d(in_channels=16, out_channels=64, kernel_size=3, padding='same')

        # Halves each spatial dimension (odd sizes round down) -> (64, 15, 96, 2).
        self.pool = nn.MaxPool3d(kernel_size=2)

        # Number of values per sample once the pooled volume is flattened.
        flattened_size = 64 * 15 * 96 * 2
        self.fc1 = nn.Linear(flattened_size, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 1)

    def forward(self, x):
        """Return ``(features, score)`` for a batch ``x`` that has no channel dim."""
        # Insert the single input channel right after the batch dimension.
        volume = x.unsqueeze(dim=1)

        volume = torch.relu(self.conv1(volume))
        volume = torch.relu(self.conv2(volume))
        volume = self.pool(volume)

        # Keep the batch dimension, collapse everything else.
        flat = volume.flatten(start_dim=1)

        hidden = torch.relu(self.fc1(flat))
        features = torch.relu(self.fc2(hidden))
        score = self.fc3(features)

        return features, score

In [None]:
# Seed PyTorch's global RNG before any weights are created so that weight
# initialisation and DataLoader shuffling are repeatable from a fresh kernel
# (GPU kernels may still add small nondeterminism). 17 matches the
# random_state used for the data split and the random forest.
torch.manual_seed(17)

model = Net()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# The network emits one raw score per sample; BCEWithLogitsLoss applies the
# sigmoid internally, so no activation is needed on the model's last layer.
loss_fn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Halve the learning rate once the monitored value (the validation loss passed
# to scheduler.step) has failed to improve for more than 2 consecutive epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

In [None]:
#CNN MODEL TRAINING
#
# Each epoch: one pass over train_loader with weight updates, then one pass
# over validation_loader without gradients. The weights with the lowest mean
# validation loss are remembered and restored after the last epoch.

def _batch_loss(inputs, labels):
    """Run one batch through the model and return its loss tensor."""
    inputs = inputs.to(device)
    labels = labels.to(device)

    _, outputs = model(inputs)
    # squeeze(1) removes only the trailing size-1 dim: (batch, 1) -> (batch,).
    # A bare squeeze() would also collapse a final batch holding one sample to
    # a 0-d tensor, which no longer matches the (1,) label shape and makes the
    # loss raise.
    return loss_fn(outputs.squeeze(1), labels)


best_validation_loss = np.inf
best_model = copy.deepcopy(model.state_dict())

for epoch in range(num_epochs):
    #TRAINING
    model.train()
    training_loss = 0.0

    for inputs, labels in train_loader:
        optimizer.zero_grad()

        # forward + backward + optimize
        loss = _batch_loss(inputs, labels)
        loss.backward()
        optimizer.step()

        training_loss += loss.item()

    #VALIDATION
    model.eval()
    validation_loss = 0.0

    with torch.no_grad():
        for inputs, labels in validation_loader:
            validation_loss += _batch_loss(inputs, labels).item()

    # Mean of the per-batch losses.
    validation_loss = validation_loss / len(validation_loader)

    # The scheduler lowers the learning rate when this value plateaus.
    scheduler.step(validation_loss)

    # Snapshot the weights whenever validation improves; deepcopy is required
    # because state_dict() returns references to the live parameters.
    if validation_loss < best_validation_loss:
        best_validation_loss = validation_loss
        best_model = copy.deepcopy(model.state_dict())

    # Per-epoch report: mean training loss, then mean validation loss.
    print(training_loss / len(train_loader))
    print(validation_loss)

# Load the best model
model.load_state_dict(best_model)
print(f'Best Validation Loss: {best_validation_loss:.4f}')

0.9503542151451111
0.6940063591003418
0.50388480981191
0.4509412086009979
0.2615954910318057
0.3577180780172348
0.09160578254858653
0.22154358583688735
0.03954578260580699
0.26642551746964455
0.01855705252289772
0.28914263350516556
0.021648334998249388
0.3173749392777681
0.010690175398020073
0.2197580417599529
0.003954579912940972
0.29578026963304727
0.0034338398527373405
0.27652486599050463
Best Validation Loss: 0.2198


In [None]:
#Getting features from CNN.
# The trained CNN is used as a frozen feature extractor: no weights are updated
# here. Train and validation samples are merged into one training set for the
# random forest; the test split is kept separate for the final evaluation.

def _extract_features(loader):
    """Return the CNN feature vectors for every sample served by ``loader``.

    Batches are processed in loader order and stacked row-wise, so row i of
    the result belongs to the i-th sample the loader yields. The result is a
    2-D NumPy array with one row per sample.
    """
    model.eval()
    batches = []
    with torch.no_grad():
        for inputs, _ in loader:
            features, _ = model(inputs.to(device))
            batches.append(features.cpu().numpy())
    # Stack the per-batch (batch, n_features) arrays into one array.
    return np.vstack(batches)


rf_x_train = np.concatenate((x_train, x_val))
rf_y_train = np.concatenate((y_train, y_val))

rf_train_dataset = CustomDataset(rf_x_train, rf_y_train)
# shuffle=False keeps the feature rows aligned with rf_y_train.
rf_dataloader = DataLoader(rf_train_dataset, batch_size=batch_size, shuffle=False)

features_list_train = _extract_features(rf_dataloader)
features_list_test = _extract_features(test_loader)

In [None]:
#The random forest is fit based on the features extracted by the CNN

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

# Fit the forest on the CNN features of the train+validation samples.
# class_weight='balanced' reweights classes inversely to their frequency.
rf = RandomForestClassifier(n_estimators=100, random_state=17, class_weight='balanced')
rf.fit(features_list_train, rf_y_train)

# Evaluate the model on the held-out test features
y_pred = rf.predict(features_list_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Report every metric that was computed; recall and F1 were previously
# calculated but never shown.
print("Random Forest Performance:")
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Random Forest Performance:
Accuracy: 97.12%
Precision: 0.9456
