In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler # not installed in standard py library
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Column names for the data file: ten feature columns followed by the "class" label.
cols = [
    "fLength", "fWidth", "fSize", "fConc", "fConc1",
    "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
    "class",
]
# Because `names` is passed, pandas treats the first line of the file as data, not as a header.
df = pd.read_csv("../data_files/magic+gamma+telescope/magic04.data", names=cols)
df.head()  # preview the first rows of the raw frame

In [None]:
# Encode the label column as integers: "g" becomes 1 (true), every other value becomes 0 (false).
# The dtype guard makes this cell safe to re-run: without it, a second execution would compare
# the already-numeric labels against "g" and silently overwrite every label with 0.
if not pd.api.types.is_integer_dtype(df["class"]):
    df["class"] = (df["class"] == "g").astype(int)

# Our goal
Our goal is to use the labeled dataset we currently have to train models that predict the class of new, unseen examples. This is supervised learning because every example we train on comes with its known class alongside its attributes.

In [None]:
# Preview the first five rows again to confirm the "class" column is now numeric (1 / 0).
df.head(n=5)

In [None]:
# One figure per feature column (everything in `cols` except the trailing "class" label),
# overlaying both classes so their distributions can be compared by eye.
class_styles = ((1, "blue", "gamma"), (0, "red", "hadron"))  # (class value, colour, legend text)
for label in cols[:-1]:
    for class_value, colour, legend_text in class_styles:
        # Select this feature for the rows belonging to one class.
        # alpha is the opacity of the bars; density=True normalises each histogram to unit
        # area, so classes with different row counts remain comparable.
        plt.hist(df.loc[df["class"] == class_value, label], color=colour, label=legend_text, alpha=0.7, density=True)
    plt.title(label)
    plt.ylabel("probability")
    plt.xlabel(label)
    plt.legend()
    plt.show()

# Creating data sets
Here we create our training, validation, and test data sets.

In [None]:
# Shuffle all rows once, then cut the shuffled frame into 60% train / 20% validation / 20% test.
# - A fixed random_state makes the split (and every number reported below) reproducible
#   after Restart & Run All.
# - Plain .iloc slicing replaces np.split(DataFrame, ...), which relies on the deprecated
#   DataFrame.swapaxes and emits a FutureWarning on recent pandas versions.
shuffled = df.sample(frac=1, random_state=42)

# int() truncates the fractional positions to whole ROW indices.
train_end = int(0.6 * len(shuffled))
valid_end = int(0.8 * len(shuffled))

train = shuffled.iloc[:train_end]           # rows in the first 60%
valid = shuffled.iloc[train_end:valid_end]  # rows from 60% to 80%
test = shuffled.iloc[valid_end:]            # rows from 80% to the end

In [None]:
def scale_dataset(dataframe, oversample = False): # oversample = False is default
    X = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    if(oversample): 
        ros = RandomOverSampler(); 
        X, y = ros.fit_resample(X, y); 
        # somehow this takes the lesser of the classes and resamples it unttil they match

    # note that X is a 2d object here
    data = np.hstack((X, np.reshape(y, (-1, 1)))) 
    # stacking X and y horizontally, but you must reshape y to fit the 2d X
    # using -1 here tells the computer to infer the first dimension of y, which could also be accomplished by len(y)

    return data, X, y

In [None]:
# notably, there is a large difference in the number of data points we have
print(len(train[train["class"]==1]))
print(len(train[train["class"]==0]))
# because there is many more 1 than 0, we will want to over sample 0 to match the amount of data
# to do so we use the imblearn random sampler

train, X_train, y_train = scale_dataset(train, oversample = True)

print(len(y_train))
print(sum(y_train == 1))
print(sum(y_train == 0))
# and we can see they are equal now

# note that for our actual test sets, we do not want to over sample because we want them to replicate real data
valid, X_valid, y_valid = scale_dataset(valid, oversample = False)
test, X_test, y_test = scale_dataset(test, oversample = False)

# K nearest neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [None]:
# Build a k-nearest-neighbours classifier that looks at the 5 closest training points,
# then fit it on the training split prepared above.
knn_model = KNeighborsClassifier(n_neighbors = 5)
# Left as the last expression so the notebook displays the fitted estimator.
knn_model.fit(X_train, y_train)

In [None]:
# Predict the held-out test split and show predictions next to the true labels.
y_pred = knn_model.predict(X_test)
for shown_labels in (y_pred, y_test):
    print(shown_labels)

knn_report = classification_report(y_test, y_pred)
print(knn_report)
# accuracy is not bad!
# precision: of the samples predicted as a class, the fraction that truly belong to it
# recall: of the samples that truly belong to a class, the fraction the model found
# interestingly, the author observed that n = 1 gave the highest accuracy (at 82%)---using more neighbors is not necessarily good

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes on the training split; fit() returns the fitted estimator itself,
# so construction and fitting are chained into one statement.
nb_model = GaussianNB().fit(X_train, y_train)

In [None]:
# Naive Bayes essentially uses Bayes theorem to calculate the probability of some point being in some classification
y_prod = nb_model.predict(X_test)
nb_report = classification_report(y_test, y_prod)
print(nb_report)
# the accuracy for this is actually worse, in this case, than KNN

# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings; fit() returns the fitted estimator itself.
lg_model = LogisticRegression().fit(X_train, y_train)

In [None]:
# Score the logistic-regression model on the held-out test split.
y_prod = lg_model.predict(X_test)
lg_report = classification_report(y_test, y_prod)
print(lg_report)
# KNN still performs the best so far!

# SVM

In [None]:
from sklearn.svm import SVC # support vector classifier

In [None]:
# Support vector classifier with default settings; fit() returns the fitted estimator itself.
svm_model = SVC().fit(X_train, y_train)

In [None]:
# Score the support vector classifier on the held-out test split.
y_prod = svm_model.predict(X_test)
svm_report = classification_report(y_test, y_prod)
print(svm_report)
# best accuracy so far!

# Neural networks
Here we will use tensorflow

In [None]:
import tensorflow as tf

In [None]:
def plot_history(history):
    """Plot training and validation curves for loss and accuracy, side by side.

    Parameters
    ----------
    history : object with a ``history`` dict
        The dict must contain the keys 'loss', 'val_loss', 'accuracy' and 'val_accuracy',
        each holding one value per epoch (as returned by the fit call in train_model).

    Returns
    -------
    None
        The figure is shown with plt.show().
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (10, 4))

    # Left panel: loss per epoch (an epoch is one full training cycle over the data).
    ax1.plot(history.history['loss'], label='loss')
    ax1.plot(history.history['val_loss'], label='val_loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Binary crossentropy')
    ax1.grid(True)
    ax1.legend()  # without this the curve labels are never drawn and the lines are indistinguishable

    # Right panel: accuracy per epoch.
    ax2.plot(history.history['accuracy'], label='accuracy')
    ax2.plot(history.history['val_accuracy'], label='val_accuracy')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('accuracy')
    ax2.grid(True)
    ax2.legend()

    plt.show()

In [None]:
def train_model(X_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs):
    """Build and train a small fully connected binary classifier.

    The network is: Dense(num_nodes, relu) -> Dropout -> Dense(num_nodes, relu) -> Dropout
    -> Dense(1, sigmoid), trained with Adam on binary cross-entropy.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training features.
    y_train : array-like of shape (n_samples,)
        Binary labels (0 or 1).
    num_nodes : int
        Number of units in each of the two hidden layers.
    dropout_prob : float
        Dropout rate applied after each hidden layer.
    lr : float
        Learning rate for the Adam optimizer.
    batch_size : int
        Number of samples per gradient update.
    epochs : int
        Number of full passes over the training data.

    Returns
    -------
    nn_model : tf.keras.Sequential
        The trained model.
    history : History object returned by ``fit``
        Its ``history`` dict holds per-epoch 'loss', 'accuracy', 'val_loss', 'val_accuracy'.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    # Keras carves the validation_split slice from the END of the arrays, before any
    # shuffling. If the rows arrive ordered (for example with oversampled duplicates of one
    # class appended at the end), that slice would contain essentially a single class and
    # the validation curves would be meaningless. Shuffling features and labels together
    # first gives a representative slice.
    # Caveat: duplicated rows can still land on both sides of the split, so these internal
    # validation metrics are optimistic; rely on a separate validation set for selection.
    order = np.random.permutation(len(X_train))
    X_train, y_train = X_train[order], y_train[order]

    nn_model = tf.keras.Sequential([
        # first argument is the number of nodes; the input width is taken from the data
        # instead of being hard-coded to 10
        tf.keras.layers.Dense(num_nodes, activation='relu', input_shape=(X_train.shape[1],)),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(num_nodes, activation='relu'), # do not need input shape after initial layer
        tf.keras.layers.Dropout(dropout_prob),
        # sigmoid squashes the output into the range 0..1; it is NOT yet a hard 0/1 label
        tf.keras.layers.Dense(1, activation='sigmoid')
    ]) # creates a linear stack of layers as a model

    nn_model.compile(optimizer=tf.keras.optimizers.Adam(lr), loss='binary_crossentropy', metrics=['accuracy'])
    # lr is the learning rate, accuracy is an additional metric to consider which we can plot later

    history = nn_model.fit(
        X_train, y_train, epochs = epochs, batch_size = batch_size, validation_split=0.2, verbose = 0
    )
    # epochs is the number of full passes over the training data
    # validation_split is the fraction of the training data held OUT of training and used
    # only to compute val_loss / val_accuracy after each epoch
    # verbose = 0 stops it from printing everything

    return nn_model, history

In [None]:
# Exhaustive grid search: train one network per hyper-parameter combination and keep the
# one with the lowest loss on the validation split.
least_val_loss = float('inf')  # start at infinity so the first finite loss always wins
least_loss_model = None        # no winner yet
epochs = 100

node_options = (16, 32, 64)
dropout_options = (0, 0.2)
lr_options = (0.1, 0.005, 0.001)
batch_options = (32, 64, 128)

for num_nodes in node_options:
    for dropout_prob in dropout_options:
        for lr in lr_options:
            for batch_size in batch_options:
                print(f"{num_nodes} nodes, dropout {dropout_prob}, lr {lr}, batch size {batch_size}")
                model, history = train_model(X_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs)
                plot_history(history)

                # evaluate() returns [loss, accuracy] here; only the loss drives the selection
                val_loss = model.evaluate(X_valid, y_valid)
                candidate_loss = float(val_loss[0])
                if candidate_loss < least_val_loss:
                    least_val_loss, least_loss_model = candidate_loss, model

In [None]:
# The network outputs one sigmoid value per sample; threshold at 0.5 to get a 1/0 class
# and flatten the (n, 1) column into a 1-D vector.
test_scores = least_loss_model.predict(X_test)
y_pred = (test_scores > 0.5).astype(int).reshape(-1)

In [None]:
# Score the selected neural network on the held-out test split.
nn_report = classification_report(y_test, y_pred)
print(nn_report)
# we get about an 87% accuracy
# ... which is ... just okay lol. Not much better than KNN or SVM