In [170]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler # not installed in standard py library
from imblearn.over_sampling import RandomOverSampler

In [171]:
cols = ["fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", "class"]
df = pd.read_csv("../../data_files/magic+gamma+telescope/magic04.data", names = cols)
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [172]:
df["class"] = (df["class"] == "g").astype(int) 
# if class is g, this is converted to 1, or true
# if class is not g, this is converted to 0, or false

# Our goal
Our goal is to take the dataset we have currently and attempt to predict future classes. This is considered supervised learning because we already know a bunch of classes and their corresponding attributes. 

In [173]:
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,1
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,1
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,1
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,1
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,1


In [174]:
for label in cols[:-1]: # takes every category in the data frame
    plt.hist(df[df["class"] == 1][label], color = "blue", label = "gamma", alpha = 0.7, density = True) # this says to access everything in the data frame and extract where the class == 1
    plt.hist(df[df["class"] == 0][label], color = "red", label = "hadron", alpha = 0.7, density = True) 
    # note that alpha is transparency
    plt.title(label)
    plt.ylabel("probability")
    plt.xlabel(label)
    plt.legend()
    plt.show()

# Creating data sets
Here we create our training, our validation, and our tet data sets. 

In [175]:
train, valid, test = np.split(df.sample(frac=1), [int(0.6*len(df)), int(0.8*len(df))]) 
# This means that 0 to 60% of the data is train, 0.6 to 0.8 is valid, and 0.8 to 1 is test
# Note that int() here is casting to an integer to discretize the number of cols taken

  return bound(*args, **kwds)


In [176]:
def scale_dataset(dataframe, oversample = False): # oversample = False is default
    X = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    if(oversample): 
        ros = RandomOverSampler(); 
        X, y = ros.fit_resample(X, y); 
        # somehow this takes the lesser of the classes and resamples it unttil they match

    # note that X is a 2d object here
    data = np.hstack((X, np.reshape(y, (-1, 1)))) 
    # stacking X and y horizontally, but you must reshape y to fit the 2d X
    # using -1 here tells the computer to infer the first dimension of y, which could also be accomplished by len(y)

    return data, X, y

In [177]:
# notably, there is a large difference in the number of data points we have
print(len(train[train["class"]==1]))
print(len(train[train["class"]==0]))
# because there is many more 1 than 0, we will want to over sample 0 to match the amount of data
# to do so we use the imblearn random sampler

train, X_train, y_train = scale_dataset(train, oversample = True)

print(len(y_train))
print(sum(y_train == 1))
print(sum(y_train == 0))
# and we can see they are equal now

# note that for our actual test sets, we do not want to over sample because we want them to replicate real data
valid, X_valid, y_valid = scale_dataset(valid, oversample = False)
test, X_test, y_test = scale_dataset(test, oversample = False)

7405
4007
14810
7405
7405
