<a href="https://colab.research.google.com/github/kofimuad/zero_day/blob/master/fcc_MAGIC_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

Dataset:

The dataset comes from the UCI Machine Learning Repository (University of California, Irvine): [https://archive.ics.uci.edu/ml/datasets/magic+gamma+telescope](https://archive.ics.uci.edu/ml/datasets/magic+gamma+telescope). Donated by: P. Savicky, Institute of Computer Science, AS of CR, Czech Republic, savisky'@'cs.cas.cz


In [3]:
# Column names in file order: ten features followed by the class label.
# names= is passed explicitly, so pandas reads the first line of the file as data, not as a header.
# "fM3Long" follows the same f-prefix spelling as every other feature name.
cols = ["fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", "class"]
df = pd.read_csv("magic04.data", names=cols)
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,FM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


1. fLength: continuous # major axis of ellipse [mm]
2. fWidth: continuous # minor axis of ellipse [mm]
3. fSize: continuous # 10-log of sum of content of all pixels [in #phot]
4. fConc: continuous # ratio of sum of two highest pixels over fSize [ratio]
5. fConc1: continuous # ratio of highest pixel over fSize [ratio]
6. fAsym: continuous # distance from highest pixel to center, projected onto major axis [mm]
7. fM3Long: continuous # 3rd root of third moment along major axis [mm]
8. fM3Trans: continuous # 3rd root of third moment along minor axis [mm]
9. fAlpha: continuous # angle of major axis with vector to origin [deg]
10. fDist: continuous # distance from origin to center of ellipse [mm]
11. class: g,h # gamma (signal), hadron (background)


The items listed above are known as features: they are the values we pass into our model to help it predict the label, which in this case is the class column.

In [5]:
# Encode the label as an integer: 1 for gamma ("g"), 0 for any other value (hadron, "h").
# The dtype guard keeps the cell safe to re-run: once the column is integer-coded,
# comparing it with "g" again would silently turn every label into 0.
if not pd.api.types.is_integer_dtype(df["class"]):
  df["class"] = (df["class"] == "g").astype(int)

In [6]:
# Preview the first five rows to check the label column after encoding.
df.head(n=5)

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,FM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,1
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,1
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,1
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,1
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,1


In [None]:
# Overlay the gamma and hadron distributions for every feature (all columns except the label).
gamma_rows = df[df["class"] == 1]
hadron_rows = df[df["class"] == 0]

for feature in cols[:-1]:
  # density=True normalises each histogram separately, so the classes are comparable
  # even though they contain different numbers of rows.
  plt.hist(gamma_rows[feature], color='blue', label='gamma', alpha=0.7, density=True)
  plt.hist(hadron_rows[feature], color='red', label='hadron', alpha=0.7, density=True)
  plt.title(feature)
  plt.xlabel(feature)
  plt.ylabel("Probability")
  plt.legend()
  plt.show()

# Train, validation, test datasets

In [30]:
# Shuffle once with a fixed seed so the split, and every metric derived from it, is reproducible,
# then cut at 60% / 80% of the rows to obtain the train / validation / test frames.
# Positional slicing is used instead of np.split on a DataFrame, which relies on
# DataFrame.swapaxes (deprecated in recent pandas releases).
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(df))
valid_end = int(0.8 * len(df))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [18]:
def scale_dataset(dataframe, oversample = False, scaler = None, random_state = None):
  """Standardise the feature columns of ``dataframe`` and optionally balance its classes.

  The last column is treated as the label; every other column is a feature.

  Args:
    dataframe: pandas DataFrame whose final column holds the class label.
    oversample: when True, the scaled rows are resampled with RandomOverSampler
      so that the classes are balanced. Intended for the training split only.
    scaler: optional, already-fitted scaler exposing ``transform`` (for example a
      StandardScaler fitted on the training features). When given it is only
      applied, never re-fitted, so validation/test data can be scaled with the
      training statistics. When None (the default) a new StandardScaler is
      fitted on ``dataframe`` itself.
    random_state: seed forwarded to RandomOverSampler. None (the default) leaves
      the resampling unseeded.

  Returns:
    A ``(data, x, y)`` tuple: ``x`` is the scaled feature matrix, ``y`` the label
    vector, and ``data`` is ``x`` with ``y`` appended as its last column.
  """
  x = dataframe.iloc[:, :-1].values
  y = dataframe.iloc[:, -1].values

  if scaler is None:
    x = StandardScaler().fit_transform(x)
  else:
    # Reuse the caller's fitted statistics instead of re-estimating them on this split.
    x = scaler.transform(x)

  if oversample:
    x, y = RandomOverSampler(random_state=random_state).fit_resample(x, y)

  # y is reshaped to a column so it can be stacked next to the 2-D feature matrix.
  data = np.hstack((x, np.reshape(y, (-1, 1))))

  return data, x, y

In [31]:
# Standardise each split; only the training split is oversampled.
# NOTE: train/valid/test are rebound here from DataFrames to NumPy arrays, so this cell
# must be run exactly once after the split cell (a second run fails inside scale_dataset).
train, x_train, y_train = scale_dataset(train, oversample=True)
valid, x_valid, y_valid = scale_dataset(valid)
test, x_test, y_test = scale_dataset(test)

In [21]:
# Number of training labels after oversampling.
y_train.shape[0]

14718

In [22]:
# Count of gamma (label 1) rows in the training labels.
(y_train == 1).sum()

7359

In [23]:
# Count of hadron (label 0) rows in the training labels; should match the gamma count after oversampling.
(y_train == 0).sum()

7359

# kNN

In [37]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [33]:
# Fit a 1-nearest-neighbour classifier on the scaled, oversampled training data.
# fit() returns the estimator itself, so it can be chained; the bare name then displays it.
knn_model = KNeighborsClassifier(n_neighbors=1).fit(x_train, y_train)
knn_model

KNeighborsClassifier(n_neighbors=1)

In [34]:
# Predict a class for every row of the held-out test features.
y_pred = knn_model.predict(X=x_test)

In [35]:
# Display the predicted labels for the test set.
y_pred

array([0, 1, 0, ..., 0, 1, 1])

In [36]:
# Display the true test labels for a side-by-side look against the predictions.
y_test

array([0, 1, 0, ..., 1, 1, 1])

In [38]:
# classification_report returns a pre-formatted multi-line string,
# so it is printed rather than left as the cell's last expression.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.78      0.69      0.73      1339
           1       0.84      0.89      0.87      2465

    accuracy                           0.82      3804
   macro avg       0.81      0.79      0.80      3804
weighted avg       0.82      0.82      0.82      3804

