<a href="https://colab.research.google.com/github/kosiyyu/ml/blob/main/magic_gamma_telescope.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 # Data set

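Download the [zip](https://archive.ics.uci.edu/dataset/159/magic+gamma+telescope) archive from the UCI repository and place `magic04.data` from it next to this notebook (the loading cell reads it by relative path).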

  1.  fLength:  continuous  # major axis of ellipse [mm]
  2.  fWidth:   continuous  # minor axis of ellipse [mm]
  3.  fSize:    continuous  # 10-log of sum of content of all pixels [in #phot]
  4.  fConc:    continuous  # ratio of sum of two highest pixels over fSize  [ratio]
  5.  fConc1:   continuous  # ratio of highest pixel over fSize  [ratio]
  6.  fAsym:    continuous  # distance from highest pixel to center, projected onto major axis [mm]
  7.  fM3Long:  continuous  # 3rd root of third moment along major axis  [mm]
  8.  fM3Trans: continuous  # 3rd root of third moment along minor axis  [mm]
  9.  fAlpha:   continuous  # angle of major axis with vector to origin [deg]
  10.  fDist:    continuous  # distance from origin to center of ellipse [mm]
  11.  class:    g,h         # gamma (signal), hadron (background)

  g = gamma (signal):     12332
  h = hadron (background): 6688

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [None]:
# magic04.data has no header row and 11 comma-separated fields per line.
# All 11 names must be listed: with only 10 names pandas silently uses the
# first field as the index and every feature label is shifted by one column.
cols = ['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'class']
df = pd.read_csv('magic04.data', names=cols)
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [None]:
# Inspect the distinct label values present in the 'class' column.
df['class'].unique()

array(['g', 'h'], dtype=object)

In [None]:
# Encode the label as an integer: g (gamma) = 1, h (hadron) = 0.
# Guarded on dtype so the cell is safe to re-run: once the column is already
# integer-encoded, comparing it to 'g' again would turn every label into 0.
if not pd.api.types.is_integer_dtype(df['class']):
  df['class'] = (df['class'] == 'g').astype(int) # g = 1, h = 0

In [None]:
# Disabled exploratory plot: one overlaid histogram per feature column,
# class 1 (gamma) in blue and class 0 (hadron) in red. Uncomment to render.
# for label in cols[:-1]:
#   plt.hist(df[df['class'] == 1][label], color='blue', label='gamma', alpha=0.7, density=True)
#   plt.hist(df[df['class'] == 0][label], color='red', label='hadron', alpha=0.7, density=True)
#   plt.title(label)
#   plt.ylabel("Probability")
#   plt.xlabel(label)
#   plt.legend()
#   plt.show()

# Train, validation, and test data split

In [None]:
# Shuffle once with a fixed seed so the split is reproducible across runs,
# then cut by position: first 60% -> train, next 20% -> validation, last 20% -> test.
# Plain .iloc slicing is used instead of np.split, which emits a pandas
# deprecation warning when handed a DataFrame.
shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
train_end = int(0.6 * len(shuffled))
valid_end = int(0.8 * len(shuffled))
pre_train = shuffled.iloc[:train_end]
pre_valid = shuffled.iloc[train_end:valid_end]
pre_test = shuffled.iloc[valid_end:]

  return bound(*args, **kwds)


In [None]:
def scale_dataset(dataframe, oversample=False, scaler=None, random_state=None):
  """Standardize the feature columns of `dataframe` and optionally balance the classes.

  The last column is treated as the label; every other column is a feature.

  Args:
    dataframe: DataFrame whose last column holds the class labels.
    oversample: If True, resample X and y with RandomOverSampler after scaling.
    scaler: Optional, already-fitted object exposing `transform`. When given it
      is applied as-is (nothing is re-fitted), so validation/test splits can be
      scaled with the statistics learned from the training split. When None, a
      new StandardScaler is fitted on this dataframe (the original behaviour).
    random_state: Seed forwarded to RandomOverSampler for reproducible
      oversampling. None keeps the unseeded default.

  Returns:
    Tuple (data, X, y): X is the scaled feature matrix, y the label vector and
    data is X with y appended as its last column.
  """
  X = dataframe.iloc[:, :-1].values # matrix of all columns except the class column
  y = dataframe.iloc[:, -1].values # vector of classes

  if scaler is None:
    # No scaler supplied: fit one on this split only.
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
  else:
    # Reuse the caller's fitted scaler so this split is not used to fit anything.
    X = scaler.transform(X)

  if oversample:
    ros = RandomOverSampler(random_state=random_state)
    X, y = ros.fit_resample(X, y)

  # y becomes a single column so it can be stacked to the right of X.
  data = np.hstack((X, np.reshape(y, (len(y), 1))))

  return data, X, y

In [None]:
# Row counts in the training split per label: class 1 first, then class 0.
for class_value in (1, 0):
  print(int((pre_train['class'] == class_value).sum()))

7384
4028


In [None]:
# Only the training split is oversampled; validation and test rely on the
# oversample=False default and keep their original class ratio.
# NOTE(review): scale_dataset fits a fresh StandardScaler on whatever frame it
# receives, so each split here is standardized with its own statistics.
train, X_train, y_train = scale_dataset(pre_train, oversample=True)
valid, X_valid, y_valid = scale_dataset(pre_valid)
test, X_test, y_test = scale_dataset(pre_test)

# kNN model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [None]:
# 3-nearest-neighbours classifier fitted on the training features and labels.
# fit() returns the estimator itself, so the chained form binds the same object.
knn_model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
knn_model

In [None]:
# Predict labels for the test features with the fitted kNN model.
y_pred = knn_model.predict(X_test)

In [None]:
# Per-class precision / recall / f1 of the kNN predictions against the test labels.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.72      0.73      1347
           1       0.85      0.86      0.86      2457

    accuracy                           0.81      3804
   macro avg       0.79      0.79      0.79      3804
weighted avg       0.81      0.81      0.81      3804

