In [28]:
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import numpy as np
from time import time

# Import data

In [31]:
def read_file(url):

    url = url + "?raw=true"
    df = pd.read_csv(url, encoding='cp1252', sep=',', low_memory=False)
    return df

In [32]:
url = "https://github.com/hannahgathu/Thesis-Data-Visualisations/blob/main/Data/dielectron_classification.csv"

df= read_file(url)

# Train classifier

In [33]:
#split data
y=df.M>21.3
#additional to dropping the endpoints we also drop the run and event numbers, because those aren't predictors
X=df.drop(columns=['M>21.3','M','Run','Event'])
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=42)

#train rf
rf = RandomForestClassifier()
t0 = time()
rf.fit(X_train, y_train)
print ('training time: ', round(time()-t0, 3), 's')

#calculate accuracy
t1=time()
pred_rf = rf.predict(X_test)
print ('predicting time: ', round(time()-t1, 3), 's')
print("Accuracy:",metrics.accuracy_score(y_test, pred_rf))

training time:  22.818 s
predicting time:  0.471 s
Accuracy: 0.9582666666666667


# Add noise

In [34]:
def add_noise (mean, std, data):
    for i in range(data.shape[0]):
        data.at[i,'M']=data.iloc[i].loc['M']+np.random.normal(0,3)
    return data

In [38]:
df_noise=add_noise(0,8,df)

# Transform to classification problem

In [39]:
    for i in range(df_noise.shape[0]):
        df_noise.at[i,'M>21.3']=int(df_noise.iloc[i].loc['M']>21.3)

# Train classifier on noisy data

In [40]:
#split data
y_noise=df_noise.M>21.3
#additional to dropping the endpoints we also drop the run and event numbers, because those aren't predictors
X=df.drop(columns=['M>21.3','M','Run','Event'])
X_train_noise, X_test_noise, y_train_noise, y_test_noise = train_test_split(X,y_noise,test_size=0.3, random_state=42)

#train rf
rf = RandomForestClassifier()
t0 = time()
rf.fit(X_train_noise, y_train_noise)
print ('training time: ', round(time()-t0, 3), 's')

#calculate accuracy
t1=time()
pred_rf_noise = rf.predict(X_test_noise)
print ('predicting time: ', round(time()-t1, 3), 's')
print("Accuracy:",metrics.accuracy_score(y_test_noise, pred_rf_noise))

training time:  23.629 s
predicting time:  0.532 s
Accuracy: 0.9069
