In [None]:
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns 
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import LogisticRegression 




from sklearn.metrics import mean_squared_error, r2_score , accuracy_score
from sklearn.metrics import classification_report

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# PR y F1
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc

from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek


from sklearn import metrics
from collections import Counter 

# Read the input table from a CSV path relative to the working directory.
# NOTE(review): later cells treat the 'pid' column as the label and 'pt' as a feature;
# the full schema is not visible here — confirm against the file.
data = pd.read_csv("output/pid.csv")
# Bare last expression: Jupyter renders the frame as this cell's output.
data


In [None]:
# Quick look at the distribution of every numeric column in the table.
data.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
def corr_matrix(data):
    """Draw an annotated heatmap of the pairwise column correlations of ``data``.

    The correlations come from ``data.corr()``; each cell is annotated with
    its value rounded to one decimal place. Nothing is returned.
    """
    sns.heatmap(
        data.corr(),
        cmap="RdYlGn",
        cbar=True,
        annot=True,
        fmt=".1f",
    )


corr_matrix(data)

In [None]:
# Frequency of each distinct 'pid' value, most common first.
# Uses the Series method: the top-level pd.value_counts function is deprecated.
print(data['pid'].value_counts(sort=True))

In [None]:
# Collapse 'pid' into a binary label: every value other than 1 becomes 0.
# Rows already equal to 1 are left as they are, so no second assignment is needed.
# The column is modified in place, but re-running this cell gives the same result.
data.loc[data['pid'] != 1, "pid"] = 0



In [None]:
# Alternative selections kept for reference (filter on specific original pid codes):
# dataset = data[(data["pid"] == 101) | (data["pid"] == 1) | (data["pid"] == 106) | (data["pid"] == 27)]  # proton, pions, K, lambda
# dataset = data[(data["pid"] == 101) | (data["pid"] == 1) | (data["pid"] == 106)]

# Drop the first four rows by position (explicit .iloc instead of the implicit row slice).
# NOTE(review): the reason for skipping these four rows is not visible here — confirm.
dataset = data.iloc[4:]
# Class balance of the working subset (Series method; pd.value_counts is deprecated).
print(dataset['pid'].value_counts(sort=True))

In [None]:

# Kernel-density estimate of 'pt', one curve per 'pid' class, evaluated on [0, 100].
sns.displot(
    data=dataset,
    x="pt",
    hue="pid",
    kind="kde",
    clip=(0.0, 100.0),
)

In [None]:
#sns.pairplot(dataset, hue='pid', height=2.25)# hue separar por alguna categoría

In [None]:
# Features are every column except the label; the label is the binary 'pid' column.
X = dataset.drop(columns='pid')
Y = dataset['pid']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)



# LinearRegression

In [None]:
# Least-squares linear model fitted on the training split
# (X_train and Y_train must have the same number of rows).
regresion_lineal = LinearRegression()
regresion_lineal.fit(X_train, Y_train)

# Predictions are made from the held-out feature columns only.
Y_pred = regresion_lineal.predict(X_test)

# np.sqrt of the mean squared error is the ROOT mean squared error,
# so the printed label says so instead of claiming to be the plain MSE.
print("Root mean squared error: %.2f" % np.sqrt(mean_squared_error(Y_test, Y_pred)))
print("Coefficient of determination: %.2f" % r2_score(Y_test, Y_pred))



In [None]:
# Scatter of true labels (x) against the model's predictions (y).
fig, ax = plt.subplots()
ax.scatter(Y_test, Y_pred)
ax.set(
    xlabel='Actual Values',
    ylabel='Predicted Values',
    title='Actual vs Predicted Values',
)
plt.show()

# Regresión Logística

In [None]:
# Default-parameter alternative, kept for reference: lr = LogisticRegression()
# L2-penalised logistic regression with class_weight="balanced".
lr = LogisticRegression(
    penalty='l2',
    C=1.0,
    class_weight="balanced",
    solver="newton-cg",
    random_state=1,
)
lr.fit(X_train, Y_train)
Y_pred = lr.predict(X_test)

# Fitted intercept and per-feature coefficients.
print('beta_0:', lr.intercept_)
print('beta_1:', lr.coef_)

In [None]:
def run_model(X_train, X_test, Y_train, Y_test):
    """Fit an L2-penalised logistic regression on the training data and return it.

    ``X_test`` and ``Y_test`` are accepted but not used; only ``X_train`` and
    ``Y_train`` take part in fitting.
    """
    # Class-weighted variant, kept for reference:
    # LogisticRegression(C=1.0, penalty='l2', random_state=1, solver="newton-cg", class_weight="balanced")
    classifier = LogisticRegression(
        penalty='l2',
        C=1.0,
        solver="newton-cg",
        random_state=1,
    )
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return classifier.fit(X_train, Y_train)

In [None]:
def show_result(X_test, Y_test, Y_pred, clf=None):
    """Report test-set diagnostics for a fitted binary classifier.

    Draws the confusion matrix, prints the classification report together
    with the ROC AUC, the precision-recall AUC and the F1 score, and plots
    the ROC and precision-recall curves next to a no-skill baseline.

    Parameters
    ----------
    X_test : features used to obtain class probabilities from the classifier.
    Y_test : true labels; label 1 is counted as the positive class.
    Y_pred : hard class predictions for ``X_test``; used for the confusion
        matrix, the classification report and the F1 score.
    clf : optional fitted estimator exposing ``predict_proba``. When omitted,
        the notebook-level variable ``model`` is used, which is what this
        function always did before the parameter existed.

    Raises
    ------
    NameError
        If ``clf`` is omitted and no global ``model`` has been defined yet.
    """
    if clf is None:
        # Backward-compatible fallback to the notebook's global classifier.
        clf = model

    # No-skill reference: the same constant score for every sample.
    ns_probs = np.zeros(len(Y_test))
    # Keep the second probability column (class 1 when the labels are 0/1).
    lr_probs = clf.predict_proba(X_test)[:, 1]

    # ROC: area under the curve plus the curve points for both scorers.
    roc_auc = roc_auc_score(Y_test, lr_probs)
    ns_fpr, ns_tpr, _ = roc_curve(Y_test, ns_probs)
    lr_fpr, lr_tpr, _ = roc_curve(Y_test, lr_probs)

    conf_matrix = metrics.confusion_matrix(Y_test, Y_pred)
    plt.figure(figsize=(5, 5))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap='Blues_r')
    plt.title("Confusion matrix")
    plt.ylabel('True class')
    plt.xlabel('Predicted class')
    plt.show()
    print(classification_report(Y_test, Y_pred))

    # Precision-recall curve and its area, kept separate from the ROC AUC so
    # neither value overwrites the other.
    lr_precision, lr_recall, _ = precision_recall_curve(Y_test, lr_probs)
    pr_auc = auc(lr_recall, lr_precision)
    lr_f1 = f1_score(Y_test, Y_pred)
    print('Regresión Logística: ROC AUC=%.3f' % (roc_auc))
    # The "auc" on this line is the area under the precision-recall curve.
    print('Regresión Logística: auc=%.3f f1=%.3f ' % (pr_auc, lr_f1))

    # Share of positive samples: drawn as the horizontal no-skill line
    # on the precision-recall panel.
    no_train = float(np.mean(np.asarray(Y_test) == 1))

    # Two panels side by side (a 2x2 grid would leave its bottom row empty).
    fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(15, 5))

    ax_roc.plot(ns_fpr, ns_tpr, linestyle='--', label='Sin entrenar')
    ax_roc.plot(lr_fpr, lr_tpr, marker='.', label='Regresión Logística')
    ax_roc.set_xlabel('Falsos Positivos')
    ax_roc.set_ylabel('Verdaderos Positivos')
    ax_roc.legend()

    ax_pr.plot([0, 1], [no_train, no_train], linestyle='--', label='Sin entrenar')
    ax_pr.plot(lr_recall, lr_precision, marker='.', label='Regresión Logística')
    ax_pr.set_xlabel('Sensibilidad')
    ax_pr.set_ylabel('Precisión')
    ax_pr.legend()
    plt.show()



In [None]:
# Baseline: logistic regression fitted on the training split as-is (no resampling).
model = run_model(X_train, X_test, Y_train, Y_test)
# Evaluate THIS model's predictions. Previously they were stored under another
# name and the stale Y_pred from the earlier class-weighted `lr` fit was passed on,
# so the confusion matrix, report and F1 described a different model.
Y_pred = model.predict(X_test)
show_result(X_test, Y_test, Y_pred)

In [None]:
# Resample the training split only; the test split is left untouched.
# A fixed seed makes the synthetic samples, and every metric derived from them,
# repeatable across kernel restarts.
os_us = SMOTE(random_state=42)
X_train_res, Y_train_res = os_us.fit_resample(X_train, Y_train)

# Class counts before and after resampling.
print("before resampling {}".format(Counter(Y_train)))
print("after resampling {}".format(Counter(Y_train_res)))

# Refit on the resampled data and evaluate on the original test split.
model = run_model(X_train_res, X_test, Y_train_res, Y_test)
Y_pred = model.predict(X_test)
show_result(X_test, Y_test, Y_pred)