In [1]:
#Cargamos las librerías necesarias para el análisis
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import time 
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.signal import hilbert

In [None]:
# Load the 'HC' sheet and discard the two non-feature columns:
# 'n' (row numbering) and 'Suj' (subject ID).
df_hc = pd.read_excel('Database__writng_ML.xlsx', sheet_name='HC').drop(columns=['n', 'Suj'])
df_hc

In [None]:
# Load the 'PD_leftAS' sheet and discard the two non-feature columns:
# 'n' (row numbering) and 'Suj' (subject ID).
# NOTE(review): the variable is called df_R although the sheet name says "left" — confirm naming.
df_R = pd.read_excel('Database__writng_ML.xlsx', sheet_name='PD_leftAS').drop(columns=['n', 'Suj'])
df_R

In [None]:
# Stack both groups into one feature table with a fresh 0..N-1 index:
# PD rows first, HC rows second (the label vector must follow this order).
X = pd.concat([df_R, df_hc], ignore_index=True)
X

In [None]:
# Class labels, in the same order in which the groups are stacked in X
# (PD rows first, then HC rows): 1 = PD group, 2 = healthy controls.
# The group sizes are taken from the loaded frames instead of being hard-coded,
# so the label vector always has exactly one entry per row of X.
yr = np.ones(len(df_R))
yhc = np.full(len(df_hc), 2.0)
y = np.concatenate((yr, yhc))
y

In [None]:
# Attach the labels as column 'y' so the exploratory plots can split rows by group.
labels_as_list = y.tolist()
X['y'] = labels_as_list
# X_selected is a second name for the same DataFrame object, not a copy.
X_selected = X
X

In [None]:
# Line plots to review the extracted features, one panel per feature.
import warnings
warnings.filterwarnings('ignore')
# Split the rows by group to see whether outliers are related to a group.
LPD = X_selected.loc[X_selected['y'] == 1]
HCs = X_selected.loc[X_selected['y'] == 2]
# plt.figure (not plt.subplots): plt.subplots would also create one full-size Axes
# that stays underneath the 10x3 grid added below with plt.subplot.
plt.figure(figsize=(15,45))
sns.set_style('darkgrid')
plt.subplots_adjust(hspace=0.4, wspace=0.2)
# 'y' is the group label, not a feature, so it is left out; the 10x3 grid holds at most 30 panels.
feature_cols = LPD.columns.drop('y')[:30]
for i, col in enumerate(feature_cols, start=1):
    plt.subplot(10,3,i)
    sns.lineplot(data=LPD[col],color='r',label='LPD')
    sns.lineplot(data=HCs[col],color='g',label='HCs')
    plt.legend(loc='upper right')
    plt.title(col)

In [None]:
# Distribution (density + KDE) of the first 12 features, one panel per feature, both groups overlaid.
# plt.figure (not plt.subplots): plt.subplots would also create one full-size Axes
# that stays underneath the grid added below with plt.subplot.
plt.figure(figsize=(15,45))
sns.set_style('darkgrid')
plt.subplots_adjust(hspace=0.4, wspace=0.2)
# 'y' is the group label, not a feature, so it is left out of the panels.
feature_cols = LPD.columns.drop('y')[:12]
for i, col in enumerate(feature_cols, start=1):
    plt.subplot(10,3,i)
    sns.histplot(data=LPD[col],color="r", label="LPD", kde=True, stat="density", linewidth=0, element="poly", fill=False)
    sns.histplot(data=HCs[col],color="g", label="HCs", kde=True, stat="density", linewidth=0, element="step", fill=False)
    plt.legend(loc='upper right')
    plt.title(col)

In [None]:
# Remove the label column 'y' again so that X holds features only.
# The drop is done in place, so every other name bound to this DataFrame sees it too.
X.drop(columns=['y'], inplace=True)
X

In [None]:
# Standardize every feature column with StandardScaler and rebuild the DataFrame
# with the original column names.
# TODO (original note): also test the models without normalization.
# NOTE(review): the scaler is fitted on all rows, before the train/test split, so the
# test rows contribute to the scaling statistics — consider fitting on the training split only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, columns=X.columns)
X

In [None]:
# Feature importance based on a tree-based classifier (ExtraTrees).
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
# Fixed seed: ExtraTrees is randomized, so without it the importance ranking
# changes on every run and the plot cannot be reproduced.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, y)
# Raw importances, one value per column of X.
print(model.feature_importances_)
# Bar chart of the 20 most important features for easier reading.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(20).plot(kind='barh', figsize=(8,8))
plt.show()

In [None]:
# Pairwise correlations between all features
# (re-run after adding or removing highly correlated features).
corrmat = X.corr()
top_corr_features = corrmat.index
# Annotated heat map of the correlation matrix on a large canvas.
fig, ax = plt.subplots(figsize=(20,20))
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn", ax=ax)

In [None]:
# Data preprocessing is complete; the following cells train and evaluate the machine-learning models.

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm

In [None]:
# Split the data into training and test sets: 30% of the samples are held out for testing
# (test_size=0.3), with a fixed seed so the split is reproducible.
# Stratified on y so that both splits keep the class proportions of this small dataset.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_train

In [None]:
# SVM: fit an RBF-kernel and a degree-2 polynomial-kernel classifier on the training split
# and report accuracy and weighted F1 on the test split for BOTH kernels.
from sklearn import svm
import sklearn.model_selection as model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
# NOTE(review): C=0.00001 is an extremely strong regularization — confirm it is intended.
rbf = svm.SVC(kernel='rbf', gamma=10, C=0.00001).fit(X_train, Y_train)
poly = svm.SVC(kernel='poly', degree=2, C=0.00001).fit(X_train, Y_train)
poly_pred = poly.predict(X_test)
rbf_pred = rbf.predict(X_test)
poly_accuracy = accuracy_score(Y_test, poly_pred)
poly_f1 = f1_score(Y_test, poly_pred, average='weighted')
# The RBF predictions were previously computed but never scored; evaluate them as well.
rbf_accuracy = accuracy_score(Y_test, rbf_pred)
rbf_f1 = f1_score(Y_test, rbf_pred, average='weighted')
print('Accuracy (Polynomial Kernel): ', "%.2f" % (poly_accuracy*100))
print('F1 (Polynomial Kernel): ', "%.2f" % (poly_f1*100))
print('Accuracy (RBF Kernel): ', "%.2f" % (rbf_accuracy*100))
print('F1 (RBF Kernel): ', "%.2f" % (rbf_f1*100))

In [None]:
# KNN with 20 neighbours.
knn = KNeighborsClassifier(n_neighbors=20)
# Mean cross-validated accuracy on the training split (default CV settings).
cv_accuracies = cross_val_score(knn, X_train, Y_train, scoring='accuracy')
scores = np.mean(cv_accuracies)
# Fit the final model on the whole training split.
knn.fit(X_train, Y_train)
print(f"Accuracy: {scores:.2%}")

In [None]:
# Random Forest classifier: cross-validated accuracy, precision and recall on the training split.
from sklearn.model_selection import cross_validate
rf = RandomForestClassifier(random_state=21)
# One cross-validation run computes all three metrics on the same folds, instead of
# refitting the forest in three separate cross_val_score runs. The estimator is seeded
# and the default folds are not shuffled, so the scores are the same as before.
cv_results = cross_validate(rf, X_train, Y_train, scoring=['accuracy', 'precision', 'recall'])
score_rf = np.mean(cv_results['test_accuracy'])
p_score_rf = np.mean(cv_results['test_precision'])
r_score_rf = np.mean(cv_results['test_recall'])
# Fit the final model on the whole training split; later cells evaluate it on the test split.
rf.fit(X_train, Y_train)
print("Accuracy for RandomForest: %s" % '{:.2%}'.format(score_rf))
print("Precision for RandomForest: %s" % '{:.2%}'.format(p_score_rf))
print("Recall for RandomForest: %s" % '{:.2%}'.format(r_score_rf))

In [None]:
# Predict on the held-out test split with the fitted random forest; comparing this
# accuracy with the cross-validated one is a check for overfitting.
y_pred_test = rf.predict(X_test)
test_accuracy = accuracy_score(Y_test, y_pred_test)
test_accuracy

In [None]:
# Confusion matrix of the random forest on the test split.
# ConfusionMatrixDisplay replaces plot_confusion_matrix, which was removed from scikit-learn.
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
# Predict the labels of the test-set samples.
predicted_labels = rf.predict(X_test)
# Build the confusion matrix of the two-class problem; labels=rf.classes_ fixes the
# row/column order to the classifier's class order (1 then 2, i.e. LPD then HCs).
cnf_matrix = confusion_matrix(Y_test, predicted_labels, labels=rf.classes_)
print(cnf_matrix)
disp = ConfusionMatrixDisplay(confusion_matrix=cnf_matrix,
                              display_labels=['LPD','HCs'])
disp.plot(cmap=plt.cm.Blues)
plt.show()

In [None]:
# Print a summary of the training features (columns, dtypes, non-null counts).
X_train.info()

In [None]:
# Interactive 3-D scatter of three features of the training split, coloured by class label.
import plotly
import plotly.graph_objs as go

# Render Plotly figures inline in the notebook.
plotly.offline.init_notebook_mode()

# Marker appearance: small, slightly transparent points coloured by Y_train.
marker_style = dict(size=1, opacity=0.8, color=Y_train)

# One trace with the three chosen feature columns as the x, y and z axes.
trace = go.Scatter3d(
    x=X_train['FIRMA.6'],
    y=X_train['POLACO.8'],
    z=X_train['FIRMA.7'],
    mode='markers',
    marker=marker_style,
)

# Layout without margins so the plot fills the output area.
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))

data = [trace]
plot_figure = go.Figure(data=data, layout=layout)

# Render the plot.
plotly.offline.iplot(plot_figure)