In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer,accuracy_score,precision_score,recall_score,f1_score
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from joblib import dump, load

### Data Ingestion

In [70]:
# Read data files
genetic_data = pd.read_csv('../data/01_raw/alzheimerGeneticValuesSmall.csv', header=0,index_col=0).T
phenotypes_numeric = pd.read_csv('../data/01_raw/phenotypesNumeric2.txt', sep=' ', header=None)
phenotypes_numeric = phenotypes_numeric.iloc[:, [0, 1]].rename(columns={0: '', 1: 'Diag'}).set_index('')



In [49]:
genetic_data

Unnamed: 0,rs3094315,rs3115860,rs12562034,rs12124819,rs4475691,rs28705211,rs13303118,rs3934834,rs9442372,rs3737728,...,rs518769,rs3765651,rs2034453,rs6700161,rs10493540,rs473834,rs3765660,rs1417888,rs12404339,rs12122147
002_S_0295,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
002_S_0413,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.0,2.0,0.0,2.0,2.0,1.0,0.0,2.0
002_S_0559,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
002_S_0619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
002_S_0685,1.0,,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941_S_1202,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
941_S_1203,0.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,2.0,2.0,...,0.0,1.0,0.0,2.0,0.0,0.0,2.0,0.0,2.0,0.0
941_S_1295,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
941_S_1311,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Feature Engineering

In [71]:
# Handling missing values
mode_values = genetic_data.mode().iloc[0]
genetic_data.fillna(mode_values, inplace=True)

In [72]:
# Data type conversion
genetic_data = genetic_data.astype('int8')

In [73]:
genetic_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 756 entries, 002_S_0295 to 941_S_1363
Columns: 10000 entries, rs3094315 to rs12122147
dtypes: int8(10000)
memory usage: 7.2+ MB


In [74]:
# Filter and relabel diagnosis
genetic_data = genetic_data[phenotypes_numeric['Diag'].isin([1, 3])]
phenotypes_numeric = phenotypes_numeric[phenotypes_numeric['Diag'].isin([1, 3])]
phenotypes_numeric[1] = phenotypes_numeric['Diag'].map({1: 0, 3: 1})

In [67]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(genetic_data, 
                                                    phenotypes_numeric[1], 
                                                    test_size=0.2, 
                                                    random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [756, 389]

### Data modeling

In [20]:
scoring = {'accuracy' : make_scorer(accuracy_score),
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score),
           'f1_score' : make_scorer(f1_score),
           'specificity':make_scorer(recall_score, pos_label = 0)}

param_grid = {
    'n_estimators': [ 250, 500, 1000 ],
    'max_features': [0.01*X_train.shape[1], round(2*np.sqrt(X_train.shape[1])), round(X_train.shape[1]*0.1), round(X_train.shape[1]*0.5)],
    'max_depth': [7,9,11,13]
}

# Defina o valor desejado para zero_division
zero_division_value = np.nan 

f1_scorer = make_scorer(f1_score, zero_division=zero_division_value)

Training Random Search 

In [21]:
rf_rs = RandomizedSearchCV(RandomForestClassifier(random_state=42), param_grid, scoring=f1_scorer, n_jobs=-1, cv=5, verbose=10, random_state=42)
print('Iniciando treino:  ')
rf_rs.fit(X_train, y_train)

Iniciando treino:  
Fitting 5 folds for each of 10 candidates, totalling 50 fits


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\alves\OneDrive\Documents\Estudos\Palestra - Kedro\Teste 1 - 23Mar\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\alves\OneDrive\Documents\Estudos\Palestra - Kedro\Teste 1 - 23Mar\venv\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\alves\OneDrive\Documents\Estudos\Palestra - Kedro\Teste 1 - 23Mar\venv\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constr

In [23]:
#Melhor combinação de hiperparâmetros
print("Melhor combinação de hiperparâmetros:", rf_rs.best_params_)

# Melhor modelo
best_model = rf_rs.best_estimator_

Melhor combinação de hiperparâmetros: {'n_estimators': 250, 'max_features': 200, 'max_depth': 11}


### Validating model

In [25]:
y_predict = best_model.predict(X_test)

In [26]:

print("Avaliações Random Forest  (CN vs AD) - gridsearch 5 folds")
print("------------------------------------------------------")
print('Score (Acurácia): ')
print(accuracy_score(y_test,y_predict) )
print("------------------------------------------------------")
print('Precision Score (tp / (tp + fp)):')
print(precision_score(y_test,y_predict))
print("------------------------------------------------------")
print('Recall Score (tp / (tp + fn)):')
print(recall_score(y_test,y_predict))
print("------------------------------------------------------")
print('F1 Score (F1 = 2 * (precision * recall) / (precision + recall) ):')
print(f1_score(y_test,y_predict))

Avaliações Random Forest  (CN vs AD) - gridsearch 5 folds
------------------------------------------------------
Score (Acurácia): 
0.5512820512820513
------------------------------------------------------
Precision Score (tp / (tp + fp)):
0.5
------------------------------------------------------
Recall Score (tp / (tp + fn)):
0.17142857142857143
------------------------------------------------------
F1 Score (F1 = 2 * (precision * recall) / (precision + recall) ):
0.2553191489361702


### Feature importance

In [29]:
# Obter a importância das características
feature_importances = best_model.feature_importances_

# Obter os nomes das características do DataFrame X_train
feature_names = X_train.columns

# Associar cada importância com seu respectivo nome de característica e ordenar
sorted_features = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)


In [30]:
print("20 SNPs most important:")
for feature_name, importance in sorted_features[:20]:
    print(f"{feature_name}: {importance:.2e}")

20 SNPs most important:
rs10493490: 2.05e-03
rs10157715: 1.99e-03
rs1796915: 1.65e-03
rs4660928: 1.49e-03
rs12145293: 1.42e-03
rs4908404: 1.32e-03
rs9793263: 1.30e-03
rs4233262: 1.27e-03
rs12565229: 1.27e-03
rs12133034: 1.19e-03
rs4912187: 1.17e-03
rs332822: 1.16e-03
rs649859: 1.16e-03
rs4926355: 1.13e-03
rs4436359: 1.10e-03
rs10493258: 1.08e-03
rs4915810: 1.06e-03
rs1191758: 1.05e-03
rs3863641: 1.05e-03
rs10917477: 1.01e-03


### Saving model 

In [24]:
from joblib import dump, load

# Salvar o melhor modelo
dump(rf_rs.best_estimator_, 'melhor_modelo.joblib')

['melhor_modelo.joblib']