In [2]:
# Generating QSAR predictions for Fe₃O₄NPs using RandomForestRegressor
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# 1-2. LOAD THE REAL DATA FROM THE CSV AND CLEAN IT
# Read the table (path is relative to the working directory) and discard
# rows in which every field is empty.
data = pd.read_csv("NanoTox _unidas.csv").dropna(how='all')

# Report the available columns and preview the first rows.
print("Columnas disponibles en el dataset:")
print(list(data.columns))
print("\nPrimeras filas del dataset:")
print(data.head())

# 3. PREPARE THE BASE DATA
# Per the original author's note the CSV holds: NPs, hydrosize, surfcharge,
# surfarea, Ec, Expotime, dosage, e, NOxygen, class.
# Keep only rows that name a nanoparticle type; work on an independent copy.
base_data = data.dropna(subset=['NPs']).copy()

# 4. Fe₃O₄NP DESCRIPTORS TAKEN FROM PAPER1
# Author's notes on the paper: diameter 19.07 nm, hydrodynamic size 48.7 nm,
# zeta potential -39.9 mV; coating polyvinylpyrrolidone (PVP) in 2 mM citrate
# buffer.
# Single-row frame (one value per column) describing the particle to predict.
new_np = pd.DataFrame(
    {
        'hydrosize': [48.7],     # hydrodynamic size reported in the paper
        'surfcharge': [-39.9],   # zeta potential reported in the paper
        'surfarea': [64.7],      # estimate based on similar NPs in the dataset
        'NPs': ['Fe3O4'],        # nanoparticle type
    }
)

# 5. DEFINE FEATURES AND TARGETS
# Model inputs (descriptors) and the columns that are predicted one by one.
descriptor_cols = ['hydrosize', 'surfcharge', 'surfarea', 'NPs']
target_cols = ['Ec', 'Expotime', 'dosage', 'e', 'NOxygen', 'class']

# Check that the expected columns really exist in the loaded table.
missing_cols = [col for col in descriptor_cols + target_cols if col not in base_data.columns]
if missing_cols:
    print(f"\nAdvertencia: Columnas faltantes: {missing_cols}")

    # Every model needs all descriptors, so a missing descriptor is fatal:
    # fail here with a clear message instead of a KeyError deep in the loop.
    missing_descriptors = [col for col in descriptor_cols if col in missing_cols]
    if missing_descriptors:
        raise KeyError(
            f"Faltan columnas descriptoras requeridas: {missing_descriptors}"
        )

    # A missing target only means that one property cannot be predicted;
    # drop it so the remaining targets are still processed.
    target_cols = [col for col in target_cols if col not in missing_cols]

# 6. PREPROCESSING PIPELINE
# Numeric descriptors are standardised; the nanoparticle type is one-hot
# encoded, with categories unseen during fitting encoded as all zeros
# (handle_unknown='ignore') rather than raising.
numeric_features = ['hydrosize', 'surfcharge', 'surfarea']
categorical_features = ['NPs']

numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# 7. TRAIN ONE MODEL PER TARGET AND PREDICT FOR THE NEW PARTICLE
# Maps each target column to the value predicted for `new_np`.
predicted_values = {}

print("\n" + "=" * 60, "PREDICCIONES PARA Fe₃O₄NPs DEL PAPER1:", "=" * 60, sep="\n")

for target in target_cols:
    # Only rows where this target is known can be used for training.
    valid_data = base_data.dropna(subset=[target]).copy()

    if len(valid_data) >= 10:
        X, y = valid_data[descriptor_cols], valid_data[target]

        # Preprocessing followed by a seeded random forest; note the same
        # `preprocessor` object is reused (and refitted) for every target.
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
        ])

        # fit() returns the pipeline itself; predict() yields a length-1
        # array because `new_np` has a single row.
        prediction = pipeline.fit(X, y).predict(new_np)[0]
        predicted_values[target] = prediction

        print(f"\n{target}: {prediction:.4f}")
    else:
        # Too few samples: warn and leave this target without a prediction.
        print(f"\nAdvertencia: Pocos datos para {target} ({len(valid_data)} muestras)")

# 8. BUILD THE FULL ROW: DESCRIPTORS + PREDICTIONS + PAPER METADATA
# Extra information the author recorded from the paper.
paper_metadata = {
    'diameter': 19.07,                       # from the paper
    'coating': 'PVP',
    'medium': '2mM_citrate',
    'concentration_range': '1-100 µg/ml',    # from the paper
    'cell_type': 'D384_astrocytes',          # most susceptible per the author's note
}
# assign() returns a new frame, so `new_np` itself is left untouched; columns
# are appended in order: predictions first, then the metadata.
new_row = new_np.assign(**predicted_values, **paper_metadata)

# 9. SAVE THE RESULTS
output_file = "Fe3O4NPs_QSAR_predictions.csv"
new_row.to_csv(output_file, index=False)
print("\n" + "=" * 60, f"Resultados guardados en: {output_file}", "=" * 60, sep="\n")

# 10. SHOW THE FULL SUMMARY
# Transposed so the single row is displayed as a property/value listing.
print("\n" + "=" * 60, "RESUMEN DE PROPIEDADES PREDICHAS:", "=" * 60, sep="\n")
print(new_row.T)

# 11. ADDITIONAL ANALYSIS: compare against similar NPs already in the dataset
print("\n" + "=" * 60, "COMPARACIÓN CON Fe2O3 NPs EN EL DATASET:", "=" * 60, sep="\n")

# Select every row whose NP name contains "Fe" (missing names never match).
is_iron_np = base_data['NPs'].str.contains('Fe', na=False)
fe_nps = base_data[is_iron_np]

if len(fe_nps) == 0:
    print("\nNo se encontraron NPs de hierro en el dataset para comparación")
else:
    print(f"\nNúmero de muestras de Fe NPs: {len(fe_nps)}")
    print("\nEstadísticas de Fe NPs en el dataset:")
    print(fe_nps[['hydrosize', 'surfcharge', 'surfarea', 'class']].describe())

# 12. TOXICITY PREDICTION
print("\n" + "=" * 60, "INTERPRETACIÓN DE TOXICIDAD:", "=" * 60, sep="\n")

if 'class' in predicted_values:
    toxicity_class = predicted_values['class']
    print(f"\nClase de toxicidad predicha: {toxicity_class:.2f}")
    print("\nInterpretación (0=no tóxico, 1=tóxico):")

    # Exclusive upper bounds checked in order; the first band the score falls
    # under wins, and anything at or above the last bound is reported as high.
    toxicity_bands = (
        (0.3, "✓ Baja toxicidad esperada"),
        (0.7, "⚠ Toxicidad moderada esperada"),
    )
    print(next(
        (message for upper, message in toxicity_bands if toxicity_class < upper),
        "✗ Alta toxicidad esperada",
    ))

# Closing note with the findings the author quotes from PAPER1.
print(
    "\n" + "=" * 60,
    "NOTA: Según el PAPER1, Fe₃O₄NPs mostraron:",
    "- Mayor toxicidad en astrocitos D384 que en neuronas SH-SY5Y",
    "- Efectos dependientes de concentración y tiempo",
    "- Concentración crítica: 25 µg/ml a 4h, 1 µg/ml a 48h (D384)",
    "=" * 60,
    sep="\n",
)

Columnas disponibles en el dataset:
['NPs', 'hydrosize', 'surfcharge', 'surfarea', 'Ec', 'Expotime', 'dosage', 'e', 'NOxygen', 'class']

Primeras filas del dataset:
     NPs  hydrosize  surfcharge  surfarea    Ec  Expotime  dosage     e  \
0  Al2O3      267.0        36.3      64.7 -1.51      24.0   0.001  1.61   
1  Al2O3      267.0        36.3      64.7 -1.51      24.0   0.010  1.61   
2  Al2O3      267.0        36.3      64.7 -1.51      24.0   0.100  1.61   
3  Al2O3      267.0        36.3      64.7 -1.51      24.0   1.000  1.61   
4  Al2O3      267.0        36.3      64.7 -1.51      24.0   5.000  1.61   

   NOxygen  class  
0      3.0    0.0  
1      3.0    0.0  
2      3.0    0.0  
3      3.0    0.0  
4      3.0    0.0  

PREDICCIONES PARA Fe₃O₄NPs DEL PAPER1:

Ec: -4.0385

Expotime: 44.2629

dosage: 40.9308

e: 1.5807

NOxygen: 2.5600

class: 0.3644

Resultados guardados en: Fe3O4NPs_QSAR_predictions.csv

RESUMEN DE PROPIEDADES PREDICHAS:
                                   0
hydr

In [3]:
# feature_importance_analysis.py
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.stats import pearsonr, spearmanr
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression

# NOTE(review): machine-specific absolute path; the earlier cell reads a file
# with the same name through a relative path. Consider a shared, configurable
# data directory. The summary CSV is later written next to this file.
fp = Path(r"c:\Users\Lenovo\OneDrive\TESIS\codigo\NanoTox _unidas.csv")
df = pd.read_csv(fp)

# Every column except the NP label and the target is treated as a predictor.
target = 'class'
exclude = ['NPs', target]
predictors = [name for name in df.columns if name not in exclude]

# Coerce predictors to numeric (unparseable entries become NaN), drop rows
# without a target value, then store the target as integers.
df = (
    df.assign(**{name: pd.to_numeric(df[name], errors='coerce') for name in predictors})
      .dropna(subset=[target])
      .astype({target: int})
)

# Feature matrix with remaining gaps filled by each column's median, and the
# target as a plain array.
X = df[predictors].fillna(df[predictors].median())
y = df[target].to_numpy()

# Pearson & Spearman correlation of every predictor with the target.
# Each entry is indexable as [0] = coefficient, [1] = p-value.
pearson = {}
spearman = {}
for c in predictors:
    col = X[c].values
    # Best effort: a failing column is recorded as NaN so the rest of the
    # analysis still runs, but the failure is reported rather than silently
    # swallowed. Catching Exception (not a bare except) lets
    # KeyboardInterrupt / SystemExit propagate.
    try:
        pearson[c] = pearsonr(col, y)
    except Exception as exc:
        print(f"Warning: Pearson correlation failed for {c!r}: {exc}")
        pearson[c] = (np.nan, np.nan)
    try:
        sp = spearmanr(col, y)
        spearman[c] = (sp.correlation, sp.pvalue)
    except Exception as exc:
        print(f"Warning: Spearman correlation failed for {c!r}: {exc}")
        spearman[c] = (np.nan, np.nan)

# Mutual information between each predictor and the class label (seeded).
mi = {
    name: score
    for name, score in zip(
        predictors,
        mutual_info_classif(X, y, discrete_features='auto', random_state=0),
    )
}

# Impurity-based importances from a class-balanced, seeded random forest.
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=0,
    class_weight='balanced',
).fit(X, y)
rf_imp = {name: score for name, score in zip(predictors, rf.feature_importances_)}

# Permutation importance of that forest, averaged over 30 shuffles.
# It is evaluated on the same X, y the forest was fitted on.
perm = permutation_importance(rf, X, y, n_repeats=30, random_state=0, n_jobs=1)
perm_imp = {name: score for name, score in zip(predictors, perm.importances_mean)}

# VIF (variance inflation factor): regress each predictor on all the others
# and convert the resulting R^2 into 1 / (1 - R^2).
vif = {}
for col in predictors:
    others = X.drop(columns=[col])
    if others.shape[1] == 0:
        # With a single predictor there is nothing to be collinear with
        # (R^2 = 0), and fitting on a zero-column matrix would raise.
        vif[col] = 1.0
        continue
    X_i = others.values
    y_i = X[col].values
    r2 = LinearRegression().fit(X_i, y_i).score(X_i, y_i)
    # An R^2 this close to 1 is treated as perfect collinearity; report
    # infinity instead of dividing by a near-zero number.
    vif[col] = 1.0 / (1.0 - r2) if r2 < 0.9999 else np.inf

# Normalize metrics and combine
def normalize(d, keys=None):
    """Min-max scale the values of ``d`` to the range [0, 1].

    Parameters
    ----------
    d : mapping
        Metric values keyed by feature name. Keys that are missing, or whose
        value is NaN, are treated as 0.0 before scaling.
    keys : sequence, optional
        Feature names to include, in output order. Defaults to the
        module-level ``predictors`` list, which was the only behaviour before
        this parameter existed.

    Returns
    -------
    dict
        One entry per key. All 0.0 when every value is (close to) zero, all
        1.0 when every value is the same non-zero number, otherwise
        ``(value - min) / (max - min)``.
    """
    if keys is None:
        keys = predictors
    keys = list(keys)

    vals = np.array(
        [d[k] if k in d and not np.isnan(d[k]) else 0.0 for k in keys],
        dtype=float,
    )
    # Nothing to rank: every metric value is effectively zero.
    if np.allclose(vals, 0):
        return {k: 0.0 for k in keys}

    vmin, vmax = vals.min(), vals.max()
    # Constant non-zero values would divide by zero; treat them all as maximal.
    if vmax - vmin == 0:
        return {k: 1.0 for k in keys}

    norm = (vals - vmin) / (vmax - vmin)
    return dict(zip(keys, norm))

# Correlation strength regardless of sign; an undefined (NaN) coefficient
# counts as no association.
pearson_abs = {k: 0.0 if np.isnan(pearson[k][0]) else abs(pearson[k][0]) for k in predictors}
spearman_abs = {k: 0.0 if np.isnan(spearman[k][0]) else abs(spearman[k][0]) for k in predictors}

# Bring every metric onto a common scale.
norm_p, norm_s = normalize(pearson_abs), normalize(spearman_abs)
norm_mi, norm_rf, norm_perm = normalize(mi), normalize(rf_imp), normalize(perm_imp)

# Invert VIF so that less collinear features score higher; an infinite VIF
# maps to 0.
inv_vif = {k: 0.0 if np.isinf(vif[k]) else 1.0/(vif[k]+1.0) for k in predictors}
norm_vif = normalize(inv_vif)

# Weights in metric order: pearson, spearman, MI, RF importance,
# permutation importance, inverse VIF.
weights = np.array([1.0,1.0,1.2,1.5,1.5,0.8])
normalized_metrics = (norm_p, norm_s, norm_mi, norm_rf, norm_perm, norm_vif)

# Weighted mean of the normalised metrics for each predictor.
combined = {
    k: float(
        np.dot(np.array([metric[k] for metric in normalized_metrics]), weights)
        / weights.sum()
    )
    for k in predictors
}

# Assemble the summary columns in display order: coefficient and p-value for
# each correlation, then the single-valued metrics.
summary_columns = {}
for label, table in (('pearson', pearson), ('spearman', spearman)):
    summary_columns[f'{label}_corr'] = [table[name][0] for name in predictors]
    summary_columns[f'{label}_p'] = [table[name][1] for name in predictors]
for label, table in (
    ('mi', mi),
    ('rf_imp', rf_imp),
    ('perm_imp', perm_imp),
    ('vif', vif),
    ('combined', combined),
):
    summary_columns[label] = [table[name] for name in predictors]

# One row per predictor, ranked by the combined score (best first).
out = pd.DataFrame(summary_columns, index=predictors).sort_values('combined', ascending=False)

# Write the table next to the input CSV and echo it.
summary_path = fp.parent / 'feature_importance_summary.csv'
out.to_csv(summary_path)
print('Wrote', summary_path)
print(out)

Wrote c:\Users\Lenovo\OneDrive\TESIS\codigo\feature_importance_summary.csv
            pearson_corr      pearson_p  spearman_corr     spearman_p  \
dosage          0.176943   4.752282e-11       0.384852   2.362825e-49   
NOxygen        -0.538880  1.663235e-103      -0.569070  8.464748e-118   
surfarea       -0.337319   1.274037e-37      -0.419670   2.850035e-59   
hydrosize      -0.243075   8.814144e-20      -0.164138   1.090261e-09   
e               0.359135   9.277871e-43       0.482919   1.485746e-80   
Expotime        0.258809   2.677779e-22       0.322450   2.387116e-34   
surfcharge      0.217711   4.366739e-16       0.229706   8.866568e-18   
Ec             -0.029375   2.784837e-01       0.328000   1.504971e-35   

                  mi    rf_imp  perm_imp       vif  combined  
dosage      0.284000  0.400759  0.267254  1.101951  0.828966  
NOxygen     0.215017  0.137473  0.011959  1.525841  0.525593  
surfarea    0.246640  0.112050  0.019687  1.606410  0.424146  
hydrosize   0.2