In [22]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.axes import Axes
from matplotlib.ticker import MultipleLocator
import numpy as np
import seaborn as sns
sns.set_style("darkgrid")
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.stats import f_oneway, stats
from scipy.stats.mstats import winsorize

In [32]:
# Load train.csv, using its `id` column as the row index, and display the frame.
df = pd.read_csv('train.csv', index_col='id')
df

Unnamed: 0_level_0,allelectrons_Total,density_Total,allelectrons_Average,val_e_Average,atomicweight_Average,ionenergy_Average,el_neg_chi_Average,R_vdw_element_Average,R_cov_element_Average,zaratio_Average,density_Average,Hardness
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,100.0,0.841611,10.000000,4.800000,20.612526,11.088100,2.766000,1.732000,0.860000,0.496070,0.91457,6.0
1,100.0,7.558488,10.000000,4.800000,20.298893,12.040830,2.755000,1.631000,0.910000,0.492719,0.71760,6.5
2,76.0,8.885992,15.600000,5.600000,33.739258,12.086300,2.828000,1.788000,0.864000,0.481478,1.50633,2.5
3,100.0,8.795296,10.000000,4.800000,20.213349,10.948500,2.648000,1.626000,0.936000,0.489272,0.78937,6.0
4,116.0,9.577996,11.600000,4.800000,24.988133,11.824480,2.766000,1.682000,0.896000,0.492736,1.86481,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
10402,128.0,7.558488,12.000000,4.000000,26.385218,11.330440,2.644000,1.631000,0.892000,0.496070,1.79607,4.0
10403,30.0,1.743160,10.000000,5.333333,20.766935,14.163933,3.090000,1.556667,0.866667,0.480390,0.81480,5.0
10404,196.0,30.920000,24.500000,5.500000,53.490297,10.074300,2.295000,1.545000,1.120000,0.469715,2.11540,1.8
10405,38.0,1.553160,12.666667,4.666667,26.621687,11.290033,2.743333,1.756667,0.980000,0.486507,0.77755,6.0


### Preprocesamiento - Reducción de dimensionalidad (PCA)

In [33]:
def reduc_pca(df, columns=None, n_components=0.95, target='Hardness'):
    """Replace a set of collinear feature columns with their PCA components.

    The selected columns are standardised, projected with PCA, and removed
    from the result; the projections are added as ``PCA_Component_1``,
    ``PCA_Component_2``, ... columns. The input frame is not modified.

    NOTE: the scaler and the PCA are fitted on the frame passed in, so calling
    this separately on two frames produces two independent projections.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``columns``.
    columns : sequence of str, optional
        Columns to replace. Defaults to the six averaged features that the
        notebook treats as affected by multicollinearity.
    n_components : int or float, default 0.95
        Forwarded unchanged to ``PCA(n_components=...)``.
    target : str, default 'Hardness'
        Column moved to the last position when it is present. Frames that do
        not contain it are returned without it instead of raising ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``columns`` replaced by the PCA component columns.

    Raises
    ------
    KeyError
        If any name in ``columns`` is missing from ``df``.
    """
    if columns is None:
        # Single source of truth for the columns that are both projected and dropped.
        columns = [
            'allelectrons_Average',
            'density_Average',
            'R_vdw_element_Average',
            'atomicweight_Average',
            'zaratio_Average',
            'R_cov_element_Average',
        ]
    else:
        columns = list(columns)

    df_copy = df.copy()

    # Standardise before PCA so that no column dominates because of its scale.
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(df_copy[columns])

    pca = PCA(n_components=n_components)
    components = pca.fit_transform(features_scaled)
    print(f"Número de componentes seleccionados: {pca.n_components_}")

    # The number of components is only known after fitting, so the names are
    # derived from the shape of the projected array.
    new_component_names = [f'PCA_Component_{i+1}' for i in range(components.shape[1])]
    df_copy[new_component_names] = components

    # Remove the original columns now that the components stand in for them.
    df_copy = df_copy.drop(columns=columns)

    # Keep the target as the last column, but only when the frame has one.
    if target in df_copy.columns:
        df_copy[target] = df_copy.pop(target)

    return df_copy
    

In [34]:
# Replace the collinear feature columns with their PCA components.
# NOTE: this rebinds `df`, so the cell cannot be re-run on its own: a second run
# raises KeyError because reduc_pca has already dropped the columns it selects.
df = reduc_pca(df)

Número de componentes seleccionados: 4


In [35]:
# Preview the transformed frame: PCA component columns added, `Hardness` last.
df.head()

Unnamed: 0_level_0,allelectrons_Total,density_Total,val_e_Average,ionenergy_Average,el_neg_chi_Average,PCA_Component_1,PCA_Component_2,PCA_Component_3,PCA_Component_4,Hardness
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,100.0,0.841611,4.8,11.0881,2.766,-1.111271,-0.149722,-0.401556,-0.026638,6.0
1,100.0,7.558488,4.8,12.04083,2.755,-1.224308,0.074271,-0.379823,0.083014,6.5
2,76.0,8.885992,5.6,12.0863,2.828,-0.321924,0.029238,-0.251617,0.128897,2.5
3,100.0,8.795296,4.8,10.9485,2.648,-1.146938,0.094446,-0.448528,0.047033,6.0
4,116.0,9.577996,4.8,11.82448,2.766,-0.746181,0.087805,-0.13187,-0.272518,6.0


### Eliminar valores atípicos (solo en el dataset de entrenamiento)

In [36]:
# Remove outliers
def outliers_iqr(df, column_names, lower_limit=0.01, upper_limit=0.99):
    """Drop the rows whose value in any listed column falls outside a percentile band.

    Despite the name, the cut-offs are quantiles (``lower_limit`` and
    ``upper_limit``), not an interquartile-range rule. Columns are processed in
    order, and each column's cut-offs are computed on the rows that survived the
    previous columns. Values equal to a cut-off are kept, and so are rows where
    the column is NaN, because neither comparison is true for them.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_names : iterable of str
        Columns to trim, in the order they are applied.
    lower_limit, upper_limit : float
        Quantiles (between 0 and 1) used as the lower and upper cut-offs.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, with their original index labels.
    """
    trimmed = df.copy()

    for name in column_names:
        values = trimmed[name]
        low_cut = values.quantile(lower_limit)
        high_cut = values.quantile(upper_limit)

        # Flag everything strictly below the lower or strictly above the upper cut-off.
        is_outlier = values.lt(low_cut) | values.gt(high_cut)
        trimmed = trimmed.loc[~is_outlier]

    return trimmed
    

In [37]:
# Trim the rows lying outside the 1st-99th percentile band of each listed column.
df = outliers_iqr(
    df,
    column_names=[
        'allelectrons_Total',
        'density_Total',
        'ionenergy_Average',
        'el_neg_chi_Average',
    ],
    lower_limit=0.01,
    upper_limit=0.99,
)

In [38]:
# Check the row count, columns and dtypes that remain after trimming.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9695 entries, 1 to 10406
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   allelectrons_Total  9695 non-null   float64
 1   density_Total       9695 non-null   float64
 2   val_e_Average       9695 non-null   float64
 3   ionenergy_Average   9695 non-null   float64
 4   el_neg_chi_Average  9695 non-null   float64
 5   PCA_Component_1     9695 non-null   float64
 6   PCA_Component_2     9695 non-null   float64
 7   PCA_Component_3     9695 non-null   float64
 8   PCA_Component_4     9695 non-null   float64
 9   Hardness            9695 non-null   float64
dtypes: float64(10)
memory usage: 833.2 KB


In [43]:
# Save the processed frame to clustering_kmeans.csv.
# NOTE(review): index=False leaves the `id` index out of the file — confirm that
# nothing downstream needs to map rows back to their original ids.
df.to_csv('clustering_kmeans.csv', index=False)

In [39]:
# Split into features (X) and target (y).
# The target is selected by name rather than by position: the number of PCA
# components is decided at run time by the variance threshold, so a fixed column
# index such as 9 is only correct when exactly four components are kept.
X = df.drop(columns='Hardness')
y = df['Hardness']

In [42]:
# Display the feature matrix (every column except `Hardness`).
X

Unnamed: 0_level_0,allelectrons_Total,density_Total,val_e_Average,ionenergy_Average,el_neg_chi_Average,PCA_Component_1,PCA_Component_2,PCA_Component_3,PCA_Component_4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,100.0,7.558488,4.800000,12.040830,2.755000,-1.224308,0.074271,-0.379823,0.083014
2,76.0,8.885992,5.600000,12.086300,2.828000,-0.321924,0.029238,-0.251617,0.128897
3,100.0,8.795296,4.800000,10.948500,2.648000,-1.146938,0.094446,-0.448528,0.047033
4,116.0,9.577996,4.800000,11.824480,2.766000,-0.746181,0.087805,-0.131870,-0.272518
5,131.0,24.529328,4.666667,11.948200,2.858333,0.650988,0.498949,0.228052,0.043948
...,...,...,...,...,...,...,...,...,...
10401,104.0,11.202328,4.666667,11.408267,2.736667,0.223466,-0.117071,-0.265490,0.088662
10402,128.0,7.558488,4.000000,11.330440,2.644000,-0.834660,0.199806,0.028256,-0.180184
10404,196.0,30.920000,5.500000,10.074300,2.295000,0.766877,0.669752,0.137073,0.594841
10405,38.0,1.553160,4.666667,11.290033,2.743333,-0.546023,-0.250037,-0.706767,0.190659


### Algoritmo de agrupación - nuevas características y patrones