# Modelo Random Forest con clusteres de Kmedoids

In [1085]:
# importamos las librerias necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [1086]:
# importamos los datos en un dataframe con nombre data_completa
data_completa = pd.read_csv('https://raw.githubusercontent.com/KevinRamosRivas/G2-MINERIA_DATOS/main/IGP_Datos-Sismicos_Consolidados_V3.csv')

In [1087]:
data_completa.head()

Unnamed: 0,provincia,region,mes,año,latitud(º),longitud(º),profundidad_km,magnitud_Mw
0,Huaral,Lima,1,1940,-11.5,-77.0,50.0,5.82
1,Cañete,Lima,7,1945,-13.0,-76.2,80.0,4.97
2,Cañete,Lima,8,1945,-13.0,-76.2,80.0,4.97
3,Cañete,Lima,9,1945,-13.0,-76.2,80.0,4.52
4,Cañete,Lima,9,1945,-13.0,-76.2,80.0,4.97


In [1088]:
# separamos la data en categorica y numerica
data_numerica = data_completa.select_dtypes(include = np.number)
data_categorica = data_completa.select_dtypes(include = np.object)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  data_categorica = data_completa.select_dtypes(include = np.object)


In [1089]:
# separamos la data en X e y
X = data_numerica.drop(['magnitud_Mw'], axis = 1)
y = data_numerica['magnitud_Mw']

In [1090]:
# separamos la data en train y test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [1091]:
# importamos el modelo de random forest
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)

# entrenamos el modelo
regressor.fit(X_train, y_train)

# predecimos los valores de y
y_pred = regressor.predict(X_test)


In [1092]:
# importamos las metricas de evaluacion
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.20529127358490568
Mean Squared Error: 0.05936100501792945
Root Mean Squared Error: 0.24364113983054966


In [1093]:
# imprimir el r2
print('R2:', metrics.r2_score(y_test, y_pred)*100)

R2: -2.8834999413770657


In [1094]:
# ver los valores reales y los predichos
df = pd.DataFrame({'Real': y_test, 'Predicho': y_pred})
df

Unnamed: 0,Real,Predicho
408,4.6,4.6340
437,4.8,4.7750
485,4.5,4.6720
298,4.6,4.6450
508,4.5,4.0460
...,...,...
190,4.5,4.7740
357,4.5,4.6770
107,4.5,4.6330
65,4.5,4.8166


In [1095]:
# importamos los datos
x_train = pd.read_csv('x_train.csv')
y_train = pd.read_csv('y_train.csv')
x_test = pd.read_csv('x_test.csv')
y_test = pd.read_csv('y_test.csv')

In [1096]:
x_train.head()

Unnamed: 0,mes,año,latitud(º),longitud(º),profundidad_km,cluster_Kmeans,cluster_Kmedoids
0,12,1992,-13.12,-76.38,16.0,0,5
1,2,1984,-12.42,-76.94,53.0,2,0
2,10,2015,-10.61,-76.8352,127.0,1,1
3,1,1995,-12.95,-76.15,12.0,0,5
4,2,2003,-11.1248,-77.2045,126.0,1,1


In [1097]:
# contar cuantos cluster_Kmeans hay en x_train
x_train['cluster_Kmeans'].value_counts()


1    143
2    117
0     86
3     25
Name: cluster_Kmeans, dtype: int64

In [1098]:
# contar cuantos cluster_Kmeans hay en x_test
x_test['cluster_Kmeans'].value_counts()

2    56
1    54
0    32
3    17
Name: cluster_Kmeans, dtype: int64

In [1099]:
# contar cuantos cluster_Kmedoids hay en x_train
x_train['cluster_Kmedoids'].value_counts()

0    80
1    77
5    71
4    67
3    54
2    22
Name: cluster_Kmedoids, dtype: int64

In [1100]:
# contar cuantos cluster_Kmedoids hay en x_test
x_test['cluster_Kmedoids'].value_counts()

0    35
3    35
4    32
5    21
1    20
2    16
Name: cluster_Kmedoids, dtype: int64

## Modelo Random Forest con clusteres de Kmeans

In [1101]:
# importamos la data 
xc_train = pd.read_csv('X_train.csv')
yc_train = pd.read_csv('y_train.csv')
xc_test = pd.read_csv('X_test.csv')
yc_test = pd.read_csv('y_test.csv')

In [1102]:
xc_train.head()

Unnamed: 0,mes,año,latitud(º),longitud(º),profundidad_km,cluster_Kmeans,cluster_Kmedoids
0,12,1992,-13.12,-76.38,16.0,0,5
1,2,1984,-12.42,-76.94,53.0,2,0
2,10,2015,-10.61,-76.8352,127.0,1,1
3,1,1995,-12.95,-76.15,12.0,0,5
4,2,2003,-11.1248,-77.2045,126.0,1,1


In [1103]:
xc_test.head()

Unnamed: 0,mes,año,latitud(º),longitud(º),profundidad_km,cluster_Kmedoids,cluster_Kmeans
0,12,1989,-12.61,-76.66,46.0,0,2
1,7,2011,-11.6147,-76.6758,103.0,3,1
2,2,1946,-13.0,-76.2,80.0,2,3
3,3,2003,-12.7305,-75.753,87.0,4,1
4,12,2000,-11.9202,-76.9046,75.0,4,2


In [1104]:
# importamos el modelo de random forest
from sklearn.ensemble import RandomForestRegressor

# creamos un modelo de random forest para cada cluster de kmeans, en total hay 4 clusters
regressor1 = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor2 = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor3 = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor4 = RandomForestRegressor(n_estimators = 100, random_state = 0)

# entrenamos el modelo de random forest para cada cluster de kmeans obviando las columnas de cluster_Kmeans y cluster_Kmedoids
regressor1.fit(xc_train[xc_train['cluster_Kmeans'] == 0].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1), yc_train[xc_train['cluster_Kmeans'] == 0].values.ravel())
regressor2.fit(xc_train[xc_train['cluster_Kmeans'] == 1].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1), yc_train[xc_train['cluster_Kmeans'] == 1].values.ravel())
regressor3.fit(xc_train[xc_train['cluster_Kmeans'] == 2].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1), yc_train[xc_train['cluster_Kmeans'] == 2].values.ravel())
regressor4.fit(xc_train[xc_train['cluster_Kmeans'] == 3].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1), yc_train[xc_train['cluster_Kmeans'] == 3].values.ravel())



RandomForestRegressor(random_state=0)

In [1105]:

# predecimos los valores de y para cada cluster de kmeans obviando las columnas de cluster_Kmeans y cluster_Kmedoids
y_pred1 = regressor1.predict(xc_test[xc_test['cluster_Kmeans'] == 0].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1))
y_pred2 = regressor2.predict(xc_test[xc_test['cluster_Kmeans'] == 1].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1))
y_pred3 = regressor3.predict(xc_test[xc_test['cluster_Kmeans'] == 2].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1))
y_pred4 = regressor4.predict(xc_test[xc_test['cluster_Kmeans'] == 3].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1))

In [1106]:
# importamos las metricas de evaluacion
from sklearn import metrics

# imprimir el r2 para cada cluster de kmeans
print('R2 cluster 1 (KMeans):', metrics.r2_score(yc_test[xc_test['cluster_Kmeans'] == 0], y_pred1)*100)

print('R2 cluster 2 (KMeans):', metrics.r2_score(yc_test[xc_test['cluster_Kmeans'] == 1], y_pred2)*100)

print('R2 cluster 3 (KMeans):', metrics.r2_score(yc_test[xc_test['cluster_Kmeans'] == 2], y_pred3)*100)

print('R2 cluster 4 (KMeans):', metrics.r2_score(yc_test[xc_test['cluster_Kmeans'] == 3], y_pred4)*100)



R2 cluster 1 (KMeans): -64.75786353924995
R2 cluster 2 (KMeans): 55.987898462289486
R2 cluster 3 (KMeans): 14.897765402019003
R2 cluster 4 (KMeans): 9.15262727871996


## Modelo Random Forest con clusteres de Kmedoids

In [1107]:
# importamos la data 
xc_train = pd.read_csv('X_train.csv')
yc_train = pd.read_csv('y_train.csv')
xc_test = pd.read_csv('X_test.csv')
yc_test = pd.read_csv('y_test.csv')

In [1108]:
# importamos el modelo de random forest
from sklearn.ensemble import RandomForestRegressor

# creamos un modelo de random forest para cada cluster de kmedoids, en total hay 6 clusters
regressor_kmedoids1 = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor_kmedoids2 = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor_kmedoids3 = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor_kmedoids4 = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor_kmedoids5 = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor_kmedoids6 = RandomForestRegressor(n_estimators = 100, random_state = 0)


In [1109]:
# entrenamos el modelo de random forest para cada cluster de kmedoids obviando las columnas de cluster_Kmeans y cluster_Kmedoids
regressor_kmedoids1.fit(xc_train[xc_train['cluster_Kmedoids'] == 0].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1), yc_train[xc_train['cluster_Kmedoids'] == 0].values.ravel())
regressor_kmedoids2.fit(xc_train[xc_train['cluster_Kmedoids'] == 1].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1), yc_train[xc_train['cluster_Kmedoids'] == 1].values.ravel())
regressor_kmedoids3.fit(xc_train[xc_train['cluster_Kmedoids'] == 2].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1), yc_train[xc_train['cluster_Kmedoids'] == 2].values.ravel())
regressor_kmedoids4.fit(xc_train[xc_train['cluster_Kmedoids'] == 3].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1), yc_train[xc_train['cluster_Kmedoids'] == 3].values.ravel())
regressor_kmedoids5.fit(xc_train[xc_train['cluster_Kmedoids'] == 4].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1), yc_train[xc_train['cluster_Kmedoids'] == 4].values.ravel())
regressor_kmedoids6.fit(xc_train[xc_train['cluster_Kmedoids'] == 5].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1), yc_train[xc_train['cluster_Kmedoids'] == 5].values.ravel())


RandomForestRegressor(random_state=0)

In [1110]:
# predecimos los valores de y para cada cluster de kmedoids obviando las columnas de cluster_Kmeans y cluster_Kmedoids
y_pred_kmedoids1 = regressor_kmedoids1.predict(xc_test[xc_test['cluster_Kmedoids'] == 0].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1))
y_pred_kmedoids2 = regressor_kmedoids2.predict(xc_test[xc_test['cluster_Kmedoids'] == 1].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1))
y_pred_kmedoids3 = regressor_kmedoids3.predict(xc_test[xc_test['cluster_Kmedoids'] == 2].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1))
y_pred_kmedoids4 = regressor_kmedoids4.predict(xc_test[xc_test['cluster_Kmedoids'] == 3].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1))
y_pred_kmedoids5 = regressor_kmedoids5.predict(xc_test[xc_test['cluster_Kmedoids'] == 4].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1))
y_pred_kmedoids6 = regressor_kmedoids6.predict(xc_test[xc_test['cluster_Kmedoids'] == 5].drop(['cluster_Kmeans', 'cluster_Kmedoids'], axis = 1))


In [1111]:
# imprimir el r2 para cada cluster de kmedoids
print('R2 cluster 1 (KMedoids):', metrics.r2_score(yc_test[xc_test['cluster_Kmedoids'] == 0], y_pred_kmedoids1)*100)
print('R2 cluster 2 (KMedoids):', metrics.r2_score(yc_test[xc_test['cluster_Kmedoids'] == 1], y_pred_kmedoids2)*100)
print('R2 cluster 3 (KMedoids):', metrics.r2_score(yc_test[xc_test['cluster_Kmedoids'] == 2], y_pred_kmedoids3)*100)
print('R2 cluster 4 (KMedoids):', metrics.r2_score(yc_test[xc_test['cluster_Kmedoids'] == 3], y_pred_kmedoids4)*100)
print('R2 cluster 5 (KMedoids):', metrics.r2_score(yc_test[xc_test['cluster_Kmedoids'] == 4], y_pred_kmedoids5)*100)
print('R2 cluster 6 (KMedoids):', metrics.r2_score(yc_test[xc_test['cluster_Kmedoids'] == 5], y_pred_kmedoids6)*100)

R2 cluster 1 (KMedoids): -9.856836768072942
R2 cluster 2 (KMedoids): 3.4617977528094146
R2 cluster 3 (KMedoids): -1.2074944721493397
R2 cluster 4 (KMedoids): 51.40907897740248
R2 cluster 5 (KMedoids): 13.86112094395332
R2 cluster 6 (KMedoids): -50.3986925497957


In [1112]:
# analizar los resultados del cluster 4 de kmedoids
print('Cluster 4 (KMedoids):')
print('R2:', metrics.r2_score(yc_test[xc_test['cluster_Kmedoids'] == 3], y_pred_kmedoids4)*100)
print('MAE:', metrics.mean_absolute_error(yc_test[xc_test['cluster_Kmedoids'] == 3], y_pred_kmedoids4))
print('MSE:', metrics.mean_squared_error(yc_test[xc_test['cluster_Kmedoids'] == 3], y_pred_kmedoids4))
print('RMSE:', np.sqrt(metrics.mean_squared_error(yc_test[xc_test['cluster_Kmedoids'] == 3], y_pred_kmedoids4)))


Cluster 4 (KMedoids):
R2: 51.40907897740248
MAE: 0.20434285714285716
MSE: 0.06951079999999996
RMSE: 0.2636490091011153


#### Pruebas al cluster 4 con Kmedoids

In [1113]:
# instalar folium
# folium es una librería de visualización de datos geográficos
!pip install folium



In [1114]:
# importar folium
import folium

In [1120]:
# graficar en un mapa de geolocalización todos los puntos del cluster 4 de kmedoids
# latitud lleva el nombre de latitud(º) y la columna longitud lleva el nombre de longitud(º)
mapa = folium.Map(location=[xc_test[xc_test['cluster_Kmedoids'] == 3]['latitud(º)'].mean(), xc_test[xc_test['cluster_Kmedoids'] == 3]['longitud(º)'].mean()], zoom_start=10)
for i in range(0,len(xc_test[xc_test['cluster_Kmedoids'] == 3])):
    folium.Marker([xc_test[xc_test['cluster_Kmedoids'] == 3].iloc[i]['latitud(º)'], xc_test[xc_test['cluster_Kmedoids'] == 3].iloc[i]['longitud(º)']], icon=folium.Icon(color='red')).add_to(mapa)

# dibujar de otro color los puntos que se encuentran en el cluster 4 de kmedoids en xc_train
for i in range(0,len(xc_train[xc_train['cluster_Kmedoids'] == 3])):
    folium.Marker([xc_train[xc_train['cluster_Kmedoids'] == 3].iloc[i]['latitud(º)'], xc_train[xc_train['cluster_Kmedoids'] == 3].iloc[i]['longitud(º)']], icon=folium.Icon(color='red')).add_to(mapa)
mapa


In [1116]:
#ver que tienen en común los puntos del cluster 4 de kmedoids
xc_test[xc_test['cluster_Kmedoids'] == 3].describe()

Unnamed: 0,mes,año,latitud(º),longitud(º),profundidad_km,cluster_Kmedoids,cluster_Kmeans
count,35.0,35.0,35.0,35.0,35.0,35.0,35.0
mean,7.571429,2016.514286,-11.978423,-76.62744,94.8,3.0,1.142857
std,3.220209,5.60132,0.536965,0.385732,10.168579,0.0,0.355036
min,1.0,2003.0,-13.0835,-77.5275,71.0,3.0,1.0
25%,6.0,2014.0,-12.30685,-76.8629,90.0,3.0,1.0
50%,8.0,2019.0,-11.9375,-76.5953,97.0,3.0,1.0
75%,10.0,2021.0,-11.71225,-76.39445,102.5,3.0,1.0
max,12.0,2022.0,-10.8132,-75.6515,113.0,3.0,2.0


In [1126]:
#unir xc_test y xc_train
xc = pd.concat([xc_test, xc_train])

In [1132]:
#año 2020 separar 
xc_2020 = xc[xc['año'] == 2020]

In [1138]:
xc_2020

Unnamed: 0,mes,año,latitud(º),longitud(º),profundidad_km,cluster_Kmedoids,cluster_Kmeans
19,12,2020,-11.8995,-77.0818,78.0,3,2
25,5,2020,-12.2226,-76.0473,107.0,3,1
32,7,2020,-11.2648,-77.0504,97.0,3,1
45,11,2020,-12.1062,-76.7902,99.0,3,1
80,4,2020,-11.8729,-77.0723,74.0,3,2
84,3,2020,-11.7079,-76.9556,90.0,3,1
131,9,2020,-12.0401,-76.9329,93.0,3,1
38,4,2020,-12.5229,-76.763,49.0,0,2
82,5,2020,-10.6209,-76.76,119.0,1,1
150,4,2020,-11.879,-76.8228,79.0,3,2


In [1137]:
# graficar en un mapa de geolocalización todos los puntos del cluster 4 de kmedoids ocurridos en el año 2020 de
# la data xc_2020
# latitud lleva el nombre de latitud(º) y la columna longitud lleva el nombre de longitud(º)

mapa = folium.Map(location=[xc_2020[xc_2020['cluster_Kmedoids'] == 3]['latitud(º)'].mean(), xc_2020[xc_2020['cluster_Kmedoids'] == 3]['longitud(º)'].mean()], zoom_start=10)
for i in range(0,len(xc_2020[xc_2020['cluster_Kmedoids'] == 3])):
    folium.Marker([xc_2020[xc_2020['cluster_Kmedoids'] == 3].iloc[i]['latitud(º)'], xc_2020[xc_2020['cluster_Kmedoids'] == 3].iloc[i]['longitud(º)']], icon=folium.Icon(color='red')).add_to(mapa)
mapa 

In [1121]:
# ver la magnitud de los sismos del cluster 4 de kmedoids en y_test
yc_test[xc_test['cluster_Kmedoids'] == 3].describe()

Unnamed: 0,magnitud_Mw
count,35.0
mean,4.425714
std,0.383745
min,3.9
25%,4.0
50%,4.5
75%,4.7
max,5.4
