In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, precision_score, recall_score, roc_curve
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score

In [23]:
import h2o
from h2o.estimators import H2ORandomForestEstimator

In [24]:
# Creación de un cluster local H2O
# ------------------------------------------------------------------------------
h2o.init(ip = "localhost",
         # -1 indica que se empleen todos los cores disponibles.
         nthreads = -1,
         # Máxima memoria disponible para el cluster.
         max_mem_size = "4g")

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,2 mins 52 secs
H2O_cluster_timezone:,Europe/Madrid
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.3
H2O_cluster_version_age:,3 months and 3 days
H2O_cluster_name:,H2O_from_python_ACER_zzy1xd
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.658 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [25]:
# Se eliminan los datos del cluster por si ya había sido iniciado.
h2o.remove_all()

In [26]:
df_1=pd.read_csv('./training set values.csv', sep=',')
df_target=pd.read_csv('./training set labels.csv', sep=',')
df_all_1 = pd.merge(df_1, df_target, on='id', how='inner')


In [27]:
df_all_1.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [28]:
df_all_1 = h2o.H2OFrame(python_obj = df_1, destination_frame = "df_bank_h2o")

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [29]:
df_all_1.type

<bound method H2OFrame.type of H2OFrame({'_ex': <Expr()#df_bank_h2o>})>

In [30]:
df_all_1.shape

(59400, 40)

In [31]:
df_all_1.col_names

['id',
 'amount_tsh',
 'date_recorded',
 'funder',
 'gps_height',
 'installer',
 'longitude',
 'latitude',
 'wpt_name',
 'num_private',
 'basin',
 'subvillage',
 'region',
 'region_code',
 'district_code',
 'lga',
 'ward',
 'population',
 'public_meeting',
 'recorded_by',
 'scheme_management',
 'scheme_name',
 'permit',
 'construction_year',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quality_group',
 'quantity',
 'quantity_group',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group']

In [32]:
df_all_1.describe()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
type,int,real,time,enum,int,enum,real,real,enum,int,enum,enum,enum,int,int,enum,enum,int,enum,enum,enum,enum,enum,int,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum
mins,0.0,0.0,1034553600000.0,,-90.0,,0.0,-11.64944018,,0.0,,,,1.0,0.0,,,0.0,,,,,,0.0,,,,,,,,,,,,,,,,
mean,37115.13176767676,317.65038468013483,1333012293818.18,,668.2972390572386,,34.077426692028816,-5.706032659626429,,0.47414141414141414,,,,15.297003367003365,5.629747474747471,,,179.9099831649831,,,,,,1300.6524747474762,,,,,,,,,,,,,,,,
maxs,74247.0,350000.0,1386028800000.0,,2770.0,,40.34519307,-2e-08,,1776.0,,,,99.0,80.0,,,30500.0,,,,,,2013.0,,,,,,,,,,,,,,,,
sigma,21453.12837131774,2997.5745581421675,28876294717.270092,,693.11635032505,,6.567431845646528,2.946019081267253,,12.236229810496688,,,,17.587406337332038,9.633648629454573,,,471.48217573848007,,,,,,951.6205473151736,,,,,,,,,,,,,,,,
zeros,1,41639,0,,20438,,1812,0,,58643,,,,0,23,,,21381,,,,,,20709,,,,,,,,,,,,,,,,
missing,0,0,0,777,0,777,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,69572.0,6000.0,2011-03-14 00:00:00,Roman,1390.0,Roman,34.93809275,-9.85632177,none,0.0,Lake Nyasa,Mnyusi B,Iringa,11.0,5.0,Ludewa,Mundindi,109.0,True,GeoData Consultants Ltd,VWC,Roman,False,1999.0,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776.0,0.0,2013-03-06 00:00:00,Grumeti,1399.0,GRUMETI,34.6987661,-2.14746569,Zahanati,0.0,Lake Victoria,Nyamara,Mara,20.0,2.0,Serengeti,Natta,280.0,,GeoData Consultants Ltd,Other,,True,2010.0,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310.0,25.0,2013-02-25 00:00:00,Lottery Club,686.0,World vision,37.46066446,-3.82132853,Kwa Mahundi,0.0,Pangani,Majengo,Manyara,21.0,4.0,Simanjiro,Ngorika,250.0,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009.0,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe


In [33]:
df_all_1['status_group'].table()

H2OResponseError: Server error java.lang.IllegalArgumentException:
  Error: Column status_group not found
  Request: POST /99/Rapids
    data: {'ast': "(tmp= py_1_sid_9cfc (table (cols_py df_bank_h2o 'status_group') True))", 'session_id': '_sid_9cfc'}


H2OResponseError: Server error water.exceptions.H2OKeyNotFoundArgumentException:
  Error: Object 'py_1_sid_9cfc' not found for argument: key
  Request: GET /3/Frames/py_1_sid_9cfc
    params: {'row_count': '10', 'row_offset': '0', 'column_count': '-1', 'full_column_count': '-1', 'column_offset': '0'}
