In [89]:
import pandas as pd
import numpy
from google.cloud import storage
import gcsfs

fs = gcsfs.GCSFileSystem(project='tron-argolis', token='/home/sockcop/k/k.json')

with fs.open('gs://tron-argolis-dataset/kagglev2-may-2016.csv') as f:
    gcs_df = pd.read_csv(f)

gcs_df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [53]:
print(gcs_df['No-show'] == 'Yes')
gcs_df['output_label'] = (gcs_df['No-show'] == 'Yes').astype('int')

0         False
1         False
2         False
3         False
4         False
          ...  
110522    False
110523    False
110524    False
110525    False
110526    False
Name: No-show, Length: 110527, dtype: bool


In [57]:
print(gcs_df['ScheduledDay'])
gcs_df['ScheduledDay'] = pd.to_datetime(gcs_df['ScheduledDay'], format='%Y-%m-%dT%H-%M-%SZ', errors='coerce')
print(gcs_df['ScheduledDay'])
assert gcs_df.ScheduledDay.isnull().sum() == 0, 'missing ScheduledDay dates'

0        NaT
1        NaT
2        NaT
3        NaT
4        NaT
          ..
110522   NaT
110523   NaT
110524   NaT
110525   NaT
110526   NaT
Name: ScheduledDay, Length: 110527, dtype: datetime64[ns]
0        NaT
1        NaT
2        NaT
3        NaT
4        NaT
          ..
110522   NaT
110523   NaT
110524   NaT
110525   NaT
110526   NaT
Name: ScheduledDay, Length: 110527, dtype: datetime64[ns]


AssertionError: missing ScheduledDay dates

In [62]:
print(gcs_df['ScheduledDay'].dt.year)

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
          ..
110522   NaN
110523   NaN
110524   NaN
110525   NaN
110526   NaN
Name: ScheduledDay, Length: 110527, dtype: float64


In [42]:
def calc_prev(y):
    return (sum(y)/len(y))

calc_prev(gcs_df.output_label.values)

0.20193255946510807

In [44]:
print((gcs_df['ScheduledDay']>gcs_df['AppointmentDay']).sum())

38568


In [134]:
import kfp
from kfp.v2.dsl import component, Input, Output, Artifact
import os

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/sockcop/k/k.json'
client = kfp.Client(host='https://11f0f736521255cb-dot-us-central1.pipelines.googleusercontent.com')

@component(packages_to_install=['gcsfs', 'pandas', 'numpy'])
def data_preprocess(dataset: str, output_path: Output[Artifact]):
    import pandas as pd
    import numpy

    df = pd.read_csv(dataset)
    df['output_label'] = (df['No-show'] == 'Yes').astype('int')
    df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay'], 
                                          format = '%Y-%m-%dT%H:%M:%SZ', 
                                          errors = 'coerce') 
    df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay'], 
                                          format = '%Y-%m-%dT%H:%M:%SZ', 
                                          errors = 'coerce')

    assert df.ScheduledDay.isnull().sum() == 0, 'missing ScheduledDay dates'
    assert df.AppointmentDay.isnull().sum() == 0, 'missing AppointmentDay dates'
    
    df['AppointmentDay'] = df['AppointmentDay'] +pd.Timedelta('1d') - pd.Timedelta('1s')
    
    df['ScheduledDay_year'] = df['ScheduledDay'].dt.year
    df['ScheduledDay_month'] = df['ScheduledDay'].dt.month
    df['ScheduledDay_week'] = df['ScheduledDay'].dt.week
    df['ScheduledDay_day'] = df['ScheduledDay'].dt.day
    df['ScheduledDay_hour'] = df['ScheduledDay'].dt.hour
    df['ScheduledDay_minute'] = df['ScheduledDay'].dt.minute
    df['ScheduledDay_dayofweek'] = df['ScheduledDay'].dt.dayofweek
    
    print('break')
    df['AppointmentDay_year'] = df['AppointmentDay'].dt.year
    df['AppointmentDay_month'] = df['AppointmentDay'].dt.month
    df['AppointmentDay_week'] = df['AppointmentDay'].dt.week
    df['AppointmentDay_day'] = df['AppointmentDay'].dt.day
    df['AppointmentDay_hour'] = df['AppointmentDay'].dt.hour
    df['AppointmentDay_minute'] = df['AppointmentDay'].dt.minute
    df['AppointmentDay_dayofweek'] = df['AppointmentDay'].dt.dayofweek
    
    df['delta_days'] = (df['AppointmentDay']-df['ScheduledDay']).dt.total_seconds()/(60*60*24)
    
    df = df.sample(n = len(df), random_state = 42)
    df = df.reset_index(drop = True)

    with open(output_path.path, 'w') as f:
        df.to_csv(f)
    
@component(packages_to_install=['gcsfs', 'pandas', 'numpy', 'sklearn'])
def training(input_path: Input[Artifact]):
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    
    with open(input_path.path, 'r') as f:
        df = pd.read_csv(f)
    
    col2use = ['ScheduledDay_day', 
               'ScheduledDay_hour',
               'ScheduledDay_minute', 
               'ScheduledDay_dayofweek', 
               'AppointmentDay_day',
               'AppointmentDay_dayofweek', 
               'delta_days']
        
    df_valid = df.sample(frac = 0.3, random_state = 42)
    df_train = df.drop(df_valid.index)
    
    X_train = df_train[col2use].values
    X_valid = df_valid[col2use].values
    y_train = df_train['output_label'].values
    y_valid = df_valid['output_label'].values
    print('Training shapes:',X_train.shape, y_train.shape)
    print('Validation shapes:',X_valid.shape, y_valid.shape)
    
    from sklearn.ensemble import RandomForestClassifier
    rf=RandomForestClassifier(max_depth=5, n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    
    y_train_preds = rf.predict_proba(X_train)[:,1]
    y_train_preds = rf.predict_proba(X_valid)[:,1]
    
@kfp.dsl.pipeline(name='operation')
def pipeline_op():
    data_prep_task = data_preprocess('gs://tron-argolis-dataset/kagglev2-may-2016.csv')
    training_task = training(data_prep_task.output)

In [135]:
client.create_run_from_pipeline_func(pipeline_op, 
                                     arguments={}, 
                                     mode=kfp.dsl.PipelineExecutionMode.V2_COMPATIBLE)



RunPipelineResult(run_id=d44e72ee-3c93-4f66-8d92-64d24765f997)

In [125]:
%%bash

pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.0.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (23.2 MB)
Collecting joblib>=0.11
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB)
Collecting scipy>=1.1.0
  Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.0.0-py3-none-any.whl (14 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py): started
  Building wheel for sklearn (setup.py): finished with status 'done'
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1309 sha256=e2a35e9f3176868ed375876f50db5bcff3a27298d7ba3619b61692ae954b0915
  Stored in directory: /home/sockcop/.cache/pip/wheels/46/ef/c3/157e41f5ee1372d1be90b09f74f82b10e391eaacca8f22d33e
Successfully built sklearn
Installing collected packages: threadpoolctl, scipy, joblib, sc

In [127]:
from sklearn.ensemble import RandomForestClassifier
RandomforestClassifier()

NameError: name 'RandomforestClassifier' is not defined

In [133]:
    import pandas as pd
    import numpy
    import gcsfs

    fs = gcsfs.GCSFileSystem(project='tron-argolis', token='/home/sockcop/k/k.json')

    with fs.open('gs://tron-argolis-dataset/kagglev2-may-2016.csv') as f:
        df = pd.read_csv(f)
    
    df['output_label'] = (df['No-show'] == 'Yes').astype('int')
    df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay'], 
                                          format = '%Y-%m-%dT%H:%M:%SZ', 
                                          errors = 'coerce') 
    df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay'], 
                                          format = '%Y-%m-%dT%H:%M:%SZ', 
                                          errors = 'coerce')

    assert df.ScheduledDay.isnull().sum() == 0, 'missing ScheduledDay dates'
    assert df.AppointmentDay.isnull().sum() == 0, 'missing AppointmentDay dates'
    
    df['AppointmentDay'] = df['AppointmentDay'] +pd.Timedelta('1d') - pd.Timedelta('1s')
    
    df['ScheduledDay_year'] = df['ScheduledDay'].dt.year
    df['ScheduledDay_month'] = df['ScheduledDay'].dt.month
    df['ScheduledDay_week'] = df['ScheduledDay'].dt.week
    df['ScheduledDay_day'] = df['ScheduledDay'].dt.day
    df['ScheduledDay_hour'] = df['ScheduledDay'].dt.hour
    df['ScheduledDay_minute'] = df['ScheduledDay'].dt.minute
    df['ScheduledDay_dayofweek'] = df['ScheduledDay'].dt.dayofweek
    
    print('break')
    df['AppointmentDay_year'] = df['AppointmentDay'].dt.year
    df['AppointmentDay_month'] = df['AppointmentDay'].dt.month
    df['AppointmentDay_week'] = df['AppointmentDay'].dt.week
    df['AppointmentDay_day'] = df['AppointmentDay'].dt.day
    df['AppointmentDay_hour'] = df['AppointmentDay'].dt.hour
    df['AppointmentDay_minute'] = df['AppointmentDay'].dt.minute
    df['AppointmentDay_dayofweek'] = df['AppointmentDay'].dt.dayofweek
    
    df['delta_days'] = (df['AppointmentDay']-df['ScheduledDay']).dt.total_seconds()/(60*60*24)
    
    df = df.sample(n = len(df), random_state = 42)
    df = df.reset_index(drop = True)
    
    col2use = ['ScheduledDay_day', 
               'ScheduledDay_hour',
               'ScheduledDay_minute', 
               'ScheduledDay_dayofweek', 
               'AppointmentDay_day',
               'AppointmentDay_dayofweek', 
               'delta_days']
        
    df_valid = df.sample(frac = 0.3, random_state = 42)
    df_train = df.drop(df_valid.index)
    
    X_train = df_train[col2use].values
    X_valid = df_valid[col2use].values
    y_train = df_train['output_label'].values
    y_valid = df_valid['output_label'].values
    print('Training shapes:',X_train.shape, y_train.shape)
    print('Validation shapes:',X_valid.shape, y_valid.shape)
    
    from sklearn.ensemble import RandomForestClassifier
    rf=RandomforestClassifier(max_depth=5, n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    
    y_train_preds = rf.predict_proba(X_train)[:,1]
    y_train_preds = rf.predict_proba(X_valid)[:,1]
    

break
Training shapes: (77369, 7) (77369,)
Validation shapes: (33158, 7) (33158,)




NameError: name 'RandomforestClassifier' is not defined