In [13]:
import argparse
import os
import glob
import shutil
import joblib
import numpy as np
import pandas as pd
from pandas import read_csv

from sklearn import __version__ as sklearnver
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from azureml.core.model import Model
from azureml.core.resource_configuration import ResourceConfiguration

from azureml.core import Workspace, Environment, Experiment, Datastore, Dataset
from packaging.version import Version

In [14]:
# Obtain the Azure ML workspace handle via Workspace.from_config().
ws = Workspace.from_config()

# Echo the workspace's identifying attributes, one per line.
print(
    ws.name,
    ws.resource_group,
    ws.location,
    ws.subscription_id,
    sep='\n',
)

distributeddeeplearningqmx
deep-learning-challenge
westus2
3df1840f-dd4b-4f54-a831-e20536439b3a


In [15]:
import os

# Inspect what is available under the mounted datastore directory.
mounted_entries = os.listdir("/home/azureuser/cloudfiles/data/datastore/generalpurposeaccount")
print(mounted_entries)

# Template for loading one of those files with pandas (fill in the placeholders):
# import pandas as pd
# df = pd.read_csv("/home/azureuser/cloudfiles/data/datastore/generalpurposeaccount/{path_to_file}/{your_file}")
# print(df.head(5))

['titanic_test.parquet', 'UI', 'azureml', 'dataset-demo', 'nlp_automl', 'pdf_titanic_3.csv', 'pipeline_inputdataset', 'temp_delta', 'titanic_dataset', 'titanic_feature', 'tweet.py']


In [16]:
import pandas as pd

In [17]:
# Read the Titanic parquet file from the mounted datastore into a DataFrame.
pdf_titanic_raw = pd.read_parquet(
    '/home/azureuser/cloudfiles/data/datastore/generalpurposeaccount/titanic_test.parquet'
)


In [18]:
# Preview the first rows of the raw frame.
pdf_titanic_raw.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [23]:
# (rows, columns) of the raw frame.
pdf_titanic_raw.shape

(891, 12)

In [24]:
def featurize_data(preped_data):
    """Clean a Titanic passenger frame and build the scaled feature matrix.

    The input is copied first, so the caller's frame is left untouched.

    Parameters
    ----------
    preped_data : pandas.DataFrame
        Frame containing at least the columns ``Sex``, ``Name``, ``Ticket``,
        ``Cabin``, ``Embarked``, ``Age`` and ``Survived``.

    Returns
    -------
    preped_data : pandas.DataFrame
        Cleaned copy: ``Sex`` encoded as 0/1, the text columns dropped and
        missing ``Age`` values filled with the column mean.
    X_features : array-like
        Output of the column transformer fitted on every column except
        ``Survived``.
    y_train : pandas.Series
        The ``Survived`` column.
    """
    preped_data = preped_data.copy()

    # Encode gender numerically; values outside the mapping are left unchanged.
    gender_labels = {'male': 0, 'female': 1}
    preped_data['Sex'] = preped_data['Sex'].replace(gender_labels)

    # Drop the free-text / categorical columns that are not used as features.
    preped_data = preped_data.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)
    preped_data['Age'] = preped_data['Age'].fillna(preped_data['Age'].mean())

    X = preped_data.drop(['Survived'], axis=1)
    y_train = preped_data['Survived']
    # Assumes every remaining column is numeric -- TODO confirm for new inputs.
    num_columns = list(X.columns)

    # NOTE(review): both scalers are applied to the same column list, so each
    # input column contributes two output columns (min-max scaled, then
    # standardized). Confirm this duplication is intended.
    # `remainder='passthrough'` has nothing left to pass through because
    # `num_columns` already lists every column of X.
    ct = make_column_transformer(
        (MinMaxScaler(), num_columns),
        (StandardScaler(), num_columns),
        remainder='passthrough'
    )

    X_features = ct.fit_transform(X)
    return preped_data, X_features, y_train

In [25]:
# Derive the cleaned frame, the feature matrix and the target from the raw data.
preped_data, X_features, y_train = featurize_data(pdf_titanic_raw)


In [26]:

import mlflow

# Enable MLflow autologging for scikit-learn estimators fitted below.
mlflow.sklearn.autolog()

experiment_name = "Titanic_Experiment"

# mlflow.create_experiment() raises when the name already exists, which made
# this cell fail on every re-run. Reuse the experiment when it is present.
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = existing_experiment.experiment_id

with mlflow.start_run(experiment_id=experiment_id) as run:
    log_reg = LogisticRegression(solver='liblinear', max_iter=100, penalty='l2', tol=0.0001)
    fitted_model = log_reg.fit(X_features, y_train)

    # Start from a clean "outputs" directory before saving the fitted model.
    if os.path.isdir("outputs"):
        shutil.rmtree("outputs")
    mlflow.sklearn.save_model(fitted_model, "outputs")

In [27]:
# Preview the cleaned frame returned by featurize_data.
preped_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,0,22.0,1,0,7.25
1,2,1,1,1,38.0,1,0,71.2833
2,3,1,3,1,26.0,0,0,7.925
3,4,1,1,1,35.0,1,0,53.1
4,5,0,3,0,35.0,0,0,8.05


In [9]:
# Look up the datastore that will receive the uploaded frames.
datastore = ws.datastores['generalpurposeaccount']

# Upload the cleaned frame under 'titanic' and register it as a tabular dataset.
ds_X_train = Dataset.Tabular.register_pandas_dataframe(
    dataframe=preped_data,
    target=(datastore, 'titanic'),
    name="titanic_preped_data",
)

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to titanic/611cec62-757f-4550-bc72-c52fd800c130/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [10]:
# Upload the raw frame under 'titanic' and register it as a tabular dataset.
ds_titanic_raw = Dataset.Tabular.register_pandas_dataframe(
    dataframe=pdf_titanic_raw,
    target=(datastore, 'titanic'),
    name="titanic_raw",
)

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to titanic/8f4ab2a9-4f06-4052-8ea4-c36e150f0419/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [12]:
# Register the saved "outputs" directory as a model in the workspace and link
# it to the raw and prepared datasets.
model = Model.register(
    workspace=ws,
    model_path='outputs',
    model_name='titanic_model',
    description="Titanic survival classification model",
    datasets=[('raw_data', ds_titanic_raw), ('preped_data', ds_X_train)],
    resource_configuration=ResourceConfiguration(cpu=1, memory_in_gb=0.5),
)
                        

Registering model titanic_model


### Hyperparameter Tuning

In [None]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

# Search grid for LogisticRegression. Both 'liblinear' and 'saga' support the
# l1 and l2 penalties, so every combination is a valid configuration.
# ('rbf' is an SVM kernel, not a LogisticRegression solver.)
parameters = {'solver': ('liblinear', 'saga'),
              'penalty': ('l1', 'l2'),
              'tol': [0.0001, 0.0002],
              'max_iter': [50, 100]}

# GridSearchCV needs an estimator instance, not the class itself.
log_reg = LogisticRegression()

with mlflow.start_run(experiment_id=experiment_id) as run:
    clf = GridSearchCV(log_reg, parameters)
    clf.fit(X_features, y_train)


In [36]:
# Scratch arithmetic: displays the product of 30 and 20.
30 * 20

600