# Model

In [1]:
import pandas as pd

# Read the train and test splits, using the `id` column as the row index.
train = pd.read_csv(filepath_or_buffer='./data/train.csv', index_col='id')
test = pd.read_csv(filepath_or_buffer='./data/test.csv', index_col='id')

# Preview the first rows of the training frame.
train.head(5)

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


In [2]:
# Feature engineering
#
# Adapted from: https://www.kaggle.com/pavelvpster/cat-in-dat-solution-1/

# Features that are one-hot encoded.
ohe_features = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4',
                'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4',
                'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4',
                'day', 'month']

# Raw columns used later for the feature aggregates. They are copied now
# because the one-hot step below replaces the bin_* / nom_0..nom_4 columns
# with dummy columns.
fa_features = [
    'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4',
    'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4',
    'nom_5', 'nom_6'
]

train_fa = train[fa_features].copy()
test_fa = test[fa_features].copy()

# Every remaining feature is label-encoded. The list is built in column order
# (not via a set difference) so that it -- and therefore the column order of
# the final train/test frames -- is identical on every kernel restart.
le_features = [c for c in test.columns if c not in ohe_features]

# One-hot encode train and test together so both end up with the same dummy
# columns, then split them back by position.
train_size = len(train)
df = pd.get_dummies(pd.concat([train, test], axis=0, sort=True),
                    columns=ohe_features, drop_first=True)
train = df[:train_size]
# The concatenated frame carries a `target` column for the test rows as well;
# drop it so `test` only holds features.
test = df[train_size:].drop('target', axis=1)
del df

In [3]:
from sklearn.preprocessing import LabelEncoder

def encode_categorial_features_fit(df, columns_to_encode):
    """Fit one LabelEncoder per requested column that exists in ``df``.

    Values are cast to ``str`` before fitting. Requested columns that are
    absent from ``df`` are skipped silently.

    Returns a dict mapping column name -> fitted encoder.
    """
    return {
        column: LabelEncoder().fit(df[column].astype(str).values)
        for column in columns_to_encode
        if column in df.columns
    }

def encode_categorial_features_transform(df, encoders):
    """Apply previously fitted encoders to the matching columns of ``df``.

    ``encoders`` maps column name -> object exposing ``transform``. Each
    column is cast to ``str`` before being transformed. Encoders whose column
    is missing from ``df`` are ignored.

    Returns a new DataFrame that shares ``df``'s index and contains only the
    encoded columns.
    """
    encoded = pd.DataFrame(index=df.index)
    for column, encoder in encoders.items():
        if column not in df.columns:
            continue
        as_text = df[column].astype(str).values
        encoded[column] = encoder.transform(as_text)
    return encoded


In [4]:
# Fit the encoders on train and test stacked together, so the values of both
# frames are included when fitting.
categorical_features_encoders = encode_categorial_features_fit(
    df=pd.concat(objs=[train, test], join='outer', sort=False),
    columns_to_encode=le_features,
)

In [5]:
# Replace the label-encoded columns of `train` with their encoded versions.
columns_to_drop = [c for c in le_features if c in train.columns]
temp = encode_categorial_features_transform(train, categorical_features_encoders)
train = train.drop(columns=columns_to_drop).merge(
    temp, how='left', left_index=True, right_index=True)

In [6]:
# Same replacement for `test`, reusing the encoders fitted earlier.
columns_to_drop = [c for c in le_features if c in test.columns]
temp = encode_categorial_features_transform(test, categorical_features_encoders)
test = test.drop(columns=columns_to_drop).merge(
    temp, how='left', left_index=True, right_index=True)
del temp

In [7]:
# Target Encoding ( for categorical features with a high cardinality )

from category_encoders import TargetEncoder

# Columns that receive target encoding.
te_features = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']

# NOTE(review): the encoder is fitted on the training rows and then applied to
# those same rows; consider out-of-fold encoding if this leaks the target.
te = TargetEncoder(
    cols=te_features,
    drop_invariant=True,
    return_df=True,
    min_samples_leaf=2,
    smoothing=1.0,
)
te.fit(train[te_features], train['target'])

# Train: swap the original columns for their target-encoded versions.
columns_to_drop = [c for c in te_features if c in train.columns]
temp = te.transform(train[te_features])
train = train.drop(columns=columns_to_drop).merge(
    temp, how='left', left_index=True, right_index=True)
del temp

# Test: same swap, using the encoder fitted on train.
columns_to_drop = [c for c in te_features if c in test.columns]
temp = te.transform(test[te_features])
test = test.drop(columns=columns_to_drop).merge(
    temp, how='left', left_index=True, right_index=True)
del temp

In [8]:
import seaborn as sns

# `nom_5` was target-encoded in the previous cell, so this shows the
# distribution of its encoded values. Label the figure so it stands alone, and
# end with `;` to keep the Axes repr out of the cell output.
ax = sns.distplot(train['nom_5'])
ax.set(title='Distribution of target-encoded nom_5',
       xlabel='nom_5 (target-encoded)',
       ylabel='Density');

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


<matplotlib.axes._subplots.AxesSubplot at 0x117dfa590>

In [9]:
# Feature aggregates


def make_aggregates(df, feature_to_group_by, feature):
    """Relative frequency of each row's ``feature`` value within its group.

    For every row, computes
    ``count(rows with the same group AND the same feature value)
    / count(rows with the same group)``, i.e. the value that
    ``groupby(group)[feature].value_counts(normalize=True)`` assigns to the
    row's (group, feature value) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    feature_to_group_by : str
        Column that defines the groups.
    feature : str
        Column whose within-group frequency is computed.

    Returns
    -------
    pandas.DataFrame
        A frame sharing ``df``'s index with a single column named
        ``"<feature>_<feature_to_group_by>_freq"``.
    """
    out = pd.DataFrame(index=df.index)
    # Vectorised counts broadcast back to the original rows; this avoids a
    # Python-level row-by-row lookup into a MultiIndex.
    pair_counts = df.groupby([feature_to_group_by, feature])[feature].transform('count')
    group_counts = df.groupby(feature_to_group_by)[feature].transform('count')
    out[feature + '_' + feature_to_group_by + '_freq'] = pair_counts / group_counts
    return out


import itertools


# Grouping columns and the columns whose within-group frequency is added.
features_to_group_by = [
    'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4',
    'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4',
]
features = ['nom_5', 'nom_6']

# One new frequency column per (group, feature) pair. The frequencies are
# computed separately on the raw train and raw test copies.
for group_col, value_col in itertools.product(features_to_group_by, features):
    print('Add aggregates of', value_col, 'by', group_col)

    train = train.merge(make_aggregates(train_fa, group_col, value_col),
                        how='left', left_index=True, right_index=True)
    test = test.merge(make_aggregates(test_fa, group_col, value_col),
                      how='left', left_index=True, right_index=True)


Add aggregates of nom_5 by bin_0
Add aggregates of nom_6 by bin_0
Add aggregates of nom_5 by bin_1
Add aggregates of nom_6 by bin_1
Add aggregates of nom_5 by bin_2
Add aggregates of nom_6 by bin_2
Add aggregates of nom_5 by bin_3
Add aggregates of nom_6 by bin_3
Add aggregates of nom_5 by bin_4
Add aggregates of nom_6 by bin_4
Add aggregates of nom_5 by nom_0
Add aggregates of nom_6 by nom_0
Add aggregates of nom_5 by nom_1
Add aggregates of nom_6 by nom_1
Add aggregates of nom_5 by nom_2
Add aggregates of nom_6 by nom_2
Add aggregates of nom_5 by nom_3
Add aggregates of nom_6 by nom_3
Add aggregates of nom_5 by nom_4
Add aggregates of nom_6 by nom_4


In [10]:
# Use Auto ML
from azureml.train.automl import AutoMLConfig

# AutoML Configuration
# Search settings, kept separate from the data arguments.
automl_settings = {
    'task': 'classification',
    'primary_metric': 'AUC_weighted',
    'max_time_sec': 3600,
    'iterations': 50,
    'blacklist_models': ['XGBoostClassifier'],
    'n_cross_validations': 3,
}

# Features are every column except `target`; the label is `target` itself.
automl_classifier = AutoMLConfig(
    X=train.drop(columns='target'),
    y=train['target'],
    **automl_settings)




In [12]:
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace

# One-off workspace creation, kept disabled inside a string literal. Running it
# would create the workspace and write its details to a config file.
'''
ws = Workspace.create(name = '',
                      subscription_id = '',
                      resource_group = '', 
                      location = 'West Europe',
                      create_resource_group = False)
ws.get_details()
# write the details of the workspace to a configuration file to the notebook library
ws.write_config()
'''
# Load the workspace from the saved configuration file.
ws = Workspace.from_config()

# Name under which the runs are grouped.
experiment_name = 'automl-classification'

experiment = Experiment(workspace=ws, name=experiment_name)

# Submit the AutoML configuration and stream progress to the cell output.
run = experiment.submit(automl_classifier, show_output=True)



Running on local machine
Parent Run ID: AutoML_c8b5c907-57e1-451f-ab5c-84e946a73ae1
Current status: DatasetCrossValidationSplit. Generating CV splits.

****************************************************************************************************
DATA GUARDRAILS SUMMARY:
For more details, use API: run.get_guardrails()

TYPE:         Class Balancing Detection
STATUS:       PASSED
DESCRIPTION:  Classes are balanced in the training data.

****************************************************************************************************
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
***********************************



RobustScaler LogisticRegression                0:01:45       0.8333    0.8333
ERROR: Run AutoML_c8b5c907-57e1-451f-ab5c-84e946a73ae1_34 failed with exception "tuple index out of range".
        35   

--- Logging error ---
--- Logging error ---
--- Logging error ---


RobustScaler LogisticRegression                0:00:52       0.8208    0.8333
ERROR: Run AutoML_c8b5c907-57e1-451f-ab5c-84e946a73ae1_35 failed with exception "tuple index out of range".
        36   



                                               0:00:44          nan    0.8333
        37                                                  0:00:18          nan    0.8333
        38                                                  0:00:35          nan    0.8333
        39   

--- Logging error ---
--- Logging error ---
Traceback (most recent call last):
  File "/Users/houssam/anaconda3/envs/myenv/lib/python3.7/site-packages/ipykernel/iostream.py", line 97, in _event_pipe
    event_pipe = self._local.event_pipe
AttributeError: '_thread._local' object has no attribute 'event_pipe'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/houssam/anaconda3/envs/myenv/lib/python3.7/logging/__init__.py", line 1028, in emit
    stream.write(msg + self.terminator)
  File "/Users/houssam/anaconda3/envs/myenv/lib/python3.7/site-packages/ipykernel/iostream.py", line 402, in write
    self.pub_thread.schedule(lambda : self._buffer.write(string))
  File "/Users/houssam/anaconda3/envs/myenv/lib/python3.7/site-packages/ipykernel/iostream.py", line 205, in schedule
    self._event_pipe.send(b'')
  File "/Users/houssam/anaconda3/envs/myenv/lib/python3.7/site-packages/ipykernel/iostream.py", line 101, in _event_p



MaxAbsScaler LogisticRegression                0:01:01       0.8333    0.8333
ERROR: Run AutoML_c8b5c907-57e1-451f-ab5c-84e946a73ae1_39 failed with exception "tuple index out of range".
        40                                                  0:00:07          nan    0.8333
        41                                                  0:00:34          nan    0.8333
        42                                                  0:00:24          nan    0.8333
        43                                                  0:00:12          nan    0.8333
        44                                                  0:00:10          nan    0.8333
        45                                                  0:00:05          nan    0.8333
        46                                                  0:00:03          nan    0.8333
        47                                                  0:00:14          nan    0.8333
Received interrupt. Returning now.

In [14]:
# Unpack the result of the AutoML run into the best child run and its fitted model.
# NOTE(review): the submission log ends with "Received interrupt", so this
# presumably reflects only the iterations that completed -- confirm before
# relying on the selected model.
best_run, fitted_model = run.get_output()

In [15]:
# Predict on the test set.
# Model selection optimises AUC_weighted, which ranks examples by score, so
# submit the positive-class probability rather than hard 0/1 labels.
# Assumes the positive class is the second column of predict_proba (classes
# ordered 0, 1) -- TODO confirm against the fitted model's classes_.
proba = fitted_model.predict_proba(test)
# Wrapping in a DataFrame lets the positional column pick work whether
# predict_proba returns an ndarray or a DataFrame.
y_pred = pd.DataFrame(proba).iloc[:, 1].values

In [16]:
# Assemble the submission frame (test ids alongside their predictions) and
# write it to disk without the positional index.
pred_df = pd.DataFrame(dict(id=test.index, target=y_pred))
pred_df.to_csv(path_or_buf='data/submit_automl.csv', index=False)