# Submission 2. Fast-ai

| Variable | Definition                      | Key                                         |
|----------|---------------------------------|---------------------------------------------|
| survival | Survival                        | 0 = No, 1 = Yes                             |
| pclass   | Ticket class                    | 1 = 1st, 2 = 2nd, 3 = 3rd                   |
| sex      | Sex                             |                                             |
| Age      | Age in years                    |                                             |
| sibsp    | # of siblings / spouses aboard the Titanic |                               |
| parch    | # of parents / children aboard the Titanic  |                               |
| ticket   | Ticket number                   |                                             |
| fare     | Passenger fare                  |                                             |
| cabin    | Cabin number                    |                                             |
| embarked | Port of Embarkation             | C = Cherbourg, Q = Queenstown, S = Southampton |

In [1]:
import pandas as pd
from pathlib import Path
import os
from fastai.tabular.all import *

In [2]:
# Decide where the competition data lives. KAGGLE_KERNEL_RUN_TYPE is read from
# the environment; a non-empty value is taken to mean "running on Kaggle".
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    # On Kaggle the data is read from the competition input directory.
    path = Path('../input/titanic')
    !pip install -Uqq fastai
else:
    # Elsewhere: download the competition archive through the Kaggle API and
    # extract it into ./titanic, but only when that directory is missing.
    import zipfile,kaggle
    path = Path('titanic')
    if not path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)



In [3]:
# Show floats with two decimals in dataframe output.
pd.options.display.float_format = '{:.2f}'.format
# Fix the random seed (42) for this run.
set_seed(42)

In [4]:
# Read train and test files into pandas dataframes.
# The CSVs live in the dataset directory stored in `path` (Kaggle input dir or
# the locally extracted ./titanic folder), not in the working directory.
df = pd.read_csv(path/'train.csv')
df_submission = pd.read_csv(path/'test.csv')

# Preprocessing


Feature Engineering

In [5]:
def add_features(df):
    """Add the engineered feature columns to `df` in place (returns nothing).

    Columns added, in order: LogFare, Deck, Family, Alone, TicketFreq, Title.
    """
    # log(1 + Fare)
    df['LogFare'] = np.log1p(df['Fare'])

    # First cabin letter bucketed into deck groups; missing cabins and
    # letters outside A-G end up as NaN.
    deck_groups = {letter: group for group in ("ABC", "DE", "FG") for letter in group}
    cabin_letter = df['Cabin'].str[0]
    df['Deck'] = cabin_letter.map(deck_groups)

    # Relatives aboard, and whether the passenger has none.
    df['Family'] = df['SibSp'] + df['Parch']
    df['Alone'] = df['Family'] == 0

    # Number of rows in this dataframe that share the same ticket.
    df['TicketFreq'] = df.groupby('Ticket')['Ticket'].transform('count')

    # Names look like "Surname, Title. Given names": keep the text between
    # the ", " and the first ".", then keep only the four listed titles
    # (anything else becomes NaN).
    after_surname = df['Name'].str.split(', ', expand=True)[1]
    raw_title = after_surname.str.split('.', expand=True)[0]
    kept_titles = {title: title for title in ("Mr", "Miss", "Mrs", "Master")}
    df['Title'] = raw_title.map(kept_titles)

add_features(df)

# Modeling

Prepare Dataloader

In [13]:
# Create splits: RandomSplitter (seeded with 42) partitions the row indices of
# `df` into a training list and a validation list.
splits = RandomSplitter(seed=42)(df)
splits

((#713) [788,525,821,253,374,98,215,313,281,305...],
 (#178) [303,778,531,385,134,476,691,443,386,128...])

In [7]:
# Create DataLoaders from the engineered dataframe.
# Categorify, FillMissing and Normalize are applied as preprocessing steps;
# "Survived" is the target and is treated as a category.
dls = TabularPandas(
    df, splits=splits,
    procs = [Categorify, FillMissing, Normalize],
    cat_names=["Sex","Pclass","Embarked","Deck", "Title"],
    cont_names=['Age', 'SibSp', 'Parch', 'LogFare',
                 'Alone', 'TicketFreq', 'Family'],
    y_names="Survived", y_block = CategoryBlock(),
).dataloaders(path=".")

In [10]:
# NOTE(review): this cell fails with "AttributeError: tfms" (see the recorded
# output below) — `dls` does not expose a `tfms` attribute. Left unchanged
# because the intended inspection is unclear; confirm what it was meant to show.
L(dls.items).map(dls.tfms[1])

AttributeError: tfms

Model Building

In [None]:
# Create the learner: two hidden layers of 10 units each ([10,10]),
# with accuracy as the reported metric.
learn = tabular_learner(dls, layers=[10,10], metrics=accuracy)

In [None]:
# Find a good learning rate, using the `slide` and `valley` suggestion functions
learn.lr_find(suggest_funcs=(slide, valley))

# Train the model

In [None]:
learn.fit(16, lr=0.03)

# Save model

In [None]:
# Export the trained learner to a file, then drop the in-memory reference so
# the following section has to reload it from that file.
# NOTE(review): presumably the ./models directory must already exist — confirm.
learn.export('./models/fastai_tabular')
del learn

# Load Model

In [None]:
learn=load_learner('./models/fastai_tabular')

# Submit to Kaggle

Prepare the data

In [None]:
def submit_to_kaggle(path_submission_csv_read, learn,name_final_submission_csv_write,message):
    """Predict on the test CSV, write a submission file and upload it to Kaggle.

    Args:
        path_submission_csv_read: Path of the test CSV to predict on.
        learn: Trained learner; its dataloaders build the test dataloader, so
            the preprocessing configured for training is reused.
        name_final_submission_csv_write: File name (str or Path) of the
            submission CSV to write.
        message: Description attached to the Kaggle submission.

    The upload calls the `kaggle` command-line tool for the "titanic"
    competition. Its exit status is not checked; a missing tool is reported
    with a printed message rather than an exception.
    """
    import subprocess  # local import: only the upload step needs it

    df_submission = pd.read_csv(path_submission_csv_read)  # Read test file
    df_submission['Fare'] = df_submission.Fare.fillna(0)  # There is one Fare missing
    add_features(df_submission)  # Same engineered columns as the training data
    dl = learn.dls.test_dl(df_submission)  # Test DataLoader built from the learner's configuration
    preds,_ = learn.get_preds(dl=dl)
    # Column 1 of the predictions is thresholded at 0.5 to get the 0/1 label
    # (presumably the probability of Survived == 1 — confirm class order).
    df_submission['Survived'] = (preds[:,1]>0.5).int()

    submission_file = str(name_final_submission_csv_write)
    df_submission[['PassengerId','Survived']].to_csv(submission_file, index=False)

    # Pass the arguments as a list (no shell involved) so the message and file
    # name reach the tool verbatim, whatever characters they contain. The
    # previous command string always sent the literal text "message".
    command = ["kaggle", "competitions", "submit", "-c", "titanic",
               "-f", submission_file, "-m", str(message)]
    try:
        subprocess.run(command, check=False)
    except FileNotFoundError:
        print(f"kaggle command not found; {submission_file} was written but not submitted")

In [None]:
# Step-by-step version of the test-set preparation: load, patch the missing
# fare, add the engineered columns, and build the test DataLoader.
df_submission = pd.read_csv(path/'test.csv') # Read test file
df_submission['Fare'] = df_submission.Fare.fillna(0)# There is one Fare missing
add_features(df_submission) # Add features to test dataframe
dl_test = learn.dls.test_dl(df_submission) # Create DataLoader for test dataframe from the configurations of the learner

Get predictions

In [None]:
preds,_=learn.get_preds(dl=dl_test) # Get predictions

Create submission file

In [None]:
# Threshold column 1 of the predictions at 0.5 to get the 0/1 label, keep only
# the two columns used for the submission, and write them to sub2.csv.
df_submission['Survived'] = (preds[:,1]>0.5).int() # Add predictions to dataframe
sub_df_submission = df_submission[['PassengerId','Survived']] # Create submission dataframe
sub_df_submission.to_csv('sub2.csv', index=False) # Save submission dataframe to csv file

In [None]:
!head sub.csv

In [None]:
# !kaggle competitions submit -c titanic -f sub2.csv -m "Fastai Tabular Learner with 10,10 layers. 16 epochs. 0.03 learning rate. 0.5 threshold."

# Permutation Importance. NOT WORKING

In [None]:
class PermutationImportance():
  "Calculate and plot the permutation importance"
  def __init__(self, learn:Learner, df=None, bs=None):
    """Compute and plot the permutation importance of every input column.

    learn: trained learner to evaluate.
    df: optional dataframe to evaluate on; when omitted, `learn.dls[1]` is used.
    bs: optional batch size for the dataloader built from `df`;
        defaults to `learn.dls.bs`.

    The importances are stored in `self.results` and plotted immediately.
    """
    self.learn = learn
    self.df = df
    bs = bs if bs is not None else learn.dls.bs
    self.dl = learn.dls.test_dl(self.df, bs=bs) if self.df is not None else learn.dls[1]
    # Columns whose name contains '_na' are kept apart: they are not scored on
    # their own but shuffled together with the column they belong to.
    self.x_names = learn.dls.x_names.filter(lambda x: '_na' not in x)
    self.na = learn.dls.x_names.filter(lambda x: '_na' in x)
    # Read everything from the learner passed in, not from notebook globals.
    self.y = learn.dls.y_names
    self.results = self.calc_feat_importance()
    self.plot_importance(self.ord_dic_to_df(self.results))

  def measure_col(self, name:str):
    "Return the metric obtained after shuffling column `name` (and its `_na` companion, if any)"
    col = [name]
    # Shuffle the companion '<name>_na' column with the same permutation so
    # the pair stays consistent row by row.
    if f'{name}_na' in self.na: col.append(f'{name}_na')
    orig = self.dl.items[col].copy()
    perm = np.random.permutation(len(orig))
    try:
      # Column by column, so each column keeps its own dtype.
      for c in col: self.dl.items[c] = orig[c].values[perm]
      metric = self.learn.validate(dl=self.dl)[1]
    finally:
      # Always put the original values back, even if validation fails.
      for c in col: self.dl.items[c] = orig[c].values
    return metric

  def calc_feat_importance(self):
    "Calculates permutation importance by shuffling a column on a percentage scale"
    print('Getting base error')
    # Element 1 of `validate()` is used as the baseline score.
    base_error = self.learn.validate(dl=self.dl)[1]
    pbar = progress_bar(self.x_names)
    print('Calculating Permutation Importance')
    shuffled = {col: self.measure_col(col) for col in pbar}
    # Relative change of the score once the column is shuffled.
    self.importance = {col: (base_error-value)/base_error #this can be adjusted
                       for col, value in shuffled.items()}
    return OrderedDict(sorted(self.importance.items(), key=lambda kv: kv[1], reverse=True))

  def ord_dic_to_df(self, dict:OrderedDict):
    "Convert the ordered {feature: importance} mapping into a two-column dataframe"
    return pd.DataFrame(list(dict.items()), columns=['feature', 'importance'])

  def plot_importance(self, df:pd.DataFrame, limit=20, asc=False, **kwargs):
    "Plot importance with an optional limit to how many variables shown"
    df_copy = df.copy()
    df_copy['feature'] = df_copy['feature'].str.slice(0,25)
    # Keep the top `limit` rows, then flip the order for drawing.
    df_copy = df_copy.sort_values(by='importance', ascending=asc)[:limit].sort_values(by='importance', ascending=not(asc))
    # `sort_columns` is no longer passed: pandas deprecated it in 1.5 and
    # removed it in 2.0, where it makes the plot call fail.
    ax = df_copy.plot.barh(x='feature', y='importance', **kwargs)
    for p in ax.patches:
      ax.annotate(f'{p.get_width():.4f}', ((p.get_width() * 1.005), p.get_y()  * 1.005))

# Bayesian Optimization

In [None]:
from bayes_opt import BayesianOptimization

In [None]:
def fit_with(lr:float, wd:float, dp:float, n_layers:float, layer_1:float, layer_2:float, layer_3:float):
    """Objective function for the Bayesian optimizer.

    Builds a tabular learner from the given hyperparameters, trains it for
    5 epochs and returns the validation metric. All arguments arrive as
    floats and are rounded where an integer is needed.

    Args:
        lr: Learning rate passed to `learn.fit`.
        wd: Passed to `tabular_config` as `ps`. Despite its name it is not
            used as weight decay here.
        dp: Passed to `tabular_config` as `embed_p`.
        n_layers: Number of hidden layers. Rounded to the nearest integer;
            2 or 3 selects that many layers, anything else gives one layer.
        layer_1, layer_2, layer_3: Hidden layer sizes, rounded.

    Returns:
        Element 1 of `learn.validate()` as a float (the accuracy metric).
    """
    print(lr, wd, dp)
    sizes = [round(layer_1), round(layer_2), round(layer_3)]
    # Decode the depth with round() for every case. Mixing round() and int()
    # made three layers reachable only when n_layers was exactly 3.0.
    depth = round(n_layers)
    layers = sizes[:depth] if depth in (2, 3) else sizes[:1]
    config = tabular_config(embed_p=float(dp),ps=float(wd))
    learn = tabular_learner(dls, layers=layers, metrics=accuracy, config = config)

    # Enter both context managers: `a() and b()` evaluates to the second one
    # only, so `no_bar()` was never actually entered.
    with learn.no_bar(), learn.no_logging():
        learn.fit(5, lr=float(lr))

    return float(learn.validate()[1])

In [None]:
# Search bounds as (low, high) pairs, keyed by the argument names of `fit_with`.
hps = dict(
    lr=(1e-05, 1e-01),
    wd=(4e-4, 0.4),
    dp=(0.01, 0.5),
    n_layers=(1, 3),
    layer_1=(5, 200),
    layer_2=(5, 1000),
    layer_3=(5, 2000),
)

In [None]:
# Set up the optimizer: it calls `fit_with` with values drawn from the bounds
# in `hps` and looks for the combination that maximizes the returned value.
optim = BayesianOptimization(
    f = fit_with, # our fit function
    pbounds = hps, # our hyper parameters to tune
    verbose = 2, # 1 prints out when a maximum is observed, 0 for silent
    random_state=1
)

In [None]:
%time optim.maximize(n_iter=10)

In [None]:
print(optim.max)

In [None]:
# Decode the best hyperparameters found by the optimizer.
best = optim.max['params']
dp=float(best['dp'])
wd=float(best['wd'])
# Keep the learning rate unrounded: rounding to 3 decimals turns any value
# below 0.0005 into 0.0, which would disable training altogether.
lr=float(best['lr'])
# n_layers is searched as a float: round it to the nearest integer, clamp it
# to 1..3 and use exactly that many sizes (layer_1 .. layer_n). Sizes are
# rounded the same way `fit_with` rounds them.
n_layers = min(max(int(round(best['n_layers'])), 1), 3)
layers=[int(round(best[f'layer_{i}'])) for i in range(1,n_layers+1)]
print(f'layers:{layers}')
print(f'lr: {lr}')

In [None]:
# Build a fresh learner from the decoded hyperparameters: `dp` is passed as
# `embed_p`, `wd` as `ps`, and `layers` gives the hidden layer sizes.
config = tabular_config(embed_p=dp,ps=wd)
learn = tabular_learner(dls, layers=layers, metrics=accuracy, config = config)
# learn = tabular_learner(dls, layers=layers, metrics=accuracy)

In [None]:
learn.fit(16, lr=lr)

In [None]:
# Submit to Kaggle
submit_to_kaggle(path/'test.csv', learn,'submission2_Fastai_Tabular_bayes_opt.csv','Fastai Tabular Learner with Bayes Optimization')