# Submission 2. Fast-ai

| Variable | Definition                      | Key                                         |
|----------|---------------------------------|---------------------------------------------|
| survival | Survival                        | 0 = No, 1 = Yes                             |
| pclass   | Ticket class                    | 1 = 1st, 2 = 2nd, 3 = 3rd                   |
| sex      | Sex                             |                                             |
| Age      | Age in years                    |                                             |
| sibsp    | # of siblings / spouses aboard the Titanic |                               |
| parch    | # of parents / children aboard the Titanic  |                               |
| ticket   | Ticket number                   |                                             |
| fare     | Passenger fare                  |                                             |
| cabin    | Cabin number                    |                                             |
| embarked | Port of Embarkation             | C = Cherbourg, Q = Queenstown, S = Southampton |

In [22]:
import pandas as pd
from pathlib import Path
import os
from fastai.tabular.all import *
from sklearn.model_selection import KFold

In [23]:
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/titanic')
    !pip install -Uqq fastai
else:
    import zipfile,kaggle
    path = Path('titanic')
    if not path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)

In [24]:
# Show floats with two decimals in pandas output.
pd.set_option('display.float_format', '{:.2f}'.format)
# Fixed seed so repeated runs are comparable.
set_seed(42)

In [25]:
# Read the train and test files into pandas dataframes.
# Both are read from `path` (the data directory resolved above), not from the
# current working directory, so this works on Kaggle and locally alike.
df = pd.read_csv(path/'train.csv')
df_target = pd.read_csv(path/'test.csv')

# Preprocessing


Feature Engineering

In [26]:
def add_features(df):
    """Add the engineered columns to `df` in place (nothing is returned).

    Columns written:
      - Fare:       missing values replaced by 0.
      - LogFare:    log1p of the filled Fare.
      - Deck:       first character of Cabin grouped into "ABC", "DE" or "FG";
                    any other letter or a missing cabin yields NaN.
      - Family:     SibSp + Parch.
      - Alone:      True where Family is 0.
      - TicketFreq: number of rows in this frame sharing the same Ticket.
      - Title:      text between ', ' and the following '.' in Name, kept only
                    for Mr / Miss / Mrs / Master; everything else becomes NaN.
    """
    deck_groups = {
        "A": "ABC", "B": "ABC", "C": "ABC",
        "D": "DE", "E": "DE",
        "F": "FG", "G": "FG",
    }
    kept_titles = {title: title for title in ("Mr", "Miss", "Mrs", "Master")}

    df['Fare'] = df['Fare'].fillna(0)
    df['LogFare'] = np.log1p(df['Fare'])
    df['Deck'] = df['Cabin'].str[0].map(deck_groups)
    df['Family'] = df['SibSp'] + df['Parch']
    df['Alone'] = df['Family'] == 0
    df['TicketFreq'] = df.groupby('Ticket')['Ticket'].transform('count')

    # "Surname, Title. Given names" -> keep the piece after ', ' and before '.'
    after_surname = df['Name'].str.split(', ', expand=True)[1]
    raw_title = after_surname.str.split('.', expand=True)[0]
    df['Title'] = raw_title.map(kept_titles)

# Engineer the same columns on the training frame and on the test frame.
for frame in (df, df_target):
    add_features(frame)

# Modeling with fastai and K-fold cross-validation (`KFold` from scikit-learn)

In [27]:
# Random train/validation index split of the training frame.
# NOTE(review): no later visible cell reads `splits`; the cross-validation below
# takes its indices from `kf.split(df)` instead.
random_splitter = RandomSplitter(seed=42)
splits = random_splitter(df)

In [28]:

def run_fold(train_idx, valid_idx, target_df):
    """Train a tabular learner on one cross-validation fold and predict on `target_df`.

    The training data is the notebook-level `df`; the two index collections
    select which of its rows are used for fitting and for validation.

    Args:
        train_idx: row indices of `df` used for training in this fold.
        valid_idx: row indices of `df` used for validation in this fold.
        target_df: frame to predict on after training (passed to `test_dl`).

    Returns:
        Tuple `(val_score, target_pred)`:
          - val_score: one-element list holding `learn.validate()[1]`, i.e. the
            entry after the loss (the `accuracy` metric given to the learner).
          - target_pred: argmax over dim 1 of the predictions for `target_df`,
            one class index per row.
    """
    fold_splits = (list(train_idx), list(valid_idx))

    # Build the DataLoaders for this fold
    tabular = TabularPandas(
        df,
        procs=[Categorify, FillMissing, Normalize],
        cat_names=["Sex", "Pclass", "Embarked", "Deck", "Title"],
        cont_names=['Age', 'SibSp', 'Parch', 'LogFare', 'Alone', 'TicketFreq', 'Family'],
        y_names="Survived",
        y_block=CategoryBlock(),
        splits=fold_splits,
    )
    dls = tabular.dataloaders(path=".")

    # Learner with two hidden layers of 10 units each
    learn = tabular_learner(dls, layers=[10, 10], metrics=accuracy)

    # Train for 5 epochs; 1e-2 is the learning-rate argument of fit_one_cycle
    learn.fit_one_cycle(5, 1e-2)

    # Score on this fold's validation rows
    val_score = [learn.validate()[1]]

    # Predict on the target frame and reduce each row to its most likely class
    test_dl = learn.dls.test_dl(target_df)
    class_probs, _ = learn.get_preds(dl=test_dl)
    target_pred = torch.argmax(class_probs, dim=1)

    return val_score, target_pred

In [29]:
# Prepare 5-fold cross-validation. This is a plain (unstratified) KFold with
# shuffling and a fixed random_state so the folds are the same on every run.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [30]:
# Run the K-Fold loop: train one model per fold, keeping each fold's
# validation score and its predictions for the test frame.
val_scores, target_preds = [], []
for train_idx, valid_idx in kf.split(df):
    val_score, target_pred = run_fold(train_idx, valid_idx, df_target)
    val_scores += [val_score]
    target_preds += [target_pred]

epoch,train_loss,valid_loss,accuracy,time
0,0.723385,0.683958,0.513967,00:00
1,0.650155,0.588358,0.776536,00:00
2,0.56992,0.44951,0.815642,00:00
3,0.518946,0.433236,0.821229,00:00
4,0.485573,0.432645,0.815642,00:00


epoch,train_loss,valid_loss,accuracy,time
0,0.678824,0.668799,0.674157,00:00
1,0.616647,0.583168,0.730337,00:00
2,0.548992,0.468482,0.792135,00:00
3,0.50316,0.446981,0.797753,00:00
4,0.475559,0.443803,0.797753,00:00


In [17]:
# Show the validation scores of all folds. `val_score` (singular) is only the
# last fold's result left over from the loop; `val_scores` accumulates every fold.
val_scores

[0.7094972133636475,
 0.516853928565979,
 0.6966292262077332,
 0.6797752976417542,
 0.6516854166984558]

In [18]:
# Calculate the average validation score over all folds.
# `val_scores` holds one entry per fold (the one-element list returned by
# run_fold), so flatten it before taking the mean. Using `val_score` here
# would average only the last fold.
average_score = float(np.mean(np.ravel(val_scores)))
print(f'Average Accuracy: {average_score}')

Average Accuracy: 0.6508882164955139


# Submit to Kaggle

Prepare the data

In [None]:
# Reload the raw test file and rebuild the engineered columns on it.
# add_features already replaces the one missing Fare with 0, so no separate
# fillna is needed here.
df_target = pd.read_csv(path/'test.csv')
add_features(df_target)
# NOTE(review): this cell used to build `dl_test` from a notebook-level `learn`,
# but the only Learner in the visible notebook is local to run_fold, so that line
# raised NameError on a fresh kernel. run_fold already predicts on the test rows
# for every fold; those predictions are collected in `target_preds`.

Get predictions

In [None]:
# Combine the per-fold test predictions gathered during cross-validation.
# No Learner exists at notebook level (it is local to run_fold), so `preds` is
# built from `target_preds`: each entry is one fold's predicted class per test row.
fold_votes = torch.stack(target_preds)  # one row per fold, one column per test row
survived_share = (fold_votes == 1).float().mean(dim=0)  # share of folds voting class 1
# Two columns (class 0, class 1) so that preds[:, 1] is the share voting "survived".
preds = torch.stack([1 - survived_share, survived_share], dim=1)

Create submission file

In [None]:
# Label a passenger as survived when the class-1 column of `preds` exceeds 0.5
survived = (preds[:, 1] > 0.5).int()
df_target['Survived'] = survived

# Keep only the two columns of the submission and write them without the index
sub_df_submission = df_target[['PassengerId', 'Survived']]
sub_df_submission.to_csv('sub2.csv', index=False)

In [None]:
!head sub.csv

In [None]:
!kaggle competitions submit -c titanic -f sub2.csv -m "Fastai Tabular Learner with 10,10 layers. 16 epochs. 0.03 learning rate. 0.5 threshold."