# Submission 2. Fast-ai

| Variable | Definition                      | Key                                         |
|----------|---------------------------------|---------------------------------------------|
| survival | Survival                        | 0 = No, 1 = Yes                             |
| pclass   | Ticket class                    | 1 = 1st, 2 = 2nd, 3 = 3rd                   |
| sex      | Sex                             |                                             |
| Age      | Age in years                    |                                             |
| sibsp    | # of siblings / spouses aboard the Titanic |                               |
| parch    | # of parents / children aboard the Titanic  |                               |
| ticket   | Ticket number                   |                                             |
| fare     | Passenger fare                  |                                             |
| cabin    | Cabin number                    |                                             |
| embarked | Port of Embarkation             | C = Cherbourg, Q = Queenstown, S = Southampton |

In [1]:
import pandas as pd
from pathlib import Path
import os
from fastai.tabular.all import *
from sklearn.model_selection import KFold

In [2]:
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/titanic')
    !pip install -Uqq fastai
else:
    import zipfile,kaggle
    path = Path('titanic')
    if not path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)



In [3]:
pd.options.display.float_format = '{:.2f}'.format
set_seed(42)

In [4]:
# Read train and test files into pandas dataframes
df = pd.read_csv('train.csv')
df_target = pd.read_csv('test.csv')

# Preprocessing


Feature Engineering

In [5]:
def add_features(df):
    df['Fare'] = df.Fare.fillna(0)
    df['LogFare'] = np.log1p(df['Fare'])
    df['Deck'] = df.Cabin.str[0].map(dict(A="ABC",B="ABC", C="ABC", D="DE", E="DE", F="FG", G="FG"))
    df['Family'] = df.SibSp+df.Parch
    df['Alone'] = df.Family==0
    df['TicketFreq'] = df.groupby('Ticket')['Ticket'].transform('count')
    df['Title'] = df.Name.str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
    df['Title'] = df.Title.map(dict(Mr="Mr",Miss="Miss",Mrs="Mrs",Master="Master"))

add_features(df)
add_features(df_target)

# Modeling with Fast-ai and StratifiedKFold from Sk-learn

In [6]:
# Create splits
splits = RandomSplitter(seed=42)(df)

In [29]:

def run_fold(train_idx, valid_idx, target_df):
    
    # Create DataLoader
    dls = TabularPandas(df,        
        procs = [Categorify, FillMissing, Normalize],
        cat_names=["Sex","Pclass","Embarked","Deck", "Title"],
        cont_names=['Age', 'SibSp', 'Parch', 'LogFare','Alone', 'TicketFreq', 'Family'],
        y_names="Survived", y_block = CategoryBlock(),
        splits=(list(train_idx),list(valid_idx))
    ).dataloaders(path=".")

    # Create learner. Size of each hidden layer ([100,50])
    learn = tabular_learner(dls, layers=[10,10], metrics=accuracy)

    # Train model
    learn.fit_one_cycle(5,1e-2) # Adjust epochs

    # val_pct_now = learn.validate();
    # val_pct_now2 = learn.validate()[1];
    val_score=learn.validate()[1]

    # Get predictions for test set
    target_dl = learn.dls.test_dl(target_df)
    target_pred, _ = learn.get_preds(dl=target_dl)
    target_pred = torch.argmax(target_pred, dim=1)

    
    # Evaluate model - you can return validation loss, accuracy or any other metric
    # return learn.validate()[1], tst_preds#,val_pct, tst_preds
    return val_score, target_pred#,val_pct, tst_preds

In [8]:
# Prepare StratifiedKFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [30]:
# Run K-Fold cross-validation loop
val_scores = []
target_preds = []
for train_idx, valid_idx in kf.split(df):
    val_score, target_pred = run_fold(train_idx, valid_idx, df_target)
    val_scores.append(val_score)
    target_preds.append(target_pred)   

epoch,train_loss,valid_loss,accuracy,time


epoch,train_loss,valid_loss,accuracy,time
0,0.716435,0.661558,0.651685,00:00
1,0.623324,0.571185,0.713483,00:00
2,0.542098,0.466474,0.786517,00:00
3,0.493526,0.450537,0.808989,00:00
4,0.464702,0.439058,0.803371,00:00


epoch,train_loss,valid_loss,accuracy,time
0,0.6836,0.671172,0.589888,00:00
1,0.610723,0.52937,0.752809,00:00
2,0.555333,0.418416,0.848315,00:00
3,0.515162,0.409694,0.837079,00:00
4,0.48903,0.403616,0.837079,00:00


epoch,train_loss,valid_loss,accuracy,time
0,0.815666,0.66524,0.606742,00:00
1,0.696559,0.598398,0.640449,00:00
2,0.608618,0.518062,0.764045,00:00
3,0.547419,0.488492,0.792135,00:00
4,0.50818,0.483164,0.786517,00:00


epoch,train_loss,valid_loss,accuracy,time
0,0.714603,0.667497,0.679775,00:00
1,0.620921,0.531592,0.747191,00:00
2,0.550337,0.430866,0.814607,00:00
3,0.511233,0.407819,0.820225,00:00
4,0.483476,0.405775,0.814607,00:00


In [33]:
# Validation scores
print("Validation Scores: ", val_scores)
print("Average Validation Score: ", sum(val_scores) / len(val_scores))

Validation Scores:  [0.7988826632499695, 0.8033707737922668, 0.8370786309242249, 0.7865168452262878, 0.8146067261695862]
Average Validation Score:  0.8080911278724671


In [36]:
# Convert test_predictions to a DataFrame
test_predictions_df = pd.DataFrame(target_preds).T
test_predictions_df['PassengerId'] = df_target['PassengerId']
test_predictions_df

Unnamed: 0,0,1,2,3,4,PassengerId
0,tensor(0),tensor(0),tensor(0),tensor(0),tensor(0),892
1,tensor(1),tensor(0),tensor(0),tensor(0),tensor(0),893
2,tensor(0),tensor(0),tensor(0),tensor(0),tensor(0),894
3,tensor(0),tensor(0),tensor(0),tensor(0),tensor(0),895
4,tensor(1),tensor(1),tensor(1),tensor(1),tensor(1),896
...,...,...,...,...,...,...
413,tensor(0),tensor(0),tensor(0),tensor(0),tensor(0),1305
414,tensor(1),tensor(1),tensor(1),tensor(1),tensor(1),1306
415,tensor(0),tensor(0),tensor(0),tensor(0),tensor(0),1307
416,tensor(0),tensor(0),tensor(0),tensor(0),tensor(0),1308


In [38]:
# Aggregate test predictions
final_test_predictions = test_predictions_df.mode(axis=1)[0].astype(int)
submission_df = pd.DataFrame({
    'PassengerId': test_predictions_df['PassengerId'],
    'Survived': final_test_predictions
})

In [39]:
# Save the submission file
submission_df.to_csv('submission.csv', index=False)

Create submission file

In [41]:
!head submission.csv

PassengerId,Survived
892,0
893,0
894,0
895,0
896,1
897,0
898,1
899,0
900,1


In [42]:
!kaggle competitions submit -c titanic -f submission.csv -m "Fastai Tabular Learner with 10,10 layers + Kfold CV with SKlearn."

100%|██████████████████████████████████████| 2.77k/2.77k [00:00<00:00, 4.91kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster