In [1]:
# !kaggle competitions download -c spaceship-titanic

In [2]:
# !unzip spaceship-titanic.zip

In [3]:
import pandas as pd

# 1. Load the training data, how many numerical features do you have?

In [4]:
df_train = pd.read_csv('train.csv')

In [5]:
df_train.dtypes

# 6 float64

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

# 2. How many observations do you have in a training set?

In [6]:
df_train.shape[0]

# 8693 training samples

8693

# 3. Select Age groups with a prevalent number of Transported passengers

In [7]:
import numpy as np

In [8]:
def bin_age(x):
    if x < 20:
        return '0-20'
    elif x < 40:
        return '20-40'
    elif x < 60:
        return '40-60'
    else:
        return '60+'

df_train['Age_cat'] = df_train.Age.apply(bin_age)
# np.histogram(df_train.Age, bins=[0, 20, 40, 60, 200])

In [9]:
df_train.groupby('Age_cat')['Transported'].value_counts()

# 0-20, 40-60

Age_cat  Transported
0-20     True           1271
         False           887
20-40    False          2405
         True           2092
40-60    True            806
         False           799
60+      False           224
         True            209
Name: Transported, dtype: int64

# 4. Which HomePlanet has the highest percentage of Transported passengers?

Recall that this is not the same question as "From which planet most of the Transported passengers had come?"



In [10]:
df_train.groupby('HomePlanet')['Transported'].mean()

# Europe has 66% Transported passengers

HomePlanet
Earth     0.423946
Europa    0.658846
Mars      0.523024
Name: Transported, dtype: float64

# 5. Does CryoSleep increase the chances of being Transported?

In [11]:
df_train.groupby('CryoSleep')['Transported'].mean()

# Yes

CryoSleep
False    0.328921
True     0.817583
Name: Transported, dtype: float64

# 6. What is one-hot-encoding?

In [12]:
pd.get_dummies(df_train.HomePlanet)

# Replace feature with k binary features.

Unnamed: 0,Earth,Europa,Mars
0,0,1,0
1,1,0,0
2,0,1,0
3,0,1,0
4,1,0,0
...,...,...,...
8688,0,1,0
8689,1,0,0
8690,1,0,0
8691,0,1,0


# 7. Construct a simple pipeline.

Impute missing values (for numerical features use 0, except for Age, for Age use mean imputation; for categorical features impute with the most frequent value), Standardize numerical features (zero mean and unit variance), Use KNN with 10 neighbors. 
Use 3-fold cross-validation. What is your average classification accuracy?

In [13]:
X_train = df_train.drop(['Name', 'Cabin', 'Age_cat', 'PassengerId', 'Transported'], axis=1)
y_train = df_train['Transported']

categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [14]:
# Fill NA in categorical features

for col in categorical_features:
    fill_val = X_train[col].mode().values[0]
    X_train[col].fillna(str(fill_val), inplace=True)
    X_train[col] = X_train[col].astype(str)

# to do it completely right one needs to identify the most frequent element inside cross-validation,
# but I would accept this as a correct answer

In [15]:
# Fill NA in numerical features

X_train['Age'] = X_train['Age'].fillna(X_train['Age'].mean())
X_train[numerical_features] = X_train[numerical_features].fillna(0)

# to do it completely right one needs to compute mean Age inside cross-validation,
# but I would accept this as a correct answer

In [16]:
X_train[numerical_features].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [17]:
# Standardize numerical features

mean = X_train[numerical_features].mean(axis=0)
std = X_train[numerical_features].std(axis=0)
X_train[numerical_features] = (X_train[numerical_features] - mean) / std

# to do it completely right one needs to compute statistics (mean, std) inside cross-validation,
# but I would accept this as a correct answer

In [18]:
# One-hot encode categorical features

dummy_cols = pd.get_dummies(X_train[categorical_features])
X_train.drop(categorical_features, axis=1, inplace=True)

X_train = pd.merge(X_train, dummy_cols, right_index=True, left_index=True)

In [19]:
X_train.head(3)

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True
0,0.709396,-0.333085,-0.281011,-0.283562,-0.27061,-0.262988,0,1,0,1,0,0,0,1,1,0
1,-0.336698,-0.168064,-0.275371,-0.241757,0.217146,-0.224192,1,0,0,1,0,0,0,1,1,0
2,2.034449,-0.267985,1.959885,-0.283562,5.695295,-0.219783,0,1,0,1,0,0,0,1,0,1


In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [21]:
model = KNeighborsClassifier(10)

np.round(cross_val_score(model, X_train, y_train, cv=3).mean() * 100)

77.0

# 8. Build a Confusion Matrix of your prediction on test folds. 
What are your specificity and sensitivity? (two integers 0-100, divided by a comma)


In [22]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

In [23]:
y_pred = cross_val_predict(model, X_train, y_train, cv=3)
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()

In [24]:
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)

In [25]:
np.round(specificity * 100)

79.0

In [26]:
np.round(sensitivity * 100)

76.0

# 9. Try different values of k for KNN: 3, 5, 10, 15, 30, 50, which k yields the best result in terms of classification accuracy?

In [27]:
from sklearn.model_selection import GridSearchCV

In [28]:
grid = {
    'n_neighbors': [3,5,10,15,30,50]
}

gs = GridSearchCV(KNeighborsClassifier(), param_grid=grid, cv=3)
gs.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [3, 5, 10, 15, 30, 50]})

In [29]:
gs.best_estimator_

KNeighborsClassifier(n_neighbors=15)

In [30]:
gs.best_score_

0.7836203479972896

# 10. If you are minimizing the number of False Positives, which k should you use?

In [31]:
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm

In [32]:
fp_dict = dict()

for k in tqdm(grid['n_neighbors']):
    model = KNeighborsClassifier(k)
    fp_dict[k] = []
    for train, test in KFold(3, shuffle=True, random_state=71).split(X_train, y_train):
        train_X, test_X = X_train.iloc[train], X_train.iloc[test]
        train_y, test_y = y_train[train], y_train[test]
        
        model.fit(train_X, train_y)
        y_pred = model.predict(test_X)
        
        _, fp, _, _ = confusion_matrix(test_y, y_pred).ravel()
        fp_dict[k].append(fp)
    fp_dict[k] = sum(fp_dict[k]) 

  0%|          | 0/6 [00:00<?, ?it/s]

In [33]:
fp_dict

# ~10

{3: 1023, 5: 1035, 10: 893, 15: 1035, 30: 979, 50: 993}

# 11. For the best model in terms of classification accuracy load test data, 
make predictions and submit them on kaggle, does the performance of your model
differ on cross-validation and on the test set, which is better?

In [34]:
df_test = pd.read_csv('test.csv')

In [35]:
X_test = df_test.drop(['Name', 'Cabin', 'PassengerId'], axis=1)

# Preprocess test data in a same way we have preprocessed train data,
# recall that we take all statistics from the train set.
# Typically you want to write a function to preprocess both train and test in a same way.

# Fill NA in categorical features

for col in categorical_features:
    fill_val = df_train[col].mode().values[0]
    X_test[col] = X_test[col].fillna(str(fill_val))
    X_test[col] = X_test[col].astype(str)

# Fill NA in numerical features

X_test['Age'] = X_test['Age'].fillna(df_train['Age'].mean())
X_test[numerical_features] = X_test[numerical_features].fillna(0)

# Standardize numerical features

mean = df_train[numerical_features].mean(axis=0)
std = df_train[numerical_features].std(axis=0)
X_test[numerical_features] = (X_test[numerical_features] - mean) / std

dummy_cols = pd.get_dummies(X_test[categorical_features])
X_test.drop(categorical_features, axis=1, inplace=True)

X_test = pd.merge(X_test, dummy_cols, right_index=True, left_index=True)

columns = X_train.columns
X_test = X_test[columns]

In [36]:
gs.best_estimator_.predict(X_test)

array([ True, False,  True, ...,  True,  True,  True])

In [37]:
df_pred = pd.DataFrame()
df_pred['PassengerId'] = df_test['PassengerId']
df_pred['Transported'] = gs.best_estimator_.predict(X_test)

df_pred.to_csv('pred_knn.csv', index=False)