## Classification
## Example: Predict survival on Titanic

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Temporarily suppress warnings to keep the notebook output compact.
# NOTE(review): "ignore" with no category filter silences every warning from
# this point on, which would also hide any scikit-learn convergence warnings
# raised by the models fitted below — confirm this is intended.
import warnings
warnings.filterwarnings("ignore")

## Working with data

In [3]:
# Kaggle Titanic competition data: https://www.kaggle.com/c/titanic/data
# Both CSV files are read from the sibling ../titanic/ directory, train first.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../titanic/train.csv', '../titanic/test.csv')
)

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Pandas object data type is used for text or mixed numeric and non-numeric values.
https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html#text-data-types

In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [None]:
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [6]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### We'll need test_pas_id for submission dataframe

In [7]:
test_pas_id = test['PassengerId']

### Make a list from train and test

In [8]:
# Keep references to both frames so the same imputation can be applied to each
# in a loop. The list holds the DataFrame objects themselves (not copies), so
# in-place edits made through the list are visible via `train` / `test`.
# Rebinding `train` or `test` to a new frame later does not update this list.
full_data=[train, test]

In [9]:
type(full_data)

list

### Impute missing values

#### Embarked

In [10]:
train[train['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [11]:
train[train['Fare']==80.00]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [12]:
train[(train['Fare']>79) & (train['Fare']<81) & (train['Pclass']==1)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
139,140,0,1,"Giglio, Mr. Victor",male,24.0,0,0,PC 17593,79.2,B86,C
256,257,1,1,"Thorne, Mrs. Gertrude Maybelle",female,,0,0,PC 17585,79.2,,C
262,263,0,1,"Taussig, Mr. Emil",male,52.0,1,1,110413,79.65,E67,S
558,559,1,1,"Taussig, Mrs. Emil (Tillie Mandelbaum)",female,39.0,1,1,110413,79.65,E67,S
585,586,1,1,"Taussig, Miss. Ruth",female,18.0,0,2,110413,79.65,E68,S
587,588,1,1,"Frolicher-Stehli, Mr. Maxmillian",male,60.0,1,1,13567,79.2,B41,C
789,790,0,1,"Guggenheim, Mr. Benjamin",male,46.0,0,0,PC 17593,79.2,B82 B84,C
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [13]:
train[(train['Fare']>79) & (train['Fare']<81) & (train['Pclass']==1)].groupby('Embarked').size()

Embarked
C    4
S    3
dtype: int64

In [14]:
# Both passengers with a missing port are first class with Fare == 80.0.
# Among first-class passengers with a fare between 79 and 81, 'C' is the most
# frequent port (4 vs. 3 for 'S'), so 'C' is used as the fill value.
train['Embarked'] = train['Embarked'].fillna('C')

#### Fare

In [15]:
# Median fare over train and test together (NaN-aware);
# it fills the single missing Fare value in the test frame.
fare = np.concatenate([train['Fare'].to_numpy(), test['Fare'].to_numpy()])
fare_median = np.nanmedian(fare)
test['Fare'] = test['Fare'].fillna(fare_median)

####  Age

In [None]:
# We have plenty of missing values in this feature. 
# Generate random numbers between (mean - std) and (mean + std). 

In [16]:
train['Age'].isnull().sum()

177

In [17]:
# Pool the ages from both files and compute NaN-aware statistics;
# they define the range used to fill the missing ages.
age = np.concatenate([train['Age'].to_numpy(), test['Age'].to_numpy()])
age_avg, age_std = np.nanmean(age), np.nanstd(age)

In [18]:
print(age_avg)
print(age_std)

29.881137667304014
14.406601748667825


In [19]:
# Fill every missing Age with a random integer drawn uniformly from
# [mean - std, mean + std), using the pooled train+test statistics.
np.random.seed(0)  # fixed seed so the imputed ages are reproducible

# np.random.randint is documented to take integer bounds, so truncate the
# float limits explicitly instead of relying on implicit conversion.
# The upper bound is exclusive.
age_low = int(age_avg - age_std)
age_high = int(age_avg + age_std)

for dataset in full_data:
    # Compute the mask once: it gives both the number of values to draw and
    # the rows to overwrite (in place, so `train` / `test` are updated too).
    age_is_null = dataset['Age'].isnull()
    age_null_count = age_is_null.sum()
    age_null_random_list = np.random.randint(age_low, age_high, size=age_null_count)
    dataset.loc[age_is_null, 'Age'] = age_null_random_list

In [20]:
train['Age'].isnull().sum()

0

In [21]:
test['Age'].isnull().sum()

0

scikit-learn also provides ready-made functions for imputing missing values: https://scikit-learn.org/stable/auto_examples/impute/plot_missing_values.html?highlight=miss#

### Data preprocessing

In [None]:
# Encoding categorical features
# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-categorical-features

In [None]:
# OneHotEncoder transforms each categorical feature with n_categories possible values into n_categories binary features, 
# with one of them 1, and all others 0.

# If there is a possibility that the training data might have missing categorical features, 
# it can often be better to specify handle_unknown='ignore' (the resulting one-hot encoded columns for unknown category will be all zeros). 

In [None]:
# It is also possible to encode each column into n_categories - 1 columns instead of n_categories columns 
# by using the drop parameter (e.g. drop='if_binary' etc.).
# This is useful to avoid co-linearity in the input matrix in some classifiers. 
# Such functionality is useful, for example, when using non-regularized regression (LinearRegression),
# since co-linearity would cause the covariance matrix to be non-invertible.

In [22]:
# Categorical columns to one-hot encode, selected identically from both frames.
factor_columns = ['Sex', 'Pclass', 'Embarked']
factors_train = train[factor_columns]
factors_test = test[factor_columns]

In [23]:
factors_train

Unnamed: 0,Sex,Pclass,Embarked
0,male,3,S
1,female,1,C
2,female,3,S
3,female,1,S
4,male,3,S
...,...,...,...
886,male,2,S
887,female,1,S
888,female,3,S
889,male,1,C


In [24]:
from sklearn import preprocessing

# Learn the category levels from the training factors; the output indicator
# columns are int32. fit() returns the encoder itself, so it can be chained.
enc = preprocessing.OneHotEncoder(dtype='int32').fit(factors_train)
enc.categories_

[array(['female', 'male'], dtype=object),
 array([1, 2, 3], dtype=int64),
 array(['C', 'Q', 'S'], dtype=object)]

In [25]:
tfactors_train = enc.transform(factors_train).toarray()
tfactors_train[:5,]

array([[0, 1, 0, 0, 1, 0, 0, 1],
       [1, 0, 1, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 1, 0, 0, 1],
       [1, 0, 1, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 1, 0, 0, 1]])

In [26]:
tfactors_test = enc.transform(factors_test).toarray()

In [27]:
# Column names must follow the encoder's output order, i.e. enc.categories_:
# Sex (female, male), Pclass (1, 2, 3), Embarked (C, Q, S).
train_pclass_emb = pd.DataFrame(tfactors_train, columns = ('Female', 'Male','Pclass_1', 'Pclass_2', 'Pclass_3', 'Emb_C', 'Emb_Q', 'Emb_S'))

In [28]:
train_pclass_emb.head()

Unnamed: 0,Female,Male,Pclass_1,Pclass_2,Pclass_3,Emb_C,Emb_Q,Emb_S
0,0,1,0,0,1,0,0,1
1,1,0,1,0,0,1,0,0
2,1,0,0,0,1,0,0,1
3,1,0,1,0,0,0,0,1
4,0,1,0,0,1,0,0,1


In [29]:
# Column names must follow the encoder's output order, i.e. enc.categories_:
# Sex (female, male), Pclass (1, 2, 3), Embarked (C, Q, S).
test_pclass_emb = pd.DataFrame(tfactors_test, columns = ('Female', 'Male','Pclass_1', 'Pclass_2', 'Pclass_3', 'Emb_C', 'Emb_Q', 'Emb_S'))

### Feature Selection

In [30]:
# Remove identifier / free-text columns and the raw categorical columns
# from both frames.
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Sex', 'Pclass', 'Embarked']
train, test = (frame.drop(columns=drop_elements) for frame in (train, test))

In [31]:
# Append the one-hot indicator columns next to the remaining features.
# Column-wise concat matches rows on the index.
train = pd.concat([train, train_pclass_emb], axis='columns')
test = pd.concat([test, test_pclass_emb], axis='columns')

In [32]:
train.columns

Index(['Survived', 'Age', 'SibSp', 'Parch', 'Fare', 'Female', 'Male',
       'Pclass_1', 'Pclass_2', 'Pclass_3', 'Emb_C', 'Emb_Q', 'Emb_S'],
      dtype='object')

In [33]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Age       891 non-null    float64
 2   SibSp     891 non-null    int64  
 3   Parch     891 non-null    int64  
 4   Fare      891 non-null    float64
 5   Female    891 non-null    int32  
 6   Male      891 non-null    int32  
 7   Pclass_1  891 non-null    int32  
 8   Pclass_2  891 non-null    int32  
 9   Pclass_3  891 non-null    int32  
 10  Emb_C     891 non-null    int32  
 11  Emb_Q     891 non-null    int32  
 12  Emb_S     891 non-null    int32  
dtypes: float64(2), int32(8), int64(3)
memory usage: 62.8 KB


In [34]:
trainv = train.values

In [35]:
trainv.shape

(891, 13)

In [36]:
type(trainv)

numpy.ndarray

In [37]:
trainv.dtype

dtype('float64')

In [38]:
# Column 0 of the array is the target (Survived); all remaining columns are features.
y = trainv[:, 0]
X = trainv[:, 1:]

In [39]:
X.shape

(891, 12)

In [40]:
np.unique(y, return_counts=True)

(array([0., 1.]), array([549, 342], dtype=int64))

In [41]:
549/891

0.6161616161616161

In [42]:
from sklearn.model_selection import train_test_split

# Hold out part of the labelled data for validation (75% / 25% by default).
# stratify=y is passed so both parts keep the class proportions of y;
# random_state fixes the shuffle so the split is reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, stratify=y, random_state=2
)
print(Xtrain.shape, Xtest.shape)

(668, 12) (223, 12)


In [43]:
np.unique(ytrain, return_counts = True)

(array([0., 1.]), array([412, 256], dtype=int64))

In [44]:
412/668

0.6167664670658682

In [45]:
np.unique(ytest, return_counts = True)

(array([0., 1.]), array([137,  86], dtype=int64))

In [46]:
137/223

0.6143497757847534

### Data prepared to predict for submission

In [47]:
test.columns

Index(['Age', 'SibSp', 'Parch', 'Fare', 'Female', 'Male', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Emb_C', 'Emb_Q', 'Emb_S'],
      dtype='object')

In [48]:
Xnew = test.values
Xnew.shape

(418, 12)

## Scaling

In [49]:
import sklearn
sklearn.__version__

'0.23.2'

In sklearn 0.23.2, LogisticRegression() uses the 'lbfgs' solver by default; preprocessing the data with a scaler is recommended mainly for fast convergence of the 'sag' and 'saga' solvers. See https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

Decision tree-based estimators are robust to arbitrary scaling of the data.

Conclusion - no need for scaling in this script, but one can try it as an option and compare results for scaled and unscaled data.

In [None]:
# # Standardize features by removing the mean and scaling to unit variance
# # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# # http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-scaler

# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaler.fit(Xtrain)
# Xtrain = scaler.transform(Xtrain)
# Xtest = scaler.transform(Xtest)
# Xnew = scaler.transform(Xnew)

## Modeling

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, classification_report

## LogisticRegression

In [None]:
# http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [51]:
# Model: logistic regression with default hyperparameters.
# NOTE(review): n_jobs=-1 requests all CPU cores, but per the scikit-learn docs
# it is only used when parallelizing over classes (one-vs-rest), so it probably
# has no effect for this binary target — confirm.
model_lgr = LogisticRegression(n_jobs = -1)
print(model_lgr)

LogisticRegression(n_jobs=-1)


In [None]:
# C - Inverse of regularization strength; must be a positive float.
# Smaller values specify stronger regularization.

### Fit the model

In [52]:
model_lgr.fit(Xtrain, ytrain)

LogisticRegression(n_jobs=-1)

### Model fit parameters

In [53]:
model_lgr.coef_

array([[-0.03200008, -0.22913404, -0.10711079,  0.00329675,  1.6110418 ,
        -1.10956244,  1.0644184 ,  0.3580632 , -0.92100224,  0.32129491,
         0.21633158, -0.03614713]])

In [54]:
# Flatten the (1, n_features) coefficient matrix to 1-D without hard-coding
# the number of features.
model_lgr.coef_.ravel()

array([-0.03200008, -0.22913404, -0.10711079,  0.00329675,  1.6110418 ,
       -1.10956244,  1.0644184 ,  0.3580632 , -0.92100224,  0.32129491,
        0.21633158, -0.03614713])

In [55]:
# Label each coefficient with its feature name: every column of `train`
# except the first one ('Survived'), in the same order as the columns of X.
# ravel() flattens coef_ without hard-coding the number of features.
params = pd.Series(model_lgr.coef_.ravel(), index=train.columns[1:])
params

Age        -0.032000
SibSp      -0.229134
Parch      -0.107111
Fare        0.003297
Female      1.611042
Male       -1.109562
Pclass_1    1.064418
Pclass_2    0.358063
Pclass_3   -0.921002
Emb_C       0.321295
Emb_Q       0.216332
Emb_S      -0.036147
dtype: float64

In [56]:
model_lgr.intercept_

array([0.63144529])

### Model validation

In [57]:
# Predictions on the training part: class probabilities and hard class labels
ypred_train_proba = model_lgr.predict_proba(Xtrain)
ypred_train = model_lgr.predict(Xtrain)

In [58]:
# Predictions on the held-out part: hard class labels and class probabilities.
# ypred_proba[:, 0] - probability for class zero (not survived),
# ypred_proba[:, 1] - probability for class one (survived)
ypred = model_lgr.predict(Xtest)
ypred_proba = model_lgr.predict_proba(Xtest)

# Show the first few labels, then the first few probability rows
print(ypred[:10])
print(ypred_proba[:5, :])

[0. 0. 0. 1. 0. 1. 1. 1. 0. 1.]
[[0.71368425 0.28631575]
 [0.84474859 0.15525141]
 [0.90132431 0.09867569]
 [0.48283003 0.51716997]
 [0.93994431 0.06005569]]


#### Metrics: accuracy, confusion matrix, classification report, AUC
#### http://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics

In [59]:
# We can check our classification accuracy by comparing 
# the true values of the test set to the predictions:

In [60]:
# Accuracy on train
accuracy_score(ytrain, ypred_train)

0.8023952095808383

In [61]:
# Accuracy on test
accuracy_score(ytest, ypred)

0.7937219730941704

In [62]:
# Score for classification models is accuracy
model_lgr.score(Xtest, ytest)

0.7937219730941704

In [None]:
# Accuracy doesn't tell us where we've gone wrong:
# one nice way to see that is to use the confusion matrix

In [63]:
print(confusion_matrix(ytest, ypred))

[[113  24]
 [ 22  64]]


In [64]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html
# Per-class precision / recall / f1 on the held-out part, with readable class names.
target_names = ['not survived', 'survived']
report = classification_report(ytest, ypred, target_names=target_names)
print(report)

              precision    recall  f1-score   support

not survived       0.84      0.82      0.83       137
    survived       0.73      0.74      0.74        86

    accuracy                           0.79       223
   macro avg       0.78      0.78      0.78       223
weighted avg       0.79      0.79      0.79       223



In [None]:
# Macro average (averaging the unweighted mean per label), 
# weighted average (averaging the support-weighted mean per label).

In [65]:
# AUC
# Column 1 of each predict_proba result holds the probability estimates
# of the positive class.
train_auc = roc_auc_score(ytrain, ypred_train_proba[:, 1])
test_auc = roc_auc_score(ytest, ypred_proba[:, 1])

print("AUC on train =", train_auc)
print("AUC on test =", test_auc)

AUC on train = 0.8601003109830097
AUC on test = 0.8351722967238161


In [66]:
# Mean cross-validation accuracy over 5 folds of the training part
# http://scikit-learn.org/stable/modules/cross_validation.html
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=model_lgr, X=Xtrain, y=ytrain, cv=5)
np.mean(scores)

0.7904836718662327

### Hyperparameters Grid Search

In [67]:
# http://scikit-learn.org/stable/modules/grid_search.html#grid-search

from sklearn.model_selection import GridSearchCV

# GridSearchCV exhaustively considers all parameter combinations in the grid;
# here the only tuned parameter is C, scored with 5-fold cross-validation.
param_grid = {'C': [.001, .01, 1, 10]}
lgr = LogisticRegression(random_state=1)
grid = GridSearchCV(estimator=lgr, param_grid=param_grid, cv=5, n_jobs=-1)

In [68]:
grid.fit(Xtrain, ytrain)

GridSearchCV(cv=5, estimator=LogisticRegression(random_state=1), n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 1, 10]})

In [69]:
# Mean cross-validated score of the best_estimator
grid.best_score_

0.7904836718662327

In [70]:
grid.best_params_

{'C': 1}

In [71]:
model = grid.best_estimator_

In [72]:
print(model)

LogisticRegression(C=1, random_state=1)


In [73]:
model.score(Xtest,ytest)

0.7937219730941704

In [74]:
model.score(Xtrain,ytrain)

0.8023952095808383

### Save / load a model

In [75]:
# https://scikit-learn.org/stable/modules/model_persistence.html
# https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/
import joblib
joblib.dump(model, 'model.joblib') 

['model.joblib']

In [76]:
# Restore the estimator from the file written by joblib.dump.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that come from a trusted source.
model1 = joblib.load('model.joblib') 

In [77]:
print(model1)

LogisticRegression(C=1, random_state=1)


In [78]:
ypred = model1.predict(Xtest)
ypred[:10]

array([0., 0., 0., 1., 0., 1., 1., 1., 0., 1.])

## RandomForestClassifier

In [None]:
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.ensemble import RandomForestClassifier

In [None]:
# rfc = RandomForestClassifier(random_state = ...)

In [None]:
# Fit the model


In [None]:
# Model accuracy on train and on test data


In [None]:
# Make a conclusion for obtained results: underfitting, overfitting or appropriate fitting 

In [None]:
# Features importance
# pd.Series(rfc.feature_importances_, index=train.columns[1:])

In [None]:
# Predict on test

In [None]:
# Print classification report

In [None]:
# Mean cross-validation accuracy

#### Hyperparameters Grid Search

In [None]:
# What is our aim in hyperparameters grid search? 
# What hyperparameters can we take for grid search to reach our aim?

In [None]:
# from sklearn.model_selection import GridSearchCV
# param_grid = {...}
# grid = GridSearchCV(RandomForestClassifier(random_state = 2), param_grid, cv=5)

In [None]:
# Fit grid


In [None]:
# Mean cross-validated score of the best_estimator


In [None]:
# Parameter max_depth of the best_estimator


In [None]:
# Accuracy of the best_estimator on train and test

### GradientBoostingClassifier

In [None]:
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Continue by analogy with RandomForestClassifier

### Submission

In [None]:
# Generate Submission File 

# Use model with the best accuracy on test to predict on Xnew (ypred_Xnew should be int)

# Example: ypred_Xnew = model_lgr.predict(Xnew).astype(int)

# submission = pd.DataFrame({ 'PassengerId': test_pas_id,
#                             'Survived': ypred_Xnew })
# submission.to_csv("submission.csv", index=False)

1) Register on https://www.kaggle.com
2) Go to https://www.kaggle.com/c/titanic/submit
3) Submit your csv file and get the score (accuracy)