In [None]:
import numpy as np
import pandas as pd

import sklearn.decomposition
import sklearn.ensemble
import sklearn.metrics
import sklearn.preprocessing
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by later cells.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the Titanic passenger table from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/data/titanic.csv')

# Drop the identifier / free-text columns that are not used as features.
df = df.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])

# Preview the frame. This must be the last expression of the cell:
# a bare `df.head()` in the middle of a cell is evaluated and silently discarded.
df.head()

In [None]:
# Let pandas pick the best-fitting dtype with pd.NA support for every column.
df = df.convert_dtypes()

In [None]:
# Inspect the dtype assigned to each column.
df.dtypes

Survived      Int64
Pclass        Int64
Sex          string
Age         Float64
SibSp         Int64
Parch         Int64
Fare        Float64
Embarked     string
dtype: object

In [None]:
# Count the missing values in each column.
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [None]:
# Names of the columns whose dtype is string (the categorical ones).
categorical_columns = list(df.select_dtypes(include="string").columns)

# Row 0 of DataFrame.mode() gives one most-frequent value per column.
most_frequent_values1 = df[categorical_columns].mode().iloc[0]

# Fill each categorical column's missing entries with that column's mode.
df[categorical_columns] = df[categorical_columns].fillna(value=most_frequent_values1)

In [None]:
# Columns still typed as string are the categorical features to encode.
categorical_features = list(df.select_dtypes(include="string").columns)
print(categorical_features)

# One-hot encode them; each new indicator column is named "<feature>_<value>".
df = pd.get_dummies(data=df, prefix=categorical_features)
df

['Sex', 'Embarked']


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,1,3,26.0,0,0,7.925,1,0,0,0,1
3,1,1,35.0,1,0,53.1,1,0,0,0,1
4,0,3,35.0,0,0,8.05,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,0,0,13.0,0,1,0,0,1
887,1,1,19.0,0,0,30.0,1,0,0,0,1
888,0,3,,1,2,23.45,1,0,0,0,1
889,1,1,26.0,0,0,30.0,0,1,1,0,0


In [None]:
# Select the floating-point columns and keep their names in numeric_columns.
numeric = df.select_dtypes(include = float)
numeric_columns = numeric.columns
numeric_columns

Index(['Age', 'Fare'], dtype='object')

In [None]:
# Convert the float columns to plain NumPy float64 so missing values become NaN.
df[numeric_columns] = df[numeric_columns].astype(float)

# Fill missing values by linear interpolation along the row index.
# limit_direction="both" also fills NaNs that appear before the first valid value;
# with the default direction those leading gaps would be left as NaN.
# NOTE(review): interpolation uses neighbouring rows, which assumes row order is
# meaningful for these columns — confirm, or consider a median/mean imputation.
df[numeric_columns] = df[numeric_columns].interpolate(method = "linear", limit_direction = "both")
df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.2500,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,1,3,26.0,0,0,7.9250,1,0,0,0,1
3,1,1,35.0,1,0,53.1000,1,0,0,0,1
4,0,3,35.0,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,0,0,13.0000,0,1,0,0,1
887,1,1,19.0,0,0,30.0000,1,0,0,0,1
888,0,3,22.5,1,2,23.4500,1,0,0,0,1
889,1,1,26.0,0,0,30.0000,0,1,1,0,0


In [None]:
# find the outliers
# Uses the sklearn.ensemble module imported at the top of the notebook instead of a
# second, mid-notebook import. contamination = 0.01 sets the expected outlier share to 1%;
# random_state is fixed so the labels are repeatable.
isoforest = sklearn.ensemble.IsolationForest(n_estimators = 1000, contamination = 0.01, random_state = 0)

# fit_predict labels every row: 1 for inliers, -1 for outliers.
# NOTE(review): df still contains the target column 'Survived' at this point, so the
# label takes part in the outlier scoring — confirm this is intended.
res = isoforest.fit_predict(df.to_numpy())

res

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1

In [None]:
# PCA
# A float n_components asks PCA to keep enough components to explain that share of
# the variance (here 99.99%). Each row is then scored by its reconstruction error.
pca = sklearn.decomposition.PCA(n_components = 0.9999)

X_pca = pca.fit_transform(df)
X_ori = pca.inverse_transform(X_pca)

# Per-row L1 reconstruction error. The explicit float cast avoids doing the
# arithmetic on a possibly object-dtype array, which a frame mixing nullable
# and NumPy dtypes may produce.
anomaly_score = np.abs(df.to_numpy(dtype = float) - X_ori).sum(1)
print(anomaly_score[0])

# NOTE(review): outliers are only scored here, never removed — the filter
# `df = df[~(res == -1)]` was left commented out. The train/test shapes recorded
# further down (705 + 177 rows) should be checked against len(df) on a fresh
# Restart & Run All — confirm whether filtering was intended.

df

0.20980894024817953


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.2500,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,1,3,26.0,0,0,7.9250,1,0,0,0,1
3,1,1,35.0,1,0,53.1000,1,0,0,0,1
4,0,3,35.0,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,0,0,13.0000,0,1,0,0,1
887,1,1,19.0,0,0,30.0000,1,0,0,0,1
888,0,3,22.5,1,2,23.4500,1,0,0,0,1
889,1,1,26.0,0,0,30.0000,0,1,1,0,0


In [None]:
# get last quantile

# Rows whose reconstruction error exceeds the 99th percentile are treated as anomalous.
threshold = np.quantile(anomaly_score, 0.99)

# flatnonzero always yields a 1-D array of positions. argwhere(...).squeeze()
# collapses to a 0-d array when exactly one row is above the threshold, which
# changes what df.iloc returns.
anomalous_ids = np.flatnonzero(anomaly_score > threshold)

df.iloc[anomalous_ids]

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
301,1,3,29.333333,2,0,23.25,0,1,0,1,0
501,0,3,21.0,0,0,7.75,1,0,0,1,0
502,0,3,29.0,0,0,7.6292,1,0,0,1,0
510,1,3,29.0,0,0,7.75,0,1,0,1,0
654,0,3,18.0,0,0,6.75,1,0,0,1,0
657,0,3,32.0,1,1,15.5,1,0,0,1,0
680,0,3,31.5,0,0,8.1375,1,0,0,1,0
767,0,3,30.5,0,0,7.75,1,0,0,1,0
828,1,3,31.5,0,0,7.75,0,1,0,1,0


In [None]:
# Features: every column except the target. drop() keeps the frame's own column
# order, whereas building the list from a set gives an order that can change
# between kernel sessions (string hashing is randomised per process), which makes
# the feature matrix, and any model fitted on it, irreproducible.
x = df.drop(columns = ['Survived'])

# Target: the survival label.
y = df['Survived']

In [None]:
# Standardise each feature column; fit_transform returns a NumPy array, so x is
# rebound from a DataFrame to an array here.
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split performed later in the notebook, so test rows contribute to the
# scaling statistics — consider fitting on the training split only.
scaler = sklearn.preprocessing.StandardScaler()

x = scaler.fit_transform(x)
x

array([[ 0.60791876,  0.44967516,  0.73800982, ..., -0.56131034,
        -0.30262875, -0.73800982],
       [-1.64495664,  0.44967516, -1.35499552, ...,  0.59474687,
        -0.30262875,  1.35499552],
       [ 0.60791876, -0.46841162, -1.35499552, ..., -0.27229604,
        -0.30262875,  1.35499552],
       ...,
       [ 0.60791876,  0.44967516, -1.35499552, ..., -0.52518355,
        -0.30262875,  1.35499552],
       [-1.64495664, -0.46841162,  0.73800982, ..., -0.27229604,
        -0.30262875, -0.73800982],
       [-1.64495664, -0.46841162,  0.73800982, ...,  0.16122542,
         3.30437875, -0.73800982]])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
# Report the shape of each split, one per line.
print(X_train.shape, X_test.shape, sep="\n")

(705, 10)
(177, 10)


In [None]:
# Random forest with 100 trees. random_state is pinned (same seed as the
# train/test split) so the fitted model and the reported metrics are reproducible;
# without it every run draws different bootstrap samples and feature subsets.
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Train the random forest on the training split.
rf.fit(X_train,y_train)

In [None]:
# Predict labels for the held-out test rows.
y_estim = rf.predict(X_test)

In [None]:
# Confusion matrix of true vs. predicted labels on the test split
# (rows are true classes, columns are predicted classes).
conf_mat = sklearn.metrics.confusion_matrix(y_test, y_estim)
conf_mat

array([[95, 21],
       [ 8, 53]])

In [None]:
# Fraction of test rows whose predicted label matches the true label.
acc = sklearn.metrics.accuracy_score(y_test, y_estim)
acc

0.8361581920903954