In [45]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy

In [46]:
# Load the raw travel-insurance table from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer='travel_insurance.csv')

In [47]:
# Keep only the float64/int64 columns, then drop every row that still has a NaN.
df = (
    df
    .select_dtypes(include=['float64', 'int64'])
    .dropna()
)

In [48]:
# Preview the first five rows of the numeric-only frame.
df.head(n=5)

Unnamed: 0,Duration,Net Sales,Commision (in value),Age,Claim
0,61,19.8,11.88,29,0
1,93,63.0,0.0,36,0
2,22,22.0,0.0,25,0
3,14,54.5,13.63,24,0
4,90,10.0,0.0,23,0


In [49]:
# Print column dtypes and non-null counts for the filtered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48260 entries, 0 to 48259
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Duration              48260 non-null  int64  
 1   Net Sales             48260 non-null  float64
 2   Commision (in value)  48260 non-null  float64
 3   Age                   48260 non-null  int64  
 4   Claim                 48260 non-null  int64  
dtypes: float64(2), int64(3)
memory usage: 2.2 MB


In [51]:
# Class balance of the target column over the whole dataset.
df.loc[:, 'Claim'].value_counts()

0    47552
1      708
Name: Claim, dtype: int64

In [82]:
from sklearn.model_selection import train_test_split

In [83]:
# Target is the Claim column; every other column is a feature.
y = df['Claim']
X = df.drop(columns='Claim')

In [84]:
# Hold out 33% of the rows for testing; the seed keeps the split repeatable.
# NOTE(review): the split is not stratified although Claim is heavily imbalanced;
# consider stratify=y (this would change the class counts shown below).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [85]:
# Class counts of the training labels before any resampling.
y_train.value_counts()

0    31878
1      456
Name: Claim, dtype: int64

## ClusterCentroids

In [86]:
from imblearn.under_sampling import ClusterCentroids
# ClusterCentroids replaces the majority class with cluster centroids (KMeans by
# default), and the clustering is randomly initialised. Fix random_state so the
# generated centroids -- and the row look-ups further down -- are the same on every run.
undersampler = ClusterCentroids(random_state=42)

In [87]:
# Resample the training split only.
# NOTE(review): despite the `_smote` suffix these hold the *undersampled* data
# produced by `undersampler`, not SMOTE output.
X_smote, y_smote = undersampler.fit_resample(X=X_train, y=y_train)

In [88]:
# Class counts of the resampled training labels.
y_smote.value_counts()

1    456
0    456
Name: Claim, dtype: int64

In [97]:
# Display the resampled feature frame.
X_smote

Unnamed: 0,Duration,Net Sales,Commision (in value),Age
0,46,22.679381,9.116392e+00,33
1,4843,0.292857,8.285714e-02,48
2,372,215.854878,5.396390e+01,60
3,116,117.000000,3.552714e-15,39
4,21,10.898639,5.672789e-01,25
...,...,...,...,...
907,223,20.000000,0.000000e+00,36
908,365,204.600000,1.329900e+02,35
909,68,148.500000,8.910000e+01,33
910,90,46.000000,0.000000e+00,36


In [101]:
# Original non-claim rows with Duration 46 and Age 33, to set against the resampled frame.
df.loc[
    df['Duration'].eq(46)
    & df['Age'].eq(33)
    & df['Claim'].eq(0)
]

Unnamed: 0,Duration,Net Sales,Commision (in value),Age,Claim
2583,46,10.0,0.0,33,0
3195,46,0.0,35.64,33,0
7561,46,13.0,0.0,33,0
19499,46,10.0,0.0,33,0
34054,46,17.0,4.25,33,0
35424,46,112.0,0.0,33,0
40058,46,10.0,0.0,33,0
43255,46,64.35,16.09,33,0
43484,46,-26.5,6.63,33,0


In [102]:
# Resampled rows with Duration 46 and Age 33.
X_smote.loc[
    X_smote['Duration'].eq(46)
    & X_smote['Age'].eq(33)
]

Unnamed: 0,Duration,Net Sales,Commision (in value),Age
0,46,22.679381,9.116392,33


In [103]:
# Original claim rows with Duration 76 and Age 36, to set against the resampled frame.
df.loc[
    df['Duration'].eq(76)
    & df['Age'].eq(36)
    & df['Claim'].eq(1)
]

Unnamed: 0,Duration,Net Sales,Commision (in value),Age,Claim
8110,76,70.0,0.0,36,1


In [104]:
# Resampled rows with Duration 76 and Age 36.
X_smote.loc[
    X_smote['Duration'].eq(76)
    & X_smote['Age'].eq(36)
]

Unnamed: 0,Duration,Net Sales,Commision (in value),Age
911,76,70.0,0.0,36


## RandomUnderSampler

In [105]:
# Rebuild features/target and repeat the seeded 67/33 split (same arguments as the earlier split).
y = df['Claim']
X = df.drop(columns='Claim')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Training class counts before resampling.
y_train.value_counts()

0    31878
1      456
Name: Claim, dtype: int64

In [106]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop majority-class rows until the minority/majority ratio is 1.0.
# random_state is fixed because the rows kept are chosen at random; without a seed
# every run of this cell would keep a different subset.
under = RandomUnderSampler(sampling_strategy=1.0, random_state=42)

# Resample the training split only.
X_smote, y_smote = under.fit_resample(X_train, y_train)

In [107]:
# Class counts of the resampled training labels.
y_smote.value_counts()

1    456
0    456
Name: Claim, dtype: int64

In [110]:
# Show the original training label counts again, for comparison with the resampled counts.
y_train.value_counts()

0    31878
1      456
Name: Claim, dtype: int64

## NearMiss

In [112]:
# Rebuild features/target and repeat the seeded 67/33 split (same arguments as the earlier split).
y = df['Claim']
X = df.drop(columns='Claim')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Training class counts before resampling.
y_train.value_counts()

0    31878
1      456
Name: Claim, dtype: int64

In [113]:
from imblearn.under_sampling import NearMiss

In [115]:
# NearMiss, version 1, using 3 neighbours (selection rule: see the imbalanced-learn docs).
undersampler = NearMiss(
    version=1,
    n_neighbors=3,
)

# Resample the training split only.
X_smote, y_smote = undersampler.fit_resample(X_train, y_train)

In [116]:
# Class counts of the resampled training labels.
y_smote.value_counts()

1    456
0    456
Name: Claim, dtype: int64

In [117]:
# Show the original training label counts again, for comparison with the resampled counts.
y_train.value_counts()

0    31878
1      456
Name: Claim, dtype: int64