# Load modules

In [None]:
import sys, os
import gc
import matplotlib.pyplot as plt

In [None]:
# Trigger a garbage-collection pass; the call returns the number of
# unreachable objects it found.
gc.collect()

# 不均衡データに対する対処

## under sampling

### データの作成

In [None]:
# ----------------------------------------
# Load modules
# ----------------------------------------
import numpy as np
import pandas as pd

# Fix the global NumPy seed so the sample data below is reproducible.
np.random.seed(57)


# ----------------------------------------
# Build the imbalanced sample data
# ----------------------------------------
# Class sizes: negatives (y=0) first, positives (y=1) second.
y0_n_samples, y1_n_samples = 950, 50

# Feature column: negatives drawn from N(0, 5), positives from N(5, 3).
# Negatives are drawn first so the seeded random stream is consumed in the
# same order as before.
x0_arr = np.random.normal(loc=0, scale=5, size=y0_n_samples)
x1_arr = np.random.normal(loc=5, scale=3, size=y1_n_samples)

# Label column: one constant label per sample.
y0_arr = [0] * y0_n_samples
y1_arr = [1] * y1_n_samples

# One frame per class (negative, then positive).
df0 = pd.DataFrame(dict(x=x0_arr, y=y0_arr))
df1 = pd.DataFrame(dict(x=x1_arr, y=y1_arr))

# Stack both classes, then shuffle the rows and renumber the index from 0.
before_df = pd.concat([df0, df1], axis=0)
before_df = before_df.sample(frac=1).reset_index(drop=True)

### under samplingの実行

In [None]:
# Report the class balance before under-sampling.
print('*' * 20)
print('before')

# Check the shape.
print('shape:', before_df.shape)

# Check the class counts of y.
print('yの比率:')
print(before_df['y'].value_counts())
print('*' * 20 + '\n')

# Run under-sampling.
# Positive/negative ratio, printed for reference only.
frac = len(df1) / len(df0)
print('frac:', frac)

# Draw exactly as many negatives as there are positives. Using n= avoids
# deriving the row count from a rounded float fraction, and random_state
# makes the draw repeatable on its own. The sample is stored under a new
# name so df0 keeps the full negative set and this cell can be re-run
# without shrinking its own input.
sampled_df0 = df0.sample(n=len(df1), random_state=57)
after_df = pd.concat([sampled_df0, df1], axis=0).reset_index(drop=True)

# Report the class balance after under-sampling.
print('*' * 20)
print('after')

# Check the shape.
print('shape:', after_df.shape)

# Check the class counts of y.
print('yの比率:')
print(after_df['y'].value_counts())
print('*' * 20)

## over sampling

### データの作成

In [None]:
# ----------------------------------------
# Load modules
# ----------------------------------------
import numpy as np
import pandas as pd

# over samplingライブラリ
from imblearn.over_sampling import SMOTE

# Fix the global NumPy seed so the generated data is reproducible.
np.random.seed(57)


# ----------------------------------------
# Build the imbalanced sample data
# ----------------------------------------
# Number of rows per class.
# Negative class (y=0)
y0_n_samples = 950
# Positive class (y=1)
y1_n_samples = 50

# Draw the feature values; the negative draw comes first so the seeded
# random stream is consumed in the same order as before.
x0_arr = np.random.normal(0, 5, size=y0_n_samples)
x1_arr = np.random.normal(5, 3, size=y1_n_samples)

# Matching label lists, one constant label per row.
y0_arr, y1_arr = [0] * y0_n_samples, [1] * y1_n_samples

# Negative-class frame.
df0 = pd.DataFrame(data={'x': x0_arr, 'y': y0_arr})

# Positive-class frame.
df1 = pd.DataFrame(data={'x': x1_arr, 'y': y1_arr})

# Concatenate both classes row-wise, shuffle all rows, and reset the index.
before_df = pd.concat([df0, df1], axis=0)
before_df = before_df.sample(frac=1)
before_df = before_df.reset_index(drop=True)

### 説明変数分布の確認

In [None]:
# Histogram of x over the combined data (both classes), 20 bins.
before_df['x'].hist(bins=20)

In [None]:
# Histogram of x for the negative-class frame df0, 10 bins.
df0['x'].hist(bins=10)

In [None]:
# Histogram of x for the positive-class frame df1, 10 bins.
df1['x'].hist(bins=10)

### over samplingの実行

In [None]:
# ----------------------------------------
# Load modules
# ----------------------------------------
import numpy as np
import pandas as pd

# over samplingライブラリ
from imblearn.over_sampling import SMOTE

In [None]:
# Report the class balance before over-sampling.
print('*' * 20, 'before', sep='\n')

# Check the shape.
print(f'shape: {before_df.shape}')

# Check the class counts of y.
print('yの比率:')
print(before_df['y'].value_counts())
print('*' * 20, end='\n\n')

# Run over-sampling.
# Positive/negative ratio; it is only printed and is not passed to SMOTE.
frac = len(df1) / len(df0)
print(f'frac: {frac}')

# Resample with SMOTE using the single feature column x and the label y.
sm = SMOTE(random_state=57)
after_x_arr, after_y_arr = sm.fit_resample(before_df[['x']], before_df['y'])

# Rebuild a two-column frame from the resampled feature and labels;
# column 0 of the resampled feature matrix is x.
after_df = pd.DataFrame({'x': after_x_arr.values[:, 0], 'y': after_y_arr})

# Report the class balance after over-sampling.
print('*' * 20, 'after', sep='\n')

# Check the shape.
print(f'shape: {after_df.shape}')

# Check the class counts of y.
print('yの比率:')
print(after_df['y'].value_counts())
print('*' * 20)

### 説明変数分布の確認

In [None]:
# Split the resampled data by label: rows with y == 0 and rows with y == 1.
after_df0 = after_df.loc[after_df['y'].eq(0)]
after_df1 = after_df.loc[after_df['y'].eq(1)]

In [None]:
# Histogram of x over the whole resampled frame after_df, 20 bins.
after_df['x'].hist(bins=20)

In [None]:
# Histogram of x for after_df0, 10 bins.
after_df0['x'].hist(bins=10)

In [None]:
# Histogram of x for after_df1, 10 bins.
after_df1['x'].hist(bins=10)