In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Adapted from https://imbalanced-learn.readthedocs.io/en/stable/auto_examples/over-sampling/plot_comparison_over_sampling.html
# Original Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT
# Modifiied by: Mike Bopf

from collections import Counter

import matplotlib.pyplot as plt
import numpy as np

from sklearn.svm import LinearSVC

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import RandomOverSampler
from imblearn.base import BaseSampler

In [3]:
%%time
import util_mwb
#df = util_mwb.subset_csv('../../data/csl/CSL_NearMiss.csv', rows=5000, columns=100, types_file='../csl_types.pickle')
#df = util_mwb.subset_csv('../../data/csl/CSL_NearMiss.csv', rows=200000, columns=244, types_file='../csl_types.pickle')
df = util_mwb.read_csl_csv('../../data/csl/CSL_NearMiss.csv', types_file='../csl_types.pickle')

CPU times: user 56.2 s, sys: 898 ms, total: 57.1 s
Wall time: 58 s


In [7]:
df.head()
df['momage']

KeyError: 'momage'

In [116]:
df.shape

(228438, 244)

In [117]:
X = df.iloc[:,5:]
y = df['MomNearMiss'].values
print(X.shape)
print(y.shape)
print(type(y))

(228438, 239)
(228438,)
<class 'numpy.ndarray'>


In [118]:
# y must have multiple classes or SMOTE will error out
pd.Series(y).value_counts()

1    227428
2      1010
dtype: int64

In [119]:
#TEMP fix to NaN issue
if 'depression9' in X.columns:
    X.drop('depression9', axis=1, inplace=True)
if 'heart_disease9' in X.columns:
    X.drop('heart_disease9', axis=1, inplace=True)


In [120]:
#cat_cols = list(X.loc[:, (df.dtypes != np.float) & (df.dtypes != np.object)].index)
#X.loc[:, (df.dtypes != np.float) & (df.dtypes != np.object)]
cat_cols = list(((X.dtypes != np.float) & (X.dtypes != np.object)).values) # <class 'list>'
#cat_cols = [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
print(len(cat_cols))
print(type(cat_cols))
print(X.shape)

237
<class 'list'>
(228438, 237)


In [121]:
# TEMP fix for "features" issue - not why this is necessary. Still get the error occasionally even with this "fix"
# "ValueError: Found array with 0 feature(s) (shape=(5000, 0)) while a minimum of 1 is required"

# ISSUE: I believe this error is because SMOTENC is designed for Categorical AND Numeric data, and likely 
# doesn't work well for solely categorical. See this thread on the scikit-learn development website: 
# https://github.com/scikit-learn-contrib/imbalanced-learn/issues/33 -- on Nov. 13, 2018:
#
# -- lisiqi commented on Nov 13, 2018
# Hi, is it possible to use SMOTENC for only categorical features, within which there are many categorical values?
# -- glemaitre commented on Nov 13, 2018
# Nop. SMOTE-NC is for both categorical and numerical. I think that it should # be another variant for SMOTE to handle solely categorical.

cat_cols[0] = False
#cat_cols

In [122]:
# Verify no columns with null values
for col in X:
    if X[col].isnull().values.any():
    #if X[col].isna().sum() > 0:
        print(col)

In [123]:
%%time
smote_nc = SMOTENC(categorical_features=cat_cols, random_state=0)
X_res, y_res = smote_nc.fit_resample(X, y)
# For 228438 x 244: CPU times: user 1h 59min 16s, sys: 1min 3s, total: 2h 20s; Wall time: 1h 53min 34s
# For 100000 x 244: CPU times: user 48min 30s, sys: 19.5 s, total: 48min 50s; Wall time: 47min 46s
# For 50000 x 244: CPU times: user 24min, sys: 19.4 s, total: 24min 19s; Wall time: 24min 1s
# For 20000 x 95: CPU times: user 4min 32s, sys: 2.49 s, total: 4min 35s; Wall time: 4min 30s
# For 5000 x 93: CPU times: user 1min 13s, sys: 648 ms, total: 1min 14s; Wall time: 1min 10s 

CPU times: user 1h 59min 16s, sys: 1min 3s, total: 2h 20s
Wall time: 1h 53min 34s


In [124]:
print(X.shape)
print(y.shape)

(228438, 237)
(228438,)


In [125]:
print(X_res.shape)
print(y_res.shape)

(454856, 237)
(454856,)


In [142]:
y_res_s = pd.Series(y_res)
xind = X.columns
X_res_df = pd.DataFrame(X_res, columns=xind)

In [149]:
y_res_s.value_counts()

2    227428
1    227428
dtype: int64

In [144]:
X_res_df.head()

Unnamed: 0,Activeherpes,Alcohol,Anteabruption,Anteanemia,Anteasthma,Antebleed3,AnteCHBP,Antechorio,Antefetaldth,Antefetdistress,...,pulmonary_embolism9,renal_disease9,renal_disease_comb,spontlabor,threat_new,threatpb9,uscar,version9,vertex,vertex_new
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,1,1,1,2,1,1,1,1,1,1,...,1,1,1,2,1,1,2,1,9,9
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [145]:
counts = X_res_df.apply(lambda x: x.value_counts()).T.stack()

In [146]:
pd.options.display.max_rows = None
counts

Activeherpes            1.000000     453496.0
                        1.003789          1.0
                        1.006219          1.0
                        1.006424          1.0
                        1.008552          1.0
                        1.016240          1.0
                        1.017242          1.0
                        1.017662          1.0
                        1.021536          1.0
                        1.024071          1.0
                        1.025125          1.0
                        1.025251          1.0
                        1.027827          1.0
                        1.039718          1.0
                        1.057565          1.0
                        1.058198          1.0
                        1.061714          1.0
                        1.064443          1.0
                        1.069689          1.0
                        1.070481          1.0
                        1.072547          1.0
                        1.077062  

In [147]:
X_res_df.to_csv('../../data/csl/smote_Xresult.csv')

In [148]:
y_res_s.to_csv('../../data/csl/smote_yresult.csv')

  """Entry point for launching an IPython kernel.


In [151]:
y_res_s.name = 'MomNearMiss'

In [153]:
y_res_s.head()

0    1
1    1
2    1
3    1
4    1
Name: MomNearMiss, dtype: int64

In [155]:
X_res_df['MomNearMiss'] = y_res_s

In [159]:
X_res_df.head()

Unnamed: 0,Activeherpes,Alcohol,Anteabruption,Anteanemia,Anteasthma,Antebleed3,AnteCHBP,Antechorio,Antefetaldth,Antefetdistress,...,renal_disease9,renal_disease_comb,spontlabor,threat_new,threatpb9,uscar,version9,vertex,vertex_new,MomNearMiss
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,1,1,1,2,1,1,1,1,1,1,...,1,1,2,1,1,2,1,9,9,1
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
