# Imports

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
from numpy import where

# Preprocessing

In [2]:
# Pandas display configuration: never elide columns, allow rows up to 130
# characters wide, and render floats with thousands separators and three
# decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 130)
pd.set_option('display.float_format', '{:,.3f}'.format)

###1. read data
# Both CSV files are expected in the current working directory.
train_path, test_path = 'train.csv', 'test.csv'
data = pd.read_csv(train_path)
data_test = pd.read_csv(test_path)

# Summary statistics of the raw training frame.
print('\n#data.describe')
train_summary = data.describe()
print(train_summary)

# Pairwise column correlations of the raw training frame.
print('\n#corr')
train_corr = data.corr()
print(train_corr)


#data.describe
                1           2           3           4           5           6           7         8         9        10  \
count 234,929.000 234,929.000 146,554.000 146,874.000 147,782.000 141,285.000 143,020.000 9,927.000 9,926.000 5,847.000   
mean        0.576      65.319      65.213     116.401      97.710      20.745      36.999     0.042     0.388   131.350   
std         0.494      14.277      15.071      30.241      32.837       5.219      10.271     0.103     1.031   104.768   
min         0.000      20.000       0.000       0.000       0.000       0.000       0.000     0.000     0.000     0.100   
25%         0.000      57.000      56.000     101.000      82.000      18.000      36.400     0.010     0.100    46.040   
50%         1.000      67.000      65.000     114.000      96.000      20.000      36.900     0.020     0.200   109.600   
75%         1.000      76.000      74.000     130.000     112.000      24.000      37.400     0.040     0.400   194.100   


In [3]:
###2. patient's count
# Keep only the rows recorded at day 0 and hour 0; every other row is blanked
# to NaN by `where`, so `count()` below tallies just the matching rows.
# NOTE(review): this equals the patient count only if each patient has exactly
# one (day 0, hr 0) row - confirm against the data description.
at_day_zero = data['timestamp(day)'] == 0
at_hour_zero = data['timestamp(hr)'] == 0
d_patient = data.where(at_day_zero & at_hour_zero)

print('\n#patient count')
n_patients = d_patient['timestamp(day)'].count()
print(n_patients) #4156


#patient count
4156


In [4]:
###3. remove rows whose 'original' value is missing (NaN)
# The same row filter is applied to the training and the test frame.
required_cols = ['original']
data, data_test = (
    frame.dropna(subset=required_cols, how='any', axis=0)
    for frame in (data, data_test)
)

In [5]:
###4. imputation
#ref from https://www.kaggle.com/code/inversion/get-started-with-mean-imputation
# (the reference title mentions mean imputation; this cell fills each column
# with its most frequent value instead).

imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Learn the per-column fill values from the training data and apply them.
data[:] = imp.fit_transform(data)

# Apply the fill values learned from the training data to the test data.
# Refitting on the test frame would impute it from its own statistics, which
# leaks test-set information into preprocessing and makes train and test
# inconsistent.
# NOTE(review): assumes test.csv has the same columns, in the same order, as
# train.csv - confirm.
data_test[:] = imp.transform(data_test)


In [6]:
###5. replace -4,-5 => -3
# Collapse the values -5 and -4 into -3 in both frames.
# NOTE(review): `replace` is applied to the whole frame, so any column holding
# -5 or -4 is affected, not only 'timestamp(day)' - confirm this is intended.
for merged_value in (-5, -4):
    data = data.replace(merged_value, -3)
    data_test = data_test.replace(merged_value, -3)

In [7]:
###6. data balancing: under_sampling
#ref from: https://medium.com/grabngoinfo/four-oversampling-and-under-sampling-methods-for-imbalanced-classification-using-python-7304aedf9037

# Positional indices 0..36: the columns handed to the sampler as features.
feature_positions = list(range(37))

# Training data: balance the 'timestamp(day)' classes by random
# under-sampling with a fixed seed.
# NOTE(review): per the original "#index=34" remark the target column sits
# inside the selected positions, so `x` also contains 'timestamp(day)'; the
# EDA cell reads it from `x`, but it should be dropped before model fitting.
rus = RandomUnderSampler(random_state=42)
x, y = rus.fit_resample(
    data.iloc[:, feature_positions],
    data['timestamp(day)'],  # index=34
)

# Test data: same column selection and the same seeded under-sampling.
# NOTE(review): resampling the test set changes its class distribution -
# confirm this is wanted for evaluation.
rus = RandomUnderSampler(random_state=42)
x_test, y_test = rus.fit_resample(
    data_test.iloc[:, feature_positions],
    data_test['timestamp(day)'],
)



# EDA


In [8]:
###7. part 1 answer
# Central tendency of the 'timestamp(day)' column in the under-sampled frame.
day_values = x['timestamp(day)']
print('\n#timestamp(day) mean')
print(day_values.mean())
print('\n#timestamp(day) median')
print(day_values.median())

# Summary statistics of every column in the under-sampled frame.
print('\n# x.describe')
print(x.describe())

# Compute the pairwise correlation matrix once and reuse it below; the
# original recomputed it for the 'timestamp(day)' slice.
x_corr = x.corr()
print('\n# x.corr')
print(x_corr)

# Correlation of each column with 'timestamp(day)', kept as a one-column frame.
corr_with_day = x_corr[['timestamp(day)']]
print('\n#corr_with_day')
print(corr_with_day)


#timestamp(day) mean
-1.5

#timestamp(day) median
-1.5

# x.describe
                1           2           3           4           5           6           7           8           9          10  \
count 106,192.000 106,192.000 106,192.000 106,192.000 106,192.000 106,192.000 106,192.000 106,192.000 106,192.000 106,192.000   
mean        0.606      65.694      64.385     115.615      96.594      20.622      36.976       0.012       0.116       5.075   
std         0.489      14.025      13.333      31.168      34.780       4.723       2.564       0.021       0.273      29.862   
min         0.000      20.000       0.000       0.000       0.000       0.000       3.700       0.000       0.000       0.200   
25%         0.000      58.000      57.000     104.000      85.000      18.000      36.500       0.010       0.100       0.800   
50%         1.000      69.000      61.000     110.000      91.000      20.000      37.000       0.010       0.100       0.800   
75%         1.000      75.0


#corr_with_day
                timestamp(day)
1                        0.008
2                        0.053
3                       -0.016
4                       -0.061
5                        0.041
6                        0.008
7                        0.011
8                        0.009
9                       -0.000
10                       0.019
11                       0.026
12                       0.022
13                      -0.010
14                      -0.013
15                       0.018
16                       0.017
17                       0.000
18                      -0.004
19                       0.001
20                      -0.005
21                       0.005
22                      -0.002
23                       0.012
24                      -0.004
25                      -0.012
26                      -0.010
27                       0.007
28                       0.008
29                       0.005
30                      -0.003
31                     

# Model Selection



# Model Tuning



# Model Evaluation / Metrics