In [120]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

import plotly 
import plotly.express as px
from pathlib import Path
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import warnings
# Silence every warning for the rest of the session (deprecation notices included).
warnings.filterwarnings('ignore')

# Register tqdm's pandas integration (the `progress_apply` family of methods).
tqdm.pandas()
# NOTE(review): sns.set() further down also rewrites matplotlib rcParams, so part of
# the ggplot style selected here is probably overridden — confirm the intended order.
plt.style.use('ggplot')

# rc overrides handed to seaborn: light background (#FFF9ED) for axes and figure,
# dark axis edges (#383838).
rc = {
    "axes.facecolor":"#FFF9ED",
    "figure.facecolor":"#FFF9ED",
    "axes.edgecolor":"#383838"
}

# Apply seaborn's theme with the overrides above.
sns.set(rc=rc)

In [121]:
def _read_without_id(csv_path):
    """Read the CSV at `csv_path` into a DataFrame and drop its `id` column."""
    return pd.read_csv(csv_path).drop(columns='id')


# Train and test sets are read from the notebook's working directory.
df_train = _read_without_id('./train.csv')
df_test = _read_without_id('./test.csv')

In [122]:
# Column dtypes and non-null counts of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15304 entries, 0 to 15303
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             15304 non-null  object 
 1   age                15304 non-null  float64
 2   hypertension       15304 non-null  int64  
 3   heart_disease      15304 non-null  int64  
 4   ever_married       15304 non-null  object 
 5   work_type          15304 non-null  object 
 6   Residence_type     15304 non-null  object 
 7   avg_glucose_level  15304 non-null  float64
 8   bmi                15304 non-null  float64
 9   smoking_status     15304 non-null  object 
 10  stroke             15304 non-null  int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 1.3+ MB


In [123]:
# Column dtypes and non-null counts of the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10204 entries, 0 to 10203
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             10204 non-null  object 
 1   age                10204 non-null  float64
 2   hypertension       10204 non-null  int64  
 3   heart_disease      10204 non-null  int64  
 4   ever_married       10204 non-null  object 
 5   work_type          10204 non-null  object 
 6   Residence_type     10204 non-null  object 
 7   avg_glucose_level  10204 non-null  float64
 8   bmi                10204 non-null  float64
 9   smoking_status     10204 non-null  object 
dtypes: float64(3), int64(2), object(5)
memory usage: 797.3+ KB


In [124]:
# Print the frequency table of every training column, each followed by blank lines.
for column_name, column_values in df_train.items():
    print(column_values.value_counts())
    print('\n')

Female    9446
Male      5857
Other        1
Name: gender, dtype: int64


57.00    353
78.00    337
53.00    311
31.00    310
45.00    309
        ... 
0.16       6
0.08       6
0.48       3
1.30       2
0.68       1
Name: age, Length: 106, dtype: int64


0    14543
1      761
Name: hypertension, dtype: int64


0    14947
1      357
Name: heart_disease, dtype: int64


Yes    10385
No      4919
Name: ever_married, dtype: int64


Private          9752
children         2038
Self-employed    1939
Govt_job         1533
Never_worked       42
Name: work_type, dtype: int64


Rural    7664
Urban    7640
Name: Residence_type, dtype: int64


93.88     33
85.84     33
77.55     32
72.49     31
73.00     31
          ..
229.92     1
61.12      1
86.66      1
116.14     1
77.65      1
Name: avg_glucose_level, Length: 3740, dtype: int64


23.4    173
28.7    163
28.4    157
26.7    150
27.7    138
       ... 
51.0      1
57.3      1
14.0      1
47.0      1
44.1      1
Name: bmi, Length: 407, dtype: i

## df_train['gender'] の数値エンコード（Male → 1、Female / Other → 0）

In [125]:
# Show the gender column as plain text before the integer encoding step.
print(df_train['gender'])

0          Male
1          Male
2        Female
3          Male
4        Female
          ...  
15299    Female
15300    Female
15301    Female
15302      Male
15303    Female
Name: gender, Length: 15304, dtype: object


In [126]:
def gender2int(df):
    """Encode the `gender` column of `df` as integers, in place.

    'Male' becomes 1; 'Female' and 'Other' both become 0. The column is then
    cast to int, so any value that cannot be cast makes the call raise.

    Args:
        df: DataFrame with a `gender` column; that column is overwritten.

    Returns:
        The re-encoded `gender` column of `df`.
    """
    label_to_code = {'Male': 1, 'Female': 0, 'Other': 0}
    # Substitute all labels in one pass, then cast the result to int.
    df['gender'] = df['gender'].replace(label_to_code)
    df['gender'] = df['gender'].astype(int)
    return df['gender']

In [127]:
# gender2int already overwrites df_train['gender'] in place and returns that column,
# so this assignment stores the same encoded values again.
df_train['gender'] = gender2int(df_train)

In [128]:
# Preview the first rows to check the encoded gender column.
df_train.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0
1,1,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0
2,0,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown,0
3,1,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0
4,0,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0


In [129]:
# Re-check dtypes now that gender has been cast to an integer column.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15304 entries, 0 to 15303
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             15304 non-null  int64  
 1   age                15304 non-null  float64
 2   hypertension       15304 non-null  int64  
 3   heart_disease      15304 non-null  int64  
 4   ever_married       15304 non-null  object 
 5   work_type          15304 non-null  object 
 6   Residence_type     15304 non-null  object 
 7   avg_glucose_level  15304 non-null  float64
 8   bmi                15304 non-null  float64
 9   smoking_status     15304 non-null  object 
 10  stroke             15304 non-null  int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 1.3+ MB


# EDA

In [130]:
# One histogram per column, split by the stroke label.
for column_name in df_train.columns:
    if column_name == 'stroke':
        continue  # the target only serves as the colour dimension
    fig = px.histogram(df_train, x=column_name, color="stroke", width=400, height=400)
    fig.show()

In [131]:
# Remove Residence_type from the training features.
# errors='ignore' keeps this cell re-runnable: without it a second execution raises
# KeyError because the column has already been dropped.
# NOTE(review): df_test still carries Residence_type at this point in the notebook —
# make sure it is dropped there as well before the model sees the test set.
df_train = df_train.drop(columns='Residence_type', errors='ignore')

In [132]:
# Preview the frame after Residence_type has been removed.
df_train.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,28.0,0,0,Yes,Private,79.53,31.1,never smoked,0
1,1,33.0,0,0,Yes,Private,78.44,23.9,formerly smoked,0
2,0,42.0,0,0,Yes,Private,103.0,40.3,Unknown,0
3,1,56.0,0,0,Yes,Private,64.87,28.8,never smoked,0
4,0,24.0,0,0,No,Private,73.36,28.8,never smoked,0


# df_train['ever_married'] の数値エンコード（Yes → 1、No → 0）

In [133]:
def ever2int(df):
    """Encode the `ever_married` column of `df` as integers, in place.

    'Yes' becomes 1 and 'No' becomes 0. The column is then cast to int, so any
    value that cannot be cast makes the call raise.

    Args:
        df: DataFrame with an `ever_married` column; that column is overwritten.

    Returns:
        The re-encoded `ever_married` column of `df`.
    """
    answer_to_code = {'Yes': 1, 'No': 0}
    # Substitute both answers in one pass, then cast the result to int.
    df['ever_married'] = df['ever_married'].replace(answer_to_code)
    df['ever_married'] = df['ever_married'].astype(int)
    return df['ever_married']

In [134]:
# ever2int already overwrites df_train['ever_married'] in place and returns that column,
# so this assignment stores the same encoded values again.
df_train['ever_married'] = ever2int(df_train)

In [135]:
# Preview the first rows to check the encoded ever_married column.
df_train.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,28.0,0,0,1,Private,79.53,31.1,never smoked,0
1,1,33.0,0,0,1,Private,78.44,23.9,formerly smoked,0
2,0,42.0,0,0,1,Private,103.0,40.3,Unknown,0
3,1,56.0,0,0,1,Private,64.87,28.8,never smoked,0
4,0,24.0,0,0,0,Private,73.36,28.8,never smoked,0


In [136]:
from sklearn.preprocessing import LabelEncoder

In [137]:
# Integer-encode the remaining categorical columns.
# A separate fitted encoder is kept per column so the exact same mapping can later be
# applied to df_test (encoder.transform) or reversed (encoder.inverse_transform).
label_encoders = {}
for column_name in ('smoking_status', 'work_type'):
    encoder = LabelEncoder()
    df_train[column_name] = encoder.fit_transform(df_train[column_name])
    label_encoders[column_name] = encoder

# `le` remains bound to the encoder fitted on work_type for any later cell that uses it.
le = label_encoders['work_type']
df_train.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,28.0,0,0,1,2,79.53,31.1,2,0
1,1,33.0,0,0,1,2,78.44,23.9,1,0
2,0,42.0,0,0,1,2,103.0,40.3,0,0
3,1,56.0,0,0,1,2,64.87,28.8,2,0
4,0,24.0,0,0,0,2,73.36,28.8,2,0


In [140]:
# Standardise avg_glucose_level with the column's own mean and (sample) standard deviation.
glucose = df_train['avg_glucose_level']
df_train['avg_glucose_level'] = (glucose - glucose.mean()) / glucose.std()

# Distribution of the standardised values, split by the stroke label.
fig = px.histogram(df_train, x='avg_glucose_level', color="stroke", width=400, height=400)
fig.show()

# NN pytorch