## 1. import

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import font_manager,rc
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

## 2. Pre processing

In [2]:
patient = pd.read_csv('C:/Users/PJH/OneDrive/Python/Example/covid19/Covid19_dataset/Patientinfo.csv')
patient = patient.copy()

In [3]:
patient.head()

Unnamed: 0,patient_id,global_num,sex,birth_year,age,country,province,city,disease,infection_case,infection_order,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state
0,1000000001,2.0,male,1964,50s,Korea,Seoul,Gangseo-gu,,overseas inflow,1.0,,75,2020-01-22,2020-01-23,2020-02-05,,released
1,1000000002,5.0,male,1987,30s,Korea,Seoul,Jungnang-gu,,overseas inflow,1.0,,31,,2020-01-30,2020-03-02,,released
2,1000000003,6.0,male,1964,50s,Korea,Seoul,Jongno-gu,,contact with patient,2.0,2002000001.0,17,,2020-01-30,2020-02-19,,released
3,1000000004,7.0,male,1991,20s,Korea,Seoul,Mapo-gu,,overseas inflow,1.0,,9,2020-01-26,2020-01-30,2020-02-15,,released
4,1000000005,9.0,female,1992,20s,Korea,Seoul,Seongbuk-gu,,contact with patient,2.0,1000000002.0,2,,2020-01-31,2020-02-24,,released


### 가. 확진자 유형 그룹핑

In [4]:
# 감염유형별로 분류 후 patient_id 로 counting하여 유형별 집계
g_patient = patient.groupby(['infection_case'])['patient_id'].count()
g_patient = pd.DataFrame(g_patient)
g_patient.head()

Unnamed: 0_level_0,patient_id
infection_case,Unnamed: 1_level_1
Bonghwa Pureun Nursing Home,31
Changnyeong Coin Karaoke,4
Cheongdo Daenam Hospital,21
Coupang Logistics Center,70
Dongan Church,17


In [5]:
# 집계 수치 10 이상 해당 인덱스 추출
morethan10 = g_patient[g_patient['patient_id']>10].index.tolist()
morethan10

['Bonghwa Pureun Nursing Home',
 'Cheongdo Daenam Hospital',
 'Coupang Logistics Center',
 'Dongan Church',
 "Eunpyeong St. Mary's Hospital",
 'Guro-gu Call Center',
 'Gyeongsan Jeil Silver Town',
 'Gyeongsan Seorin Nursing Home',
 'Itaewon Clubs',
 'Milal Shelter',
 'Ministry of Oceans and Fisheries',
 'Onchun Church',
 'Seongdong-gu APT',
 'Shincheonji Church',
 'contact with patient',
 'etc',
 'gym facility in Cheonan',
 'overseas inflow']

In [6]:
# 집계 수치 10 이상 인덱스 데이터 생성
patient_group = patient[patient['infection_case'].isin(morethan10)]
patient_group.shape

(3158, 18)

In [7]:
# 확진자 유형별 재분류 그룹핑
def grouping(x):
    if x == 'overseas inflow':
        y = 'overseas'
    elif x == 'contact with patient':
        y = 'individual'
    elif (x == 'etc')|(pd.isna(x)):
        y = 'unknown'
    else:
        y = 'group'
    return y

In [8]:
patient_group['infection_category'] = patient['infection_case'].apply(lambda x: grouping(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [9]:
print(patient_group.shape)
patient_group.head()

(3158, 19)


Unnamed: 0,patient_id,global_num,sex,birth_year,age,country,province,city,disease,infection_case,infection_order,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state,infection_category
0,1000000001,2.0,male,1964,50s,Korea,Seoul,Gangseo-gu,,overseas inflow,1.0,,75,2020-01-22,2020-01-23,2020-02-05,,released,overseas
1,1000000002,5.0,male,1987,30s,Korea,Seoul,Jungnang-gu,,overseas inflow,1.0,,31,,2020-01-30,2020-03-02,,released,overseas
2,1000000003,6.0,male,1964,50s,Korea,Seoul,Jongno-gu,,contact with patient,2.0,2002000001.0,17,,2020-01-30,2020-02-19,,released,individual
3,1000000004,7.0,male,1991,20s,Korea,Seoul,Mapo-gu,,overseas inflow,1.0,,9,2020-01-26,2020-01-30,2020-02-15,,released,overseas
4,1000000005,9.0,female,1992,20s,Korea,Seoul,Seongbuk-gu,,contact with patient,2.0,1000000002.0,2,,2020-01-31,2020-02-24,,released,individual


### 나. 유동성 유형 그룹핑

In [10]:
route = pd.read_csv('C:/Users/PJH/OneDrive/Python/Example/covid19/Covid19_dataset/PatientRoute.csv')

In [11]:
route.head()

Unnamed: 0,patient_id,global_num,date,province,city,type,latitude,longitude
0,1000000001,2.0,2020-01-22,Gyeonggi-do,Gimpo-si,airport,37.615246,126.715632
1,1000000001,2.0,2020-01-24,Seoul,Jung-gu,hospital,37.567241,127.005659
2,1000000002,5.0,2020-01-25,Seoul,Seongbuk-gu,etc,37.59256,127.017048
3,1000000002,5.0,2020-01-26,Seoul,Seongbuk-gu,store,37.59181,127.016822
4,1000000002,5.0,2020-01-26,Seoul,Seongdong-gu,public_transportation,37.563992,127.029534


In [12]:
#이동경로 유형
route['type'].unique()

array(['airport', 'hospital', 'etc', 'store', 'public_transportation',
       'restaurant', 'church', 'beauty_salon', 'pharmacy', 'pc_cafe',
       'bank', 'academy', 'cafe', 'bakery', 'bar', 'gym', 'school',
       'real_estate_agency', 'karaoke', 'post_office', 'gas_station',
       'park', 'lodging', 'university', 'administrative_area_level_1'],
      dtype=object)

In [13]:
#이동경로 결측치 수 확인
route['type'].isna().sum()

0

In [14]:
# 필수이동 유형 추출 및 그룹핑
necessary = ['school', 'university', 'administrative_area_level_1', 
             'governmnet_office', 'post_office',
             'hospital', 'pharmacy']

In [15]:
# 이동 유형(필수, 부수) 분류
def group_necessary(x):
    if x in necessary:
        y = 'necessary'
    else:
        y = 'un_necessary'
    return y

route['necessary_group'] = route['type'].apply(lambda x : group_necessary(x))
route.head()

Unnamed: 0,patient_id,global_num,date,province,city,type,latitude,longitude,necessary_group
0,1000000001,2.0,2020-01-22,Gyeonggi-do,Gimpo-si,airport,37.615246,126.715632,un_necessary
1,1000000001,2.0,2020-01-24,Seoul,Jung-gu,hospital,37.567241,127.005659,necessary
2,1000000002,5.0,2020-01-25,Seoul,Seongbuk-gu,etc,37.59256,127.017048,un_necessary
3,1000000002,5.0,2020-01-26,Seoul,Seongbuk-gu,store,37.59181,127.016822,un_necessary
4,1000000002,5.0,2020-01-26,Seoul,Seongdong-gu,public_transportation,37.563992,127.029534,un_necessary


In [16]:
route['value'] = 0
route_group = route.groupby(['patient_id', 'necessary_group'])['value'].count().unstack()
route_group = pd.DataFrame(route_group)

In [17]:
route_group.fillna(0, inplace = True)
route_group.reset_index(inplace = True)
route_group = route_group[['patient_id', 'necessary', 'un_necessary']]
route_group

necessary_group,patient_id,necessary,un_necessary
0,1000000001,1.0,1.0
1,1000000002,2.0,11.0
2,1000000003,0.0,2.0
3,1000000004,1.0,0.0
4,1000000005,1.0,0.0
...,...,...,...
1467,6100000086,2.0,2.0
1468,6100000087,3.0,5.0
1469,6100000088,5.0,19.0
1470,6100000089,1.0,1.0


In [18]:
data = pd.merge(patient_group, route_group, left_on = 'patient_id', right_on = 'patient_id', how = 'left')

In [19]:
data.head()

Unnamed: 0,patient_id,global_num,sex,birth_year,age,country,province,city,disease,infection_case,...,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state,infection_category,necessary,un_necessary
0,1000000001,2.0,male,1964,50s,Korea,Seoul,Gangseo-gu,,overseas inflow,...,,75,2020-01-22,2020-01-23,2020-02-05,,released,overseas,1.0,1.0
1,1000000002,5.0,male,1987,30s,Korea,Seoul,Jungnang-gu,,overseas inflow,...,,31,,2020-01-30,2020-03-02,,released,overseas,2.0,11.0
2,1000000003,6.0,male,1964,50s,Korea,Seoul,Jongno-gu,,contact with patient,...,2002000001.0,17,,2020-01-30,2020-02-19,,released,individual,0.0,2.0
3,1000000004,7.0,male,1991,20s,Korea,Seoul,Mapo-gu,,overseas inflow,...,,9,2020-01-26,2020-01-30,2020-02-15,,released,overseas,1.0,0.0
4,1000000005,9.0,female,1992,20s,Korea,Seoul,Seongbuk-gu,,contact with patient,...,1000000002.0,2,,2020-01-31,2020-02-24,,released,individual,1.0,0.0


In [20]:
#필수 or 부수적 이동자 결측치 확인
data[(data['necessary'].isna()) | (data['un_necessary'].isna())].shape

(1815, 21)

In [21]:
#필수 or 부수적 이동자 결측치 제거
data = data[(data['necessary'].notna()) & (data['un_necessary'].notna()) ]

In [22]:
data

Unnamed: 0,patient_id,global_num,sex,birth_year,age,country,province,city,disease,infection_case,...,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state,infection_category,necessary,un_necessary
0,1000000001,2.0,male,1964,50s,Korea,Seoul,Gangseo-gu,,overseas inflow,...,,75,2020-01-22,2020-01-23,2020-02-05,,released,overseas,1.0,1.0
1,1000000002,5.0,male,1987,30s,Korea,Seoul,Jungnang-gu,,overseas inflow,...,,31,,2020-01-30,2020-03-02,,released,overseas,2.0,11.0
2,1000000003,6.0,male,1964,50s,Korea,Seoul,Jongno-gu,,contact with patient,...,2002000001,17,,2020-01-30,2020-02-19,,released,individual,0.0,2.0
3,1000000004,7.0,male,1991,20s,Korea,Seoul,Mapo-gu,,overseas inflow,...,,9,2020-01-26,2020-01-30,2020-02-15,,released,overseas,1.0,0.0
4,1000000005,9.0,female,1992,20s,Korea,Seoul,Seongbuk-gu,,contact with patient,...,1000000002,2,,2020-01-31,2020-02-24,,released,individual,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3107,6100000086,,male,1966,50s,Korea,Gyeongsangnam-do,Changwon-si,,etc,...,,,2020-03-15,2020-03-19,,,isolated,unknown,2.0,2.0
3108,6100000087,,female,1967,50s,Korea,Gyeongsangnam-do,Changwon-si,,etc,...,,,2020-03-16,2020-03-21,,,isolated,unknown,3.0,5.0
3109,6100000088,,male,1994,20s,Korea,Gyeongsangnam-do,Changwon-si,,etc,...,,,2020-03-06,2020-03-22,2020-04-19,,released,unknown,5.0,19.0
3110,6100000089,,male,1960,60s,Korea,Gyeongsangnam-do,Haman-gun,,etc,...,6002000035,,2020-03-22,2020-03-23,,,released,unknown,1.0,1.0


### 다. 접촉 빈도 유형 그룹핑

In [23]:
data['contact_number'].describe()

count     334
unique     66
top         0
freq       44
Name: contact_number, dtype: object

In [24]:
#결측치 제거
data = data[data['contact_number'].notna()]

In [25]:
data.shape

(334, 21)

In [26]:
data.head()

Unnamed: 0,patient_id,global_num,sex,birth_year,age,country,province,city,disease,infection_case,...,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state,infection_category,necessary,un_necessary
0,1000000001,2.0,male,1964,50s,Korea,Seoul,Gangseo-gu,,overseas inflow,...,,75,2020-01-22,2020-01-23,2020-02-05,,released,overseas,1.0,1.0
1,1000000002,5.0,male,1987,30s,Korea,Seoul,Jungnang-gu,,overseas inflow,...,,31,,2020-01-30,2020-03-02,,released,overseas,2.0,11.0
2,1000000003,6.0,male,1964,50s,Korea,Seoul,Jongno-gu,,contact with patient,...,2002000001.0,17,,2020-01-30,2020-02-19,,released,individual,0.0,2.0
3,1000000004,7.0,male,1991,20s,Korea,Seoul,Mapo-gu,,overseas inflow,...,,9,2020-01-26,2020-01-30,2020-02-15,,released,overseas,1.0,0.0
4,1000000005,9.0,female,1992,20s,Korea,Seoul,Seongbuk-gu,,contact with patient,...,1000000002.0,2,,2020-01-31,2020-02-24,,released,individual,1.0,0.0


### 라. 증상발현 ~ 확진(diagnosis time)유형 분류

In [27]:
#결측치 확인
print("symptom_onset_date 결측 데이터 수:", data['symptom_onset_date'].isna().sum())
print("confirmed_date 결측 데이터 수:" , data['confirmed_date'].isna().sum())

symptom_onset_date 결측 데이터 수: 204
confirmed_date 결측 데이터 수: 0


In [28]:

data = data[data['symptom_onset_date'].notna()]

In [29]:
data.shape

(130, 21)

In [30]:
##날짜 데이터 포맷 변환
data['symptom_onset_date'] = pd.to_datetime(data['symptom_onset_date'])
data['confirmed_date'] = pd.to_datetime(data['confirmed_date'])

In [31]:
data['interval_date'] = data['confirmed_date'] - data['symptom_onset_date']

In [32]:
data['interval_date'] = data['interval_date'].apply(lambda x: round(x.total_seconds()/3600))
data['interval_date'] = data['interval_date'].apply(lambda x: int(x/24))

In [33]:
data.head()

Unnamed: 0,patient_id,global_num,sex,birth_year,age,country,province,city,disease,infection_case,...,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state,infection_category,necessary,un_necessary,interval_date
0,1000000001,2.0,male,1964,50s,Korea,Seoul,Gangseo-gu,,overseas inflow,...,75,2020-01-22,2020-01-23,2020-02-05,,released,overseas,1.0,1.0,1
3,1000000004,7.0,male,1991,20s,Korea,Seoul,Mapo-gu,,overseas inflow,...,9,2020-01-26,2020-01-30,2020-02-15,,released,overseas,1.0,0.0,4
13,1000000014,30.0,female,1952,60s,Korea,Seoul,Jongno-gu,,contact with patient,...,27,2020-02-06,2020-02-16,2020-03-12,,released,individual,8.0,5.0,10
14,1000000015,40.0,male,1943,70s,Korea,Seoul,Seongdong-gu,,Seongdong-gu APT,...,8,2020-02-11,2020-02-19,,,released,group,5.0,4.0,8
301,1000000302,8669.0,male,2009,10s,Korea,Seoul,Dongjak-gu,,overseas inflow,...,8,2020-03-19,2020-03-20,,,released,overseas,1.0,0.0,1


## 3. 분류모델 디자인을 위한 데이터 생성

In [34]:
primary_cleansing_data = data[['patient_id', 'interval_date', 
                               'contact_number', 'necessary', 
                               'un_necessary', 'infection_category']]
primary_cleansing_data = primary_cleansing_data.reset_index(drop = True)
primary_cleansing_data.head()

Unnamed: 0,patient_id,interval_date,contact_number,necessary,un_necessary,infection_category
0,1000000001,1,75,1.0,1.0,overseas
1,1000000004,4,9,1.0,0.0,overseas
2,1000000014,10,27,8.0,5.0,individual
3,1000000015,8,8,5.0,4.0,group
4,1000000302,1,8,1.0,0.0,overseas


In [35]:
#infection_category numeric 변수로 변환
primary_cleansing_data_infection_numeric = pd.get_dummies(primary_cleansing_data['infection_category'])
pd.DataFrame(primary_cleansing_data_infection_numeric)
primary_cleansing_data.head()

Unnamed: 0,patient_id,interval_date,contact_number,necessary,un_necessary,infection_category
0,1000000001,1,75,1.0,1.0,overseas
1,1000000004,4,9,1.0,0.0,overseas
2,1000000014,10,27,8.0,5.0,individual
3,1000000015,8,8,5.0,4.0,group
4,1000000302,1,8,1.0,0.0,overseas


In [36]:
primary_cleansing_data = primary_cleansing_data.drop(['infection_category', 'patient_id'], axis=1)
primary_cleansing_data.head()

Unnamed: 0,interval_date,contact_number,necessary,un_necessary
0,1,75,1.0,1.0
1,4,9,1.0,0.0
2,10,27,8.0,5.0
3,8,8,5.0,4.0
4,1,8,1.0,0.0


In [37]:
primary_cleansing_data = pd.concat([primary_cleansing_data, primary_cleansing_data_infection_numeric], axis=1)
primary_cleansing_data.head()

Unnamed: 0,interval_date,contact_number,necessary,un_necessary,group,individual,overseas,unknown
0,1,75,1.0,1.0,0,0,1,0
1,4,9,1.0,0.0,0,0,1,0
2,10,27,8.0,5.0,0,1,0,0
3,8,8,5.0,4.0,1,0,0,0
4,1,8,1.0,0.0,0,0,1,0


In [38]:
# '-'포함된 데이터 숫자로
primary_cleansing_data.loc[(primary_cleansing_data['contact_number']=='-')]=1 

In [39]:
# 로 데이터 저장
primary_cleansing_data.to_csv("data_set_raw.csv")