In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm

# 코로나 백신 부작용 증상 목록 생성

In [2]:
SYMPTOMS_df = pd.DataFrame()
VAX_df = pd.DataFrame()

startYear = 2020 # 통합할 데이터의 시작연도
numOfFiles = 2 # 파일의 갯수
for i in tqdm(range(0, numOfFiles, 1)):
    symptoms_df = pd.read_csv('./dataset/AllVAERSDataCSVS/' + str(startYear+i) + 'VAERSSYMPTOMS.csv', encoding='latin')
    vax_df = pd.read_csv('./dataset/AllVAERSDataCSVS/' + str(startYear+i) + 'VAERSVAX.csv', encoding='latin')
    
    SYMPTOMS_df = pd.concat([SYMPTOMS_df, symptoms_df])
    VAX_df = pd.concat([VAX_df, vax_df])
    
    del symptoms_df
    del vax_df

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.41it/s]


In [3]:
# 필요한 피처만 남기고 제거
SYMPTOMS_df = SYMPTOMS_df[['VAERS_ID', 'SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']]
VAX_df = VAX_df[['VAERS_ID', 'VAX_TYPE']]

In [4]:
# VAERS_ID 기준으로 inner join
VAERS_df = pd.merge(SYMPTOMS_df, VAX_df, how='inner', on='VAERS_ID')
VAERS_df.head()

Unnamed: 0,VAERS_ID,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5,VAX_TYPE
0,855017,Arthralgia,Chills,Injection site pain,Pyrexia,,VARZOS
1,855018,Chills,Fatigue,Hypertension,Hypoaesthesia,Injected limb mobility decreased,UNK
2,855018,Muscular weakness,Pain in extremity,Pyrexia,Tremor,Vertigo,UNK
3,855019,Pain,Pruritus,Rash,,,VARZOS
4,855020,Chills,Influenza like illness,Myalgia,Pain in extremity,Pyrexia,VARZOS


In [5]:
# COVID19인 행만 추출
VAERS_df = VAERS_df[VAERS_df['VAX_TYPE'] == 'COVID19']
VAERS_df.head()

Unnamed: 0,VAERS_ID,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5,VAX_TYPE
56963,902418,Hypoaesthesia,Injection site hypoaesthesia,,,,COVID19
56983,902440,Headache,,,,,COVID19
56990,902446,Erythema,Feeling hot,Flushing,,,COVID19
57013,902464,Dizziness,Electrocardiogram normal,Hyperhidrosis,Laboratory test normal,Presyncope,COVID19
57014,902465,Dysgeusia,Oral pruritus,Paraesthesia,Paraesthesia oral,Parosmia,COVID19


In [6]:
covid_symptom_2list = VAERS_df[['SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']].values

In [7]:
# 2차원 리스트를 1차원으로
covid_symptom_list = [element for array in covid_symptom_2list for element in array]

In [8]:
import collections
covid_rank_list = collections.Counter(covid_symptom_list).most_common()[1:]
covid_rank_list

[('Headache', 104383),
 ('Pyrexia', 86837),
 ('Fatigue', 85767),
 ('Chills', 77515),
 ('Pain', 74016),
 ('Dizziness', 60425),
 ('Nausea', 59776),
 ('Pain in extremity', 55030),
 ('Myalgia', 35636),
 ('Injection site pain', 35171),
 ('Arthralgia', 33633),
 ('Rash', 30556),
 ('Dyspnoea', 30176),
 ('Pruritus', 28354),
 ('Injection site erythema', 25446),
 ('Vomiting', 23085),
 ('Asthenia', 22481),
 ('Injection site swelling', 20568),
 ('Erythema', 19500),
 ('Hyperhidrosis', 18957),
 ('Paraesthesia', 18938),
 ('Diarrhoea', 18938),
 ('COVID-19', 18145),
 ('Injection site pruritus', 18096),
 ('Feeling abnormal', 17385),
 ('Hypoaesthesia', 17329),
 ('Urticaria', 17063),
 ('Malaise', 16196),
 ('Lymphadenopathy', 16029),
 ('Chest pain', 15415),
 ('Cough', 14779),
 ('Peripheral swelling', 14090),
 ('SARS-CoV-2 test positive', 13598),
 ('Injection site warmth', 13078),
 ('Syncope', 12167),
 ('Chest discomfort', 11596),
 ('Vaccination site pain', 11406),
 ('Tremor', 11090),
 ('Blood test', 11070),

In [9]:
covid_side_effect_list = []

# 코로나 백신 부작용 추출
for i in range(len(covid_rank_list)):
    if covid_rank_list[i][1] >= 10:
        covid_side_effect_list.append(covid_rank_list[i][0])

In [10]:
f = open('./dataset/covid_side_effect_list.txt', 'w')
f.write('\n'.join(covid_side_effect_list))
f.close()

# 모든 백신 증상 부작용 목록생성

In [11]:
SYMPTOMS_df = pd.DataFrame()
VAX_df = pd.DataFrame()

startYear = 1990 # 통합할 데이터의 시작연도
numOfFiles = 32 # 파일의 갯수
for i in tqdm(range(0, numOfFiles, 1)):
    symptoms_df = pd.read_csv('./dataset/AllVAERSDataCSVS/' + str(startYear+i) + 'VAERSSYMPTOMS.csv', encoding='latin')
    vax_df = pd.read_csv('./dataset/AllVAERSDataCSVS/' + str(startYear+i) + 'VAERSVAX.csv', encoding='latin')
    
    SYMPTOMS_df = pd.concat([SYMPTOMS_df, symptoms_df])
    VAX_df = pd.concat([VAX_df, vax_df])
    
    del symptoms_df
    del vax_df

100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [00:05<00:00,  5.61it/s]


In [12]:
# 필요한 피처만 남기고 제거
SYMPTOMS_df = SYMPTOMS_df[['VAERS_ID', 'SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']]
VAX_df = VAX_df[['VAERS_ID', 'VAX_TYPE']]

In [13]:
# VAERS_ID 기준으로 inner join
VAERS_df = pd.merge(SYMPTOMS_df, VAX_df, how='inner', on='VAERS_ID')
VAERS_df.head()

Unnamed: 0,VAERS_ID,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5,VAX_TYPE
0,25001,Agitation,,,,,DTP
1,25003,Delirium,Hypokinesia,Hypotonia,,,DTP
2,25003,Delirium,Hypokinesia,Hypotonia,,,OPV
3,25004,Chills,Dermatitis contact,Oedema genital,Pelvic pain,,OPV
4,25005,Arthritis,Injection site oedema,Injection site reaction,,,TD


In [14]:
all_symptom_2list = VAERS_df[['SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']].values

In [15]:
# 2차원 리스트를 1차원으로
all_symptom_list = [element for array in all_symptom_2list for element in array]

In [16]:
all_rank_list = collections.Counter(all_symptom_list).most_common()[1:]
all_rank_list

[('Pyrexia', 282489),
 ('Headache', 163599),
 ('Injection site erythema', 162941),
 ('Pain', 152296),
 ('Injection site pain', 123844),
 ('Fatigue', 119372),
 ('Chills', 116802),
 ('Rash', 112731),
 ('Injection site swelling', 109809),
 ('Nausea', 103795),
 ('Pain in extremity', 102071),
 ('Dizziness', 101419),
 ('Erythema', 100870),
 ('No adverse event', 89461),
 ('Pruritus', 79044),
 ('Vomiting', 74242),
 ('Urticaria', 69909),
 ('Injection site warmth', 67053),
 ('Myalgia', 66561),
 ('Arthralgia', 57929),
 ('Dyspnoea', 55409),
 ('Asthenia', 47609),
 ('Diarrhoea', 46330),
 ('Malaise', 40349),
 ('Injection site oedema', 38734),
 ('Swelling', 37831),
 ('Injection site pruritus', 35975),
 ('Paraesthesia', 35973),
 ('Cough', 34090),
 ('Syncope', 33757),
 ('Hyperhidrosis', 33143),
 ('Injection site induration', 31928),
 ('Skin warm', 31418),
 ('Crying', 30828),
 ('Hypoaesthesia', 30514),
 ('Incorrect product storage', 30416),
 ('Injection site hypersensitivity', 29869),
 ('Tremor', 28755),

In [19]:
all_side_effect_list = []

# 모든 백신 부작용 추출
for i in range(len(all_rank_list)):
    if all_rank_list[i][1] >= 10:
        all_side_effect_list.append(all_rank_list[i][0])

In [20]:
f = open('./dataset/all_side_effect_list.txt', 'w')
f.write('\n'.join(all_side_effect_list))
f.close()

# 모든 백신 부작용 ∩ 코로나 백신 부작용c

In [21]:
except_covid_side_effect_no_rank_list = list(set(all_side_effect_list) - set(covid_side_effect_list))

In [22]:
except_covid_rank_list = []

for i in range(len(all_rank_list)):
    if all_rank_list[i][0] in except_covid_side_effect_no_rank_list:
        except_covid_rank_list.append(all_rank_list[i])

In [29]:
except_covid_rank_list

[('Incorrect product storage', 30416),
 ('Convulsion', 25598),
 ('Inappropriate schedule of drug administration', 11500),
 ('Rash generalised', 10455),
 ('Wrong drug administered', 9834),
 ('Anorexia', 8246),
 ('Incorrect storage of drug', 7195),
 ('Drug administered to patient of inappropriate age', 6175),
 ('Local swelling', 5485),
 ('Expired drug administered', 4752),
 ('Intussusception', 4160),
 ('Nuclear magnetic resonance imaging', 3412),
 ('Varicella post vaccine', 3347),
 ('Activities of daily living impaired', 3341),
 ('Sudden infant death syndrome', 3099),
 ('Autism', 3051),
 ('Drug exposure during pregnancy', 3041),
 ('Skin nodule', 2516),
 ('Pharyngolaryngeal pain', 2361),
 ('Pruritus generalised', 2317),
 ('Incorrect route of drug administration', 2259),
 ('Grand mal convulsion', 2173),
 ('Abasia', 2113),
 ('Nuclear magnetic resonance imaging normal', 1897),
 ('Myasthenic syndrome', 1707),
 ('Facial palsy', 1700),
 ('Nuclear magnetic resonance imaging abnormal', 1305),
 ('

In [24]:
except_covid_side_effect_list = []

# 코로나 백신 부작용을 제외한 목록
for i in range(len(except_covid_rank_list)):
    except_covid_side_effect_list.append(except_covid_rank_list[i][0])

In [25]:
f = open('./dataset/except_covid_side_effect_list.txt', 'w')
f.write('\n'.join(except_covid_side_effect_list))
f.close()

In [30]:
len(except_covid_rank_list)

1956