# 💊 EDA for Drugs, Side Effects, and Medical Conditions

## Import dependencies

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Import Dataset

In [7]:
# Load the raw CSV into `df`, the working DataFrame the cells below operate on.
df = pd.read_csv(filepath_or_buffer='drugs_side_effects_drugs_com.csv')
# Preview the first five rows; as the cell's last expression it gets rich display.
df.head(n=5)

Unnamed: 0,drug_name,medical_condition,side_effects,generic_name,drug_classes,brand_names,activity,rx_otc,pregnancy_category,csa,alcohol,related_drugs,medical_condition_description,rating,no_of_reviews,drug_link,medical_condition_url
0,doxycycline,Acne,"(hives, difficult breathing, swelling in your ...",doxycycline,"Miscellaneous antimalarials, Tetracyclines","Acticlate, Adoxa CK, Adoxa Pak, Adoxa TT, Alod...",87%,Rx,D,N,X,amoxicillin: https://www.drugs.com/amoxicillin...,Acne Other names: Acne Vulgaris; Blackheads; B...,6.8,760.0,https://www.drugs.com/doxycycline.html,https://www.drugs.com/condition/acne.html
1,spironolactone,Acne,hives ; difficulty breathing; swelling of your...,spironolactone,"Aldosterone receptor antagonists, Potassium-sp...","Aldactone, CaroSpir",82%,Rx,C,N,X,amlodipine: https://www.drugs.com/amlodipine.h...,Acne Other names: Acne Vulgaris; Blackheads; B...,7.2,449.0,https://www.drugs.com/spironolactone.html,https://www.drugs.com/condition/acne.html
2,minocycline,Acne,"skin rash, fever, swollen glands, flu-like sym...",minocycline,Tetracyclines,"Dynacin, Minocin, Minolira, Solodyn, Ximino, V...",48%,Rx,D,N,,amoxicillin: https://www.drugs.com/amoxicillin...,Acne Other names: Acne Vulgaris; Blackheads; B...,5.7,482.0,https://www.drugs.com/minocycline.html,https://www.drugs.com/condition/acne.html
3,Accutane,Acne,problems with your vision or hearing; muscle o...,isotretinoin (oral),"Miscellaneous antineoplastics, Miscellaneous u...",,41%,Rx,X,N,X,doxycycline: https://www.drugs.com/doxycycline...,Acne Other names: Acne Vulgaris; Blackheads; B...,7.9,623.0,https://www.drugs.com/accutane.html,https://www.drugs.com/condition/acne.html
4,clindamycin,Acne,hives ; difficult breathing; swelling of your ...,clindamycin topical,"Topical acne agents, Vaginal anti-infectives","Cleocin T, Clindacin ETZ, Clindacin P, Clindag...",39%,Rx,B,N,,doxycycline: https://www.drugs.com/doxycycline...,Acne Other names: Acne Vulgaris; Blackheads; B...,7.4,146.0,https://www.drugs.com/mtm/clindamycin-topical....,https://www.drugs.com/condition/acne.html


## Data Cleaning

### Missing Values Check

In [8]:
# Per-column count of missing entries (`isna` is the canonical spelling of `isnull`).
df.isna().sum()

drug_name                           0
medical_condition                   0
side_effects                      124
generic_name                       43
drug_classes                       82
brand_names                      1213
activity                            0
rx_otc                              1
pregnancy_category                229
csa                                 0
alcohol                          1554
related_drugs                    1469
medical_condition_description       0
rating                           1345
no_of_reviews                    1345
drug_link                           0
medical_condition_url               0
dtype: int64

#### Handle Missing Values

In [9]:
# List the column labels so each one can be assigned a fill strategy below.
column_labels = df.columns
print(column_labels)

Index(['drug_name', 'medical_condition', 'side_effects', 'generic_name',
       'drug_classes', 'brand_names', 'activity', 'rx_otc',
       'pregnancy_category', 'csa', 'alcohol', 'related_drugs',
       'medical_condition_description', 'rating', 'no_of_reviews', 'drug_link',
       'medical_condition_url'],
      dtype='object')


#### Handling text-based missing values

In [11]:
# Text columns whose gaps are replaced by the placeholder label "Unknown".
text_columns = [
    'side_effects',
    'generic_name',
    'drug_classes',
    'brand_names',
    'rx_otc',
    'pregnancy_category',
    'alcohol',
    'related_drugs',
]

# Assign the filled columns back instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary column
# object (chained assignment), is deprecated in pandas 2.x, and leaves `df`
# untouched when Copy-on-Write is enabled.
df[text_columns] = df[text_columns].fillna("Unknown")


#### Handling numerical missing values

In [13]:
# Numeric columns whose gaps are replaced by 0.
numeric_columns = ['rating', 'no_of_reviews']

# Fill with the number 0, not the string "0": a string fill turns these float
# columns into mixed object columns that must be re-parsed afterwards.
# Assigning back (rather than chained `inplace=True`) keeps the update
# reliable under pandas Copy-on-Write.
# NOTE(review): a rating of 0 here means "no rating recorded", not a real
# score - consider excluding rows with no_of_reviews == 0 before averaging.
df[numeric_columns] = df[numeric_columns].fillna(0)

#### Check missing values again

In [14]:
# Recount missing entries per column; every count should now be zero.
df.isna().sum()

drug_name                        0
medical_condition                0
side_effects                     0
generic_name                     0
drug_classes                     0
brand_names                      0
activity                         0
rx_otc                           0
pregnancy_category               0
csa                              0
alcohol                          0
related_drugs                    0
medical_condition_description    0
rating                           0
no_of_reviews                    0
drug_link                        0
medical_condition_url            0
dtype: int64

### Check Duplicated Values

In [15]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

0


### Convert `rating` and `no_of_reviews` to numeric

In [16]:
# Parse both review columns as numbers; anything unparseable becomes NaN
# because of errors='coerce'.
for numeric_col in ('rating', 'no_of_reviews'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Summarise how many columns hold each dtype after the conversion.
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

object     15
float64     2
Name: count, dtype: int64


### Convert `activity` from a percentage string to a fraction (value / 100)

In [22]:
# Convert `activity` from a percentage string such as "87%" to a fraction (0.87).
# The dtype guard makes the cell safe to re-run: once the column is numeric,
# a second run would otherwise divide the values by 100 again.
if not pd.api.types.is_numeric_dtype(df['activity']):
    # Strip all whitespace and the trailing percent sign before parsing.
    activity_text = (
        df['activity']
        .astype(str)
        .str.replace(r'\s+', '', regex=True)
        .str.rstrip('%')
    )
    df['activity'] = activity_text.astype('float') / 100

print(df['activity'])

0       0.87
1       0.82
2       0.48
3       0.41
4       0.39
        ... 
2926    0.02
2927    0.01
2928    0.00
2929    0.00
2930    0.00
Name: activity, Length: 2931, dtype: float64


### Encode `alcohol` as a 0/1 flag ('X' → 1, 'Unknown' → 0)

In [26]:
# Encode `alcohol` as a 0/1 flag: 'X' -> 1, 'Unknown' (the filled-in
# placeholder for missing values) -> 0.
# A single mapping replaces the two separate passes, and the explicit int cast
# pins the dtype instead of leaving it to inference. The cast also fails
# loudly if any value other than 'X' / 'Unknown' / 1 / 0 is present.
# Re-running the cell is harmless because 1 and 0 are left unchanged.
df['alcohol'] = df['alcohol'].replace({'X': 1, 'Unknown': 0}).astype(int)

# Verify the result after encoding; only 1 and 0 should remain.
print(df['alcohol'].unique())

[1 0]


### Other Unique values

In [29]:
# Show the distinct labels of the remaining categorical columns to confirm
# that no unexpected values are left.
for categorical_col in ('rx_otc', 'pregnancy_category'):
    print(df[categorical_col].unique())

['Rx' 'Rx/OTC' 'OTC' 'Unknown']
['D' 'C' 'X' 'B' 'N' 'Unknown' 'A']


### Make a copy of the data and save into csv

In [33]:
# Take an independent (deep) snapshot of the cleaned frame so later edits to
# `df` do not affect it.
df2 = df.copy(deep=True)
# Write the snapshot to disk without the row index.
df2.to_csv(path_or_buf='drugs_side_effects_drugs_com_cleaned.csv', index=False)

### Fix value separators in multi-value columns (TODO: revisit this step)

array(['X', 'Unknown'], dtype=object)

### Check unique values of some columns to see whether any NaN or blank values are left