In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
from mlxtend.preprocessing import minmax_scaling
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the combined dataset; the path is resolved relative to the notebook's working directory.
original_df = pd.read_csv(filepath_or_buffer='../Datasets/Combined_Dataset.csv')
# Preview the first five rows as a quick sanity check on the load.
original_df.head(n=5)

Unnamed: 0,State,Cov_Ent_Type,Individuals_Affected,Type_of_Breach,Loc_of_Breached_Information,Business_Associate_Present
0,CO,Healthcare Provider,26609.0,Hacking/IT Incident,Email,No
1,WI,Healthcare Provider,907.0,Unauthorized Access/Disclosure,Email,No
2,NH,Healthcare Provider,34878.0,Hacking/IT Incident,Network Server,No
3,CA,Healthcare Provider,500.0,Theft,Paper/Films,No
4,OH,Healthcare Provider,2716.0,Hacking/IT Incident,"Laptop, Network Server",No


# Handling Null Values

In [4]:
# Count missing entries per column to see which features need imputation.
# Every feature except Business_Associate_Present contains at least one null.
original_df.isna().sum()

State                            16
Cov_Ent_Type                   1087
Individuals_Affected             23
Type_of_Breach                    9
Loc_of_Breached_Information       6
Business_Associate_Present        0
dtype: int64

In [6]:
# Inspect each column's dtype: Individuals_Affected is the only numeric (float64)
# feature; every other column is stored as object.
original_df.dtypes

State                           object
Cov_Ent_Type                    object
Individuals_Affected           float64
Type_of_Breach                  object
Loc_of_Breached_Information     object
Business_Associate_Present      object
dtype: object

In [12]:
# Fill missing State entries with the most frequently occurring state.
state_values = original_df[['State']]
imputer_state = SimpleImputer(strategy='most_frequent', missing_values=np.nan).fit(state_values)
original_df['State'] = imputer_state.transform(state_values)
# Sanity check: the null count for State should now be 0.
original_df['State'].isna().sum()

0

In [15]:
# Replace missing covered-entity types with the column's most frequent category.
# NOTE(review): 1087 rows are null here, so all of them receive the same modal
# value — confirm this does not distort the feature's distribution too much.
imputer_covered_entity = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
original_df['Cov_Ent_Type'] = imputer_covered_entity.fit_transform(original_df[['Cov_Ent_Type']])
# Sanity check: the null count for Cov_Ent_Type should now be 0.
original_df['Cov_Ent_Type'].isna().sum()

0

In [17]:
# Fill missing Individuals_Affected counts with the column mean.
# NOTE(review): the mean is sensitive to extreme values; if this column is
# heavily skewed, strategy='median' may be a better fit — confirm.
affected_values = original_df[['Individuals_Affected']]
imputer_individuals_affected = SimpleImputer(strategy='mean', missing_values=np.nan)
imputer_individuals_affected.fit(affected_values)
original_df['Individuals_Affected'] = imputer_individuals_affected.transform(affected_values)
# Sanity check: the null count for Individuals_Affected should now be 0.
original_df['Individuals_Affected'].isna().sum()

0

In [19]:
# Fill missing Type_of_Breach entries with the most frequent breach type.
breach_type_values = original_df[['Type_of_Breach']]
imputer_type_of_breach = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imputer_type_of_breach.fit(breach_type_values)
original_df['Type_of_Breach'] = imputer_type_of_breach.transform(breach_type_values)
# Sanity check: the null count for Type_of_Breach should now be 0.
original_df['Type_of_Breach'].isna().sum()

0

In [20]:
# Fill missing Loc_of_Breached_Information entries with the most frequent location.
breach_location_values = original_df[['Loc_of_Breached_Information']]
imputer_location_of_breached_info = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imputer_location_of_breached_info.fit(breach_location_values)
original_df['Loc_of_Breached_Information'] = imputer_location_of_breached_info.transform(breach_location_values)
# Sanity check: the null count for Loc_of_Breached_Information should now be 0.
original_df['Loc_of_Breached_Information'].isna().sum()

0

In [21]:
# Final check across the whole frame: every column should report zero nulls after imputation.
original_df.isna().sum()

State                          0
Cov_Ent_Type                   0
Individuals_Affected           0
Type_of_Breach                 0
Loc_of_Breached_Information    0
Business_Associate_Present     0
dtype: int64

# Standardizing the Numerical Columns
- The below code is focusing on standardizing the Individuals_Affected Feature such that the mean of the values is 0 and the standard deviation is 1.
- This will aid our model when we feed these features into the SVM model for hypothesis testing and evaluation.

In [24]:
# Standardize Individuals_Affected to zero mean and unit variance.
# `std` keeps the fitted scaler; `X` holds the scaled values as a single-column 2-D array.
affected_feature = original_df[['Individuals_Affected']]
std = StandardScaler().fit(affected_feature)
X = std.transform(affected_feature)

[[-0.03465029]
 [-0.05462788]
 [-0.02822298]
 ...
 [-0.02812816]
 [-0.05303835]
 [-0.0548906 ]]


# Handling Categorical Variables