In [1]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
from mlxtend.preprocessing import minmax_scaling
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Read the combined dataset from its path relative to this notebook and
# preview the first rows.
dataset_path = '../Datasets/Combined_Dataset.csv'
original_df = pd.read_csv(dataset_path)
original_df.head()

Unnamed: 0,State,Cov_Ent_Type,Individuals_Affected,Type_of_Breach,Loc_of_Breached_Information,Business_Associate_Present
0,CO,Healthcare Provider,26609.0,Hacking/IT Incident,Email,No
1,WI,Healthcare Provider,907.0,Unauthorized Access/Disclosure,Email,No
2,NH,Healthcare Provider,34878.0,Hacking/IT Incident,Network Server,No
3,CA,Healthcare Provider,500.0,Theft,Paper/Films,No
4,OH,Healthcare Provider,2716.0,Hacking/IT Incident,"Laptop, Network Server",No


# Handling Null Values

In [2]:
# Count the missing values in every column. Per the output below, each
# feature except Business_Associate_Present has at least one null.
original_df.isna().sum()

State                            16
Cov_Ent_Type                   1087
Individuals_Affected             23
Type_of_Breach                    9
Loc_of_Breached_Information       6
Business_Associate_Present        0
dtype: int64

In [3]:
# Show each column's dtype; the imputation strategy chosen per column below
# ('most_frequent' vs 'mean') follows the object/float split seen here.
original_df.dtypes

State                           object
Cov_Ent_Type                    object
Individuals_Affected           float64
Type_of_Breach                  object
Loc_of_Breached_Information     object
Business_Associate_Present      object
dtype: object

In [4]:
# Replace missing State entries with the column's most frequent value.
# The fitted imputer remains bound to `imputer_state`.
state_column = original_df[['State']]
imputer_state = SimpleImputer(missing_values=np.nan, strategy='most_frequent').fit(state_column)
original_df['State'] = imputer_state.transform(state_column)
# Sanity check: the number of remaining nulls should be 0.
original_df['State'].isna().sum()

0

In [5]:
# Replace missing Cov_Ent_Type entries with the column's most frequent value.
# The fitted imputer remains bound to `imputer_covered_entity`.
imputer_covered_entity = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
covered_entity_column = original_df[['Cov_Ent_Type']]
imputer_covered_entity.fit(covered_entity_column)
original_df['Cov_Ent_Type'] = imputer_covered_entity.transform(covered_entity_column)
# Sanity check: no nulls should remain in the covered-entity column.
original_df['Cov_Ent_Type'].isna().sum()

0

In [6]:
# Replace missing Individuals_Affected values with the column mean.
# NOTE(review): the mean is sensitive to extreme values; if this count
# column is heavily skewed, strategy='median' may be a better fit - confirm.
imputer_individuals_affected = SimpleImputer(missing_values=np.nan, strategy='mean')
affected_counts = original_df[['Individuals_Affected']]
imputer_individuals_affected.fit(affected_counts)
original_df['Individuals_Affected'] = imputer_individuals_affected.transform(affected_counts)
# Sanity check only: the number of remaining nulls should be 0.
original_df['Individuals_Affected'].isna().sum()

0

In [7]:
# Replace missing Type_of_Breach entries with the column's most frequent value.
# The fitted imputer remains bound to `imputer_type_of_breach`.
breach_type_column = original_df[['Type_of_Breach']]
imputer_type_of_breach = SimpleImputer(missing_values=np.nan, strategy='most_frequent').fit(breach_type_column)
original_df['Type_of_Breach'] = imputer_type_of_breach.transform(breach_type_column)
# Sanity check: every missing value should now be filled.
original_df['Type_of_Breach'].isna().sum()

0

In [8]:
# Replace missing Loc_of_Breached_Information entries with the column's most
# frequent value. The fitted imputer remains bound to
# `imputer_location_of_breached_info`.
imputer_location_of_breached_info = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
breach_location_column = original_df[['Loc_of_Breached_Information']]
imputer_location_of_breached_info.fit(breach_location_column)
original_df['Loc_of_Breached_Information'] = imputer_location_of_breached_info.transform(breach_location_column)
# Sanity check: the location feature should have no nulls left.
original_df['Loc_of_Breached_Information'].isna().sum()

0

In [9]:
# Final check across the whole frame: every column should report 0 nulls.
original_df.isna().sum()

State                          0
Cov_Ent_Type                   0
Individuals_Affected           0
Type_of_Breach                 0
Loc_of_Breached_Information    0
Business_Associate_Present     0
dtype: int64

# Standardizing the Numerical Columns
- The code below standardizes the `Individuals_Affected` feature so that its values have a mean of 0 and a standard deviation of 1.
- This will aid our model when we input these features into the SVM model for hypothesis testing and evaluation.

In [10]:
# Fit a StandardScaler on Individuals_Affected and transform that same column.
# `std` keeps the fitted scaler; `X` holds the scaled values as a separate
# array - this cell does not write them back into original_df.
individuals_affected = original_df[['Individuals_Affected']]
std = StandardScaler().fit(individuals_affected)
X = std.transform(individuals_affected)

## Handling Categorical Variables
- All remaining (non-numeric) features in this dataset are treated as `nominal categorical variables`, since their values have no inherent order.

In [11]:
# One-hot encode the categorical columns, dropping the first level of each
# (drop_first=True), and rebind `original_df` to the encoded frame.
# Individuals_Affected is not selected here, so it is absent from the result.
# NOTE(review): because `original_df` is overwritten, re-running this cell
# raises a KeyError (the source columns no longer exist); consider binding
# the result to a new name.
# NOTE(review): Loc_of_Breached_Information holds comma-separated values, so
# each distinct combination becomes its own dummy column - confirm this is
# intended rather than one indicator per individual location.
categorical_columns = [
    'State',
    'Cov_Ent_Type',
    'Type_of_Breach',
    'Loc_of_Breached_Information',
    'Business_Associate_Present',
]
original_df = pd.get_dummies(original_df[categorical_columns], drop_first=True)

In [12]:
# Pairwise correlation between the columns of `original_df`, which at this
# point is the one-hot encoded frame produced by the get_dummies cell above.
original_df.corr()

Unnamed: 0,State_AL,State_AR,State_AZ,State_CA,State_CO,State_CT,State_DC,State_DE,State_FL,State_GA,...,"Loc_of_Breached_Information_Other Portable Electronic Device, Other","Loc_of_Breached_Information_Other Portable Electronic Device, Other, Electronic Medical Record","Loc_of_Breached_Information_Other Portable Electronic Device, Paper/Films","Loc_of_Breached_Information_Other, Electronic Medical Record","Loc_of_Breached_Information_Other, Other Portable Electronic Device","Loc_of_Breached_Information_Other, Paper","Loc_of_Breached_Information_Other, Paper/Films",Loc_of_Breached_Information_Paper,Loc_of_Breached_Information_Paper/Films,Business_Associate_Present_Yes
State_AL,1.000000,-0.012385,-0.014619,-0.036172,-0.014619,-0.014141,-0.007835,-0.007485,-0.027569,-0.018417,...,0.008558,-0.002305,-0.004613,-0.002823,0.010516,-0.003260,-0.005883,0.005115,0.017057,0.003924
State_AR,-0.012385,1.000000,-0.016295,-0.040319,-0.016295,-0.015762,-0.008733,-0.008343,-0.030729,-0.020528,...,0.005131,-0.002569,-0.005141,-0.003147,0.007043,-0.003634,-0.006558,-0.000828,0.000608,-0.009382
State_AZ,-0.014619,-0.016295,1.000000,-0.047593,-0.019235,-0.018605,-0.010309,-0.009848,-0.036273,-0.024232,...,0.031400,-0.003032,-0.006069,-0.003714,0.035207,-0.004289,-0.007741,0.013302,0.029227,-0.042233
State_CA,-0.036172,-0.040319,-0.047593,1.000000,-0.047593,-0.046034,-0.025507,-0.024367,-0.089749,-0.059956,...,0.002898,-0.007503,-0.015016,-0.009190,0.007743,-0.010613,-0.019153,0.010601,0.003785,-0.032545
State_CO,-0.014619,-0.016295,-0.019235,-0.047593,1.000000,-0.018605,-0.010309,-0.009848,-0.036273,-0.024232,...,-0.015706,-0.003032,-0.006069,-0.003714,-0.014779,-0.004289,-0.007741,0.005550,-0.008439,-0.013654
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Loc_of_Breached_Information_Other, Paper",-0.003260,-0.003634,-0.004289,-0.010613,-0.004289,-0.004149,-0.002299,-0.002196,0.023496,-0.005404,...,-0.003502,-0.000676,-0.001353,-0.000828,-0.003296,1.000000,-0.001726,-0.007406,-0.011481,-0.017611
"Loc_of_Breached_Information_Other, Paper/Films",-0.005883,-0.006558,-0.007741,-0.019153,-0.007741,-0.007487,-0.004149,-0.003963,0.020480,-0.009752,...,-0.006321,-0.001220,-0.002442,-0.001495,-0.005948,-0.001726,1.000000,-0.013365,-0.020720,-0.011814
Loc_of_Breached_Information_Paper,0.005115,-0.000828,0.013302,0.010601,0.005550,-0.008108,0.025012,-0.017003,0.010630,-0.029394,...,-0.027116,-0.005235,-0.010478,-0.006413,-0.025517,-0.007406,-0.013365,1.000000,-0.088891,0.015739
Loc_of_Breached_Information_Paper/Films,0.017057,0.000608,0.029227,0.003785,-0.008439,-0.027575,0.021933,-0.005632,0.022554,0.012876,...,-0.042038,-0.008116,-0.016244,-0.009942,-0.039559,-0.011481,-0.020720,-0.088891,1.000000,0.015076
