In [1]:
import json
from math import log
from random import Random, randint

import pandas as pd

In [2]:
# Load the breach records, parse the Date column and turn the
# comma-separated data_classes string into a list per row.
Df = pd.read_csv("./databreach.csv")
Df = Df.assign(
    Date=pd.to_datetime(Df["Date"]),
    data_classes=Df["data_classes"].map(lambda classes: classes.split(",")),
)

In [3]:
# Preview the first rows of the loaded breach table.
Df.head()

Unnamed: 0,Date,Org,category,breach_count(Million),employee_count,data_classes
0,2017-10-01,Yahoo,web services,3000.0,8600.0,[Security questions and answers]
1,2018-03-01,UIDAI,public welfare,1100.0,,"[Names, Aadhar number, Bank account numbers]"
2,2014-01-01,Yahoo,web services,500.0,8600.0,"[Names, Email addresses, Phone numbers, Passwo..."
3,2013-06-01,MySpace,social media,360.0,150.0,"[Names, Usernames, Dates of birth]"
4,2018-06-01,Exactis,web services,340.0,,"[Phone numbers, Physical addresses, Email addr..."


### group by year

In [4]:
# Number of breaches per calendar year. The year is derived on a temporary
# copy, so Df itself ends up without a "year" column.
Df_year = (
    Df.assign(year=Df["Date"].dt.year)
    .groupby("year")["Date"]
    .count()
    .to_frame()
)


In [5]:
# Preview the per-year breach counts (index: year, column "Date": number of rows).
Df_year.head()

Unnamed: 0_level_0,Date
year,Unnamed: 1_level_1
2012,1
2013,3
2014,2
2016,3
2017,2


In [6]:
# Export the per-year counts for the bar chart.
Df_year.to_csv(path_or_buf="./final/year_bar_chart.csv")

### generate mock data for missing employee counts

In [7]:
# Organisations with an unknown head-count get a mock value so the scatter
# plot has something to draw. The generator is seeded, so re-running the
# notebook exports the same numbers every time.
MOCK_EMPLOYEE_SEED = 42
MOCK_EMPLOYEE_MIN = 0
MOCK_EMPLOYEE_MAX = 1000  # inclusive, as with randint

_mock_employee_rng = Random(MOCK_EMPLOYEE_SEED)


def _employee_count_or_mock(raw_count):
    """Return ``raw_count`` as an int, or a seeded mock value when it cannot be converted.

    Only conversion failures are handled (NaN -> ValueError, None/NA -> TypeError,
    inf -> OverflowError); any other exception propagates instead of being
    silently replaced by random data.
    """
    try:
        return int(raw_count)
    except (TypeError, ValueError, OverflowError):
        return _mock_employee_rng.randint(MOCK_EMPLOYEE_MIN, MOCK_EMPLOYEE_MAX)


Df["employee_count"] = [_employee_count_or_mock(raw) for raw in Df["employee_count"]]

### scatter plot and exposure index

In [8]:
# Sanity check: `log` imported from `math` is the natural logarithm (ln 3 ≈ 1.0986).
log(3)

1.0986122886681098

In [9]:
# Exposure index = ln(breach count in millions × number of data classes leaked).
# The two columns are paired positionally with zip(). Looking rows up with the
# enumerate() counter (`Df[col][num]`) treats a position as an index label and
# is only correct while Df keeps its default 0..n-1 index.
# math.log raises ValueError if the product is zero or negative.
Df['exposure_index'] = [
    log(breach_millions * len(classes))
    for breach_millions, classes in zip(Df["breach_count(Million)"], Df["data_classes"])
]
# Columns exported for the scatter plot.
Df_scatter = Df[["breach_count(Million)","exposure_index", "employee_count", "category", "Org","Date"]]


In [10]:
# Preview the scatter-plot table before it is exported.
Df_scatter.head()

Unnamed: 0,breach_count(Million),exposure_index,employee_count,category,Org,Date
0,3000.0,8.006368,8600,web services,Yahoo,2017-10-01
1,1100.0,8.101678,722,public welfare,UIDAI,2018-03-01
2,500.0,8.006368,8600,web services,Yahoo,2014-01-01
3,360.0,6.984716,150,social media,MySpace,2013-06-01
4,340.0,7.620705,19,web services,Exactis,2018-06-01


In [11]:
# Export the scatter-plot table.
Df_scatter.to_csv(path_or_buf="./final/scatter_data.csv")

### network graph


In [12]:
# Import the data-class -> category lookup; Data_category is a
# comma-separated string that becomes a list per row.
Df_cats = pd.read_csv("./Classes_to_Cats.csv")
Df_cats["Data_category"] = Df_cats["Data_category"].map(
    lambda categories: categories.split(",")
)
Df_cats.head()

Unnamed: 0,Data_classes,Data_category
0,Email addresses,[PII]
1,Passwords,[passwords]
2,Usernames,[PII]
3,IP addresses,[device info]
4,Names,[PII]


In [13]:
# unique cats
unique_cats = {}
for i in Df_cats["Data_category"]:
    for j in i:
        if j not in unique_cats:
            unique_cats[j] = 0
unique_cats = list(unique_cats.keys())
unique_cats

['PII',
 'passwords',
 'device info',
 'age',
 'geolocation',
 'demographics',
 'consumer behavior',
 'personal communication',
 'employment history',
 'financial info',
 'health',
 'social',
 'password']

In [14]:
# Map every data class to its first-listed category.
# Columns are addressed by name: integer keys on a label-indexed row Series
# (`row[0]`, `row[1]`) rely on a positional fallback deprecated in pandas 2.1.
# If a data class appears more than once, the last row wins.
class_to_cat = {
    data_class: categories[0]
    for data_class, categories in zip(Df_cats["Data_classes"], Df_cats["Data_category"])
}

In [15]:
# data_cats exposed
cats_exposed = []
for i in Df.data_classes:
    temp = []
    for j in i:
        if class_to_cat[j] not in temp:
            temp.append(class_to_cat[j])
    cats_exposed.append(temp)
Df["data_cats"] = cats_exposed
Df.head()

Unnamed: 0,Date,Org,category,breach_count(Million),employee_count,data_classes,exposure_index,data_cats
0,2017-10-01,Yahoo,web services,3000.0,8600,[Security questions and answers],8.006368,[passwords]
1,2018-03-01,UIDAI,public welfare,1100.0,722,"[Names, Aadhar number, Bank account numbers]",8.101678,"[PII, financial info]"
2,2014-01-01,Yahoo,web services,500.0,8600,"[Names, Email addresses, Phone numbers, Passwo...",8.006368,"[PII, passwords, age]"
3,2013-06-01,MySpace,social media,360.0,150,"[Names, Usernames, Dates of birth]",6.984716,"[PII, age]"
4,2018-06-01,Exactis,web services,340.0,19,"[Phone numbers, Physical addresses, Email addr...",7.620705,"[PII, geolocation, consumer behavior, demograp..."


In [16]:
# Element list (edges and nodes) for the category network graph; it is later
# serialised to ./final/network_data.json.
final_network_data = []

In [17]:
# Build the network-graph elements from category co-occurrence.
# The list is reset here so that re-running this cell does not append a
# second copy of every edge and node.
final_network_data = []
for source in unique_cats:
    # Breaches that expose the source category (independent of the target).
    breaches_with_source = [cats for cats in Df.data_cats if source in cats]
    # One directed edge per (other category, breach) pair in which both
    # categories were exposed together.
    edges = [
        {"data": {'source': source.lower(), 'target': target.lower()}}
        for target in unique_cats
        if target != source
        for cats in breaches_with_source
        if target in cats
    ]
    final_network_data.extend(edges)
    # A node is emitted, after its edges, only for categories with at least one edge.
    if edges:
        final_network_data.append({'data': {'id': source.lower(), 'label': source}})

In [18]:
# Serialise the network elements (4-space indent) and write them to disk.
json_object = json.dumps(final_network_data, indent=4)
with open("./final/network_data.json", mode="w") as network_file:
    network_file.write(json_object)

### network graph cleaned cats

In [19]:
# Load the revised data-class -> cleaned-label lookup, keeping only the two
# columns used below (in this order).
Df_cleaned = pd.read_csv("./Classes_to_Cats_revised.csv")
Df_cleaned = Df_cleaned[["Data_classes", "Data_classes_shorterned_cleaned"]]
# Some cleaned labels carry stray trailing spaces in the CSV (e.g. "aadhar ").
# Strip them once here so the whitespace does not leak into the graph node ids
# and the same label cannot produce two distinct nodes.
Df_cleaned = Df_cleaned.assign(
    Data_classes_shorterned_cleaned=Df_cleaned["Data_classes_shorterned_cleaned"].str.strip()
)

In [20]:
# Show the data-class -> cleaned-label lookup table (the rendered output is truncated).
Df_cleaned

Unnamed: 0,Data_classes,Data_classes_shorterned_cleaned
0,Aadhar number,aadhar
1,Account balances,account balances
2,Physical addresses,address
3,Dates of birth,age
4,Partial dates of birth,age
...,...,...
129,User statuses,profile data
130,User website URLs,profile data
131,Usernames,usernames
132,Utility bills,bills


In [21]:
# Distinct cleaned labels in first-seen order.
unique_cleaned_cats = list(
    dict.fromkeys(Df_cleaned["Data_classes_shorterned_cleaned"])
)

In [22]:
# Map every data class to its cleaned label.
# Columns are addressed by name: integer keys on a label-indexed row Series
# (`row[0]`, `row[1]`) rely on a positional fallback deprecated in pandas 2.1.
# If a data class appears more than once, the last row wins.
class_to_clean = dict(
    zip(Df_cleaned["Data_classes"], Df_cleaned["Data_classes_shorterned_cleaned"])
)

In [23]:
# Cleaned labels exposed per breach: translate each data class through
# class_to_clean and keep every label once, in first-seen order.
clean_exposed = [
    list(dict.fromkeys(class_to_clean[data_class] for data_class in breach_classes))
    for breach_classes in Df.data_classes
]
Df["data_clean"] = clean_exposed
Df.head()

Unnamed: 0,Date,Org,category,breach_count(Million),employee_count,data_classes,exposure_index,data_cats,data_clean
0,2017-10-01,Yahoo,web services,3000.0,8600,[Security questions and answers],8.006368,[passwords],[passwords]
1,2018-03-01,UIDAI,public welfare,1100.0,722,"[Names, Aadhar number, Bank account numbers]",8.101678,"[PII, financial info]","[names, aadhar , bank account ]"
2,2014-01-01,Yahoo,web services,500.0,8600,"[Names, Email addresses, Phone numbers, Passwo...",8.006368,"[PII, passwords, age]","[names, email addresses, phone, passwords, age]"
3,2013-06-01,MySpace,social media,360.0,150,"[Names, Usernames, Dates of birth]",6.984716,"[PII, age]","[names, usernames, age]"
4,2018-06-01,Exactis,web services,340.0,19,"[Phone numbers, Physical addresses, Email addr...",7.620705,"[PII, geolocation, consumer behavior, demograp...","[phone, address, email addresses, interests, f..."


In [24]:
# Element list (edges and nodes) for the cleaned-label network graph; it is
# later serialised to ./final/network_clean_data.json.
final_clean_data = []

In [25]:
# Build the network-graph elements from cleaned-label co-occurrence.
# The list is reset here so that re-running this cell does not append a
# second copy of every edge and node.
final_clean_data = []
for source in unique_cleaned_cats:
    # Breaches that expose the source label (independent of the target).
    breaches_with_source = [labels for labels in Df.data_clean if source in labels]
    # One directed edge per (other label, breach) pair in which both labels
    # were exposed together.
    edges = [
        {"data": {'source': source.lower(), 'target': target.lower()}}
        for target in unique_cleaned_cats
        if target != source
        for labels in breaches_with_source
        if target in labels
    ]
    final_clean_data.extend(edges)
    # A node is emitted, after its edges, only for labels with at least one edge.
    if edges:
        final_clean_data.append({'data': {'id': source.lower(), 'label': source}})

In [26]:
# Serialise the cleaned-label network elements (4-space indent) and write them to disk.
json_object = json.dumps(final_clean_data, indent=4)
with open("./final/network_clean_data.json", mode="w") as clean_network_file:
    clean_network_file.write(json_object)