In [1]:
import pandas as pd
from random import randint
from math import log
import json

In [2]:
Df = pd.read_csv("./databreach.csv")
Df["Date"] = pd.to_datetime(Df["Date"])
Df["data_classes"] = [ i.split(",") for i in Df["data_classes"] ]

In [3]:
Df.head()

Unnamed: 0,Date,Org,category,breach_count(Million),employee_count,data_classes
0,2017-10-01,Yahoo,web services,3000.0,8600,[Security questions and answers]
1,2018-03-01,UIDAI,public welfare,1100.0,383,"[Names, Aadhar number, Bank account numbers]"
2,2014-12-01,Yahoo,web services,500.0,8600,"[Names, Email addresses, Phone numbers, Passwo..."
3,2013-06-01,MySpace,social media,360.0,150,"[Names, Usernames, Dates of birth]"
4,2018-06-01,Exactis,web services,340.0,22,"[Phone numbers, Physical addresses, Email addr..."


## Bar  graph of Data-Breaches over Time:


In [4]:
Df["year"] = Df.Date.dt.year
Df_year = pd.DataFrame(Df.groupby("year")['breach_count(Million)'].agg(["count", "sum"]))

In [5]:
info_lost = {}
for i in Df["year"]:
    info_lost[i] = 0
for num, i in enumerate(Df["year"]):
    info_lost[i] += Df["breach_count(Million)"][num]*1e6*len( Df["data_classes"] )
temp = []
for i in range(2012,2022):
    try:
#         print(i)
#         print(info_lost[i])
        temp.append(info_lost[i])
    except:
        pass
Df_year["info_lost"] = temp
Df_year.columns = ["Number of Data Breaches", "Individuals impacted (Millions)",
                          "Cumulative Exposure Index"]

In [6]:
Df_year

Unnamed: 0_level_0,Number of Data Breaches,Individuals impacted (Millions),Cumulative Exposure Index
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012,1,69.0,2277000000.0
2013,4,878.0,28974000000.0
2014,2,645.0,21285000000.0
2015,1,0.15,4950000.0
2016,3,180.2,5946600000.0
2017,2,3148.0,103884000000.0
2018,6,2146.0,70818000000.0
2019,3,240.5,7936500000.0
2020,4,40.8,1346400000.0
2021,7,234.2015,7728650000.0


In [7]:
Df_year.to_csv("./final/year_bar_chart.csv")

### generate mock data for missing employee counts (No longer used)

In [8]:
temp = []
for i in Df["employee_count"]:
    try:
        temp.append(int(i))
    except:
        temp.append(randint(0,1000))
Df["employee_count"] = temp

### scatter plot and exposure index

In [9]:
Df['exposure_index'] = [
                        log((Df["breach_count(Million)"][num])*len(i))
                        for num, i in enumerate(Df["data_classes"])
                        ]
Df_scatter = Df[["breach_count(Million)","exposure_index", "employee_count", "category", "Org","Date", "data_classes"]]
Df_scatter["exposure_index"] -= min(Df_scatter["exposure_index"])
Df_scatter.columns = ["breach_count(Million)","Relative Exposure Index", "employee_count", "category", "Org","Date", "data_classes"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
Df_scatter.head()

Unnamed: 0,breach_count(Million),Relative Exposure Index,employee_count,category,Org,Date,data_classes
0,3000.0,12.89922,8600,web services,Yahoo,2017-10-01,[Security questions and answers]
1,1100.0,12.99453,383,public welfare,UIDAI,2018-03-01,"[Names, Aadhar number, Bank account numbers]"
2,500.0,12.89922,8600,web services,Yahoo,2014-12-01,"[Names, Email addresses, Phone numbers, Passwo..."
3,360.0,11.877569,150,social media,MySpace,2013-06-01,"[Names, Usernames, Dates of birth]"
4,340.0,12.513557,22,web services,Exactis,2018-06-01,"[Phone numbers, Physical addresses, Email addr..."


In [11]:
Df_scatter.to_csv("./final/scatter_data.csv")

### network graph


In [12]:
#  importing categories
Df_cats = pd.read_csv("./Classes_to_Cats.csv")
Df_cats["Data_category"] = [i.split(",") for i in Df_cats["Data_category"]]
Df_cats.head()

Unnamed: 0,Data_classes,Data_category
0,Email addresses,[PII]
1,Passwords,[passwords]
2,Usernames,[PII]
3,IP addresses,[device info]
4,Names,[PII]


In [13]:
# unique cats
unique_cats = {}
for i in Df_cats["Data_category"]:
    for j in i:
        if j not in unique_cats:
            unique_cats[j] = 0
unique_cats = list(unique_cats.keys())
unique_cats

['PII',
 'passwords',
 'device info',
 'demographics',
 'geolocation',
 'consumer behavior',
 'personal communication',
 'employment history',
 'financial info',
 'health',
 'social',
 'password']

In [14]:
class_to_cat = {}
for i in Df_cats.iterrows():
    class_to_cat[i[1][0]] = i[1][1][0]

In [15]:
# data_cats exposed
cats_exposed = []
for i in Df.data_classes:
    temp = []
    for j in i:
        if class_to_cat[j] not in temp:
            temp.append(class_to_cat[j])
    cats_exposed.append(temp)
Df["data_cats"] = cats_exposed
Df.head()

Unnamed: 0,Date,Org,category,breach_count(Million),employee_count,data_classes,year,exposure_index,data_cats
0,2017-10-01,Yahoo,web services,3000.0,8600,[Security questions and answers],2017,8.006368,[passwords]
1,2018-03-01,UIDAI,public welfare,1100.0,383,"[Names, Aadhar number, Bank account numbers]",2018,8.101678,"[PII, financial info]"
2,2014-12-01,Yahoo,web services,500.0,8600,"[Names, Email addresses, Phone numbers, Passwo...",2014,8.006368,"[PII, passwords, demographics]"
3,2013-06-01,MySpace,social media,360.0,150,"[Names, Usernames, Dates of birth]",2013,6.984716,"[PII, demographics]"
4,2018-06-01,Exactis,web services,340.0,22,"[Phone numbers, Physical addresses, Email addr...",2018,7.620705,"[PII, geolocation, consumer behavior, demograp..."


In [16]:
final_network_data = []

In [17]:
for i in unique_cats:
    count = 0
    for j in unique_cats:
        if i != j:
            for point in Df.data_cats:
                if (i in point) and (j in point):
                    final_network_data.append({
                                    "data":{
                                            'source': i.lower(),
                                            'target': j.lower()
                                            }
                                })
                    count += 1
    if count:
        final_network_data.append({
                            'data':{
                                    'id': i.lower(),
                                    'label':i
                                    }
                            })

In [18]:
final_network_data

[{'data': {'source': 'pii', 'target': 'passwords'}},
 {'data': {'source': 'pii', 'target': 'passwords'}},
 {'data': {'source': 'pii', 'target': 'passwords'}},
 {'data': {'source': 'pii', 'target': 'passwords'}},
 {'data': {'source': 'pii', 'target': 'passwords'}},
 {'data': {'source': 'pii', 'target': 'passwords'}},
 {'data': {'source': 'pii', 'target': 'passwords'}},
 {'data': {'source': 'pii', 'target': 'passwords'}},
 {'data': {'source': 'pii', 'target': 'passwords'}},
 {'data': {'source': 'pii', 'target': 'passwords'}},
 {'data': {'source': 'pii', 'target': 'passwords'}},
 {'data': {'source': 'pii', 'target': 'passwords'}},
 {'data': {'source': 'pii', 'target': 'passwords'}},
 {'data': {'source': 'pii', 'target': 'passwords'}},
 {'data': {'source': 'pii', 'target': 'device info'}},
 {'data': {'source': 'pii', 'target': 'device info'}},
 {'data': {'source': 'pii', 'target': 'demographics'}},
 {'data': {'source': 'pii', 'target': 'demographics'}},
 {'data': {'source': 'pii', 'target'

In [19]:
# saving json
json_object = json.dumps(final_network_data, indent = 4)
with open("./final/network_data.json", "w") as outfile:
    outfile.write(json_object)

### network graph cleaned cats

In [20]:
Df_cleaned = pd.read_csv("./Classes_to_Cats_revised.csv")
Df_cleaned = Df_cleaned[["Data_classes", "Data_classes_shorterned_cleaned"]]

In [21]:
Df_cleaned

Unnamed: 0,Data_classes,Data_classes_shorterned_cleaned
0,Aadhar number,aadhar
1,Account balances,account balances
2,Physical addresses,address
3,Dates of birth,age
4,Partial dates of birth,age
...,...,...
129,User statuses,profile data
130,User website URLs,profile data
131,Usernames,usernames
132,Utility bills,bills


In [22]:
unique_cleaned_cats = {}
for i in Df_cleaned["Data_classes_shorterned_cleaned"]:
    if i not in unique_cleaned_cats:
        unique_cleaned_cats[i] = 0
unique_cleaned_cats = list(unique_cleaned_cats.keys())

In [23]:
class_to_clean = {}
for i in Df_cleaned.iterrows():
    class_to_clean[i[1][0]] = i[1][1]

In [24]:
clean_exposed = []
for i in Df.data_classes:
    temp = []
    for j in i:
        if class_to_clean[j] not in temp:
            temp.append(class_to_clean[j])
    clean_exposed.append(temp)
Df["data_clean"] = clean_exposed
Df.head()

Unnamed: 0,Date,Org,category,breach_count(Million),employee_count,data_classes,year,exposure_index,data_cats,data_clean
0,2017-10-01,Yahoo,web services,3000.0,8600,[Security questions and answers],2017,8.006368,[passwords],[passwords]
1,2018-03-01,UIDAI,public welfare,1100.0,383,"[Names, Aadhar number, Bank account numbers]",2018,8.101678,"[PII, financial info]","[names, aadhar , bank account ]"
2,2014-12-01,Yahoo,web services,500.0,8600,"[Names, Email addresses, Phone numbers, Passwo...",2014,8.006368,"[PII, passwords, demographics]","[names, email addresses, phone, passwords, age]"
3,2013-06-01,MySpace,social media,360.0,150,"[Names, Usernames, Dates of birth]",2013,6.984716,"[PII, demographics]","[names, usernames, age]"
4,2018-06-01,Exactis,web services,340.0,22,"[Phone numbers, Physical addresses, Email addr...",2018,7.620705,"[PII, geolocation, consumer behavior, demograp...","[phone, address, email addresses, interests, f..."


In [25]:
final_clean_data = {"nodes":[],"edges":[]}
already_connected = { i:[] for i in unique_cleaned_cats }

In [26]:
for i in unique_cleaned_cats:
    count = 0
    for j in unique_cleaned_cats:
        if i != j:
            for point in Df.data_clean:
                if (i in point) and (j in point):
                    if (j not in already_connected[i]):
                        final_clean_data["edges"].append({
                                        "id":i.lower()+"-"+j.lower(),
                                        "from":i.lower(),
                                        "to":j.lower()
                                    })
                        already_connected[i].append(j)
                        already_connected[j].append(i)
                    count += 1
    if count:
        final_clean_data["nodes"].append({
                            'id':i.lower(),
                            'label': i
                            })

In [27]:
final_clean_data

{'nodes': [{'id': 'aadhar ', 'label': 'aadhar '},
  {'id': 'account balances', 'label': 'account balances'},
  {'id': 'address', 'label': 'address'},
  {'id': 'age', 'label': 'age'},
  {'id': 'bank account ', 'label': 'bank account '},
  {'id': 'buying preferences', 'label': 'buying preferences'},
  {'id': 'credit card ', 'label': 'credit card '},
  {'id': 'date joined', 'label': 'date joined'},
  {'id': 'debit card ', 'label': 'debit card '},
  {'id': 'device info', 'label': 'device info'},
  {'id': 'passwords', 'label': 'passwords'},
  {'id': 'email addresses', 'label': 'email addresses'},
  {'id': 'family data', 'label': 'family data'},
  {'id': 'financial transactions', 'label': 'financial transactions'},
  {'id': 'gender', 'label': 'gender'},
  {'id': 'health data', 'label': 'health data'},
  {'id': 'identity card', 'label': 'identity card'},
  {'id': 'interests', 'label': 'interests'},
  {'id': 'job profile', 'label': 'job profile'},
  {'id': 'last login date', 'label': 'last log

In [28]:
json_object = json.dumps(final_clean_data, indent = 4)
with open("./final/network_clean_data.json", "w") as outfile:
    outfile.write(json_object)

## Pie and bubble plot across Organisation Categories

In [29]:
Df_orgcat = Df.groupby(["category"]).agg(["count","sum"])["breach_count(Million)"]
Df_orgcat.columns = ["Number of Breaches","Victim Count (Million)"]
Df_orgcat["Bubble Size"] = Df_orgcat["Number of Breaches"] + 4
Df_orgcat

Unnamed: 0_level_0,Number of Breaches,Victim Count (Million),Bubble Size
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
airlines,1,4.5,5
e commerce,7,516.0,11
education,1,20.0,5
entertainment,2,85.3,6
health,2,276.0,6
public welfare,3,1100.5015,7
social media,4,856.0,8
web services,13,4723.55,17


In [30]:
Df_orgcat.to_csv("./final/Org_cats.csv")