In [214]:
%%time 

from py2neo import Graph, Node, Relationship
import pandas as pd

# Handle to the graph database served at localhost:7474
g = Graph("http://localhost:7474/db/data/", user="neo4j", password="test")

# Load the microsoft-malware-prediction training data; only the first
# 40000 rows of the file are read
df = pd.read_csv("microsoft-malware-prediction/train.csv", nrows=40000)

## Alternative input: use this smaller file if you pulled the repo and do not
## want to download the large dataset from kaggle
# df = pd.read_csv("train_2000.csv")

# Replace every missing value with the literal string "NA"
df = df.fillna(value="NA")

# Derive one OS key per row as "<Platform>_<OsVer>_<OsBuild>"
_platform, _os_ver, _os_build = (
    df[column].map(str) for column in ("Platform", "OsVer", "OsBuild")
)
df["OSIdentifier"] = _platform + "_" + _os_ver + "_" + _os_build

# Build the main machine table and write it to machine.csv; the smaller
# tables derived from the same frame are used to model relationships
## columns kept for the machine table/csv
_machine_columns = [
    "MachineIdentifier",
    "Processor",
    "Census_ProcessorManufacturerIdentifier",
    "Census_ProcessorModelIdentifier",
    "Census_MDC2FormFactor",
    "HasDetections",
    "Firewall",
    "OSIdentifier",
    "CountryIdentifier",
    "OrganizationIdentifier",
]
machine_df = df.loc[:, _machine_columns]
machine_df.to_csv("machine.csv", index=False)

# country table/csv: one row per distinct CountryIdentifier value,
# in order of first appearance in df
country_df = pd.DataFrame({"CountryIdentifier": df["CountryIdentifier"].unique()})
country_df.to_csv("country.csv", index=False)

# organization table/csv: one row per distinct OrganizationIdentifier value,
# in order of first appearance in df
org_df = pd.DataFrame(
    {"OrganizationIdentifier": df["OrganizationIdentifier"].unique()}
)
org_df.to_csv("organization.csv", index=False)

# create os table/csv: one row per distinct OSIdentifier, carrying the first
# non-null Platform / OsVer / OsBuild value seen for that identifier.
# Grouping df directly replaces the earlier "unique ids -> left merge back onto
# df -> groupby" chain: every id matched, so the merge only re-created all rows
# of df before grouping. groupby sorts the output rows by OSIdentifier.
os_df = (
    df[["Platform", "OsVer", "OsBuild", "OSIdentifier"]]
    .groupby("OSIdentifier")
    .first()
    .reset_index()
)
os_df.to_csv("os.csv", index=False)

# firewall table/csv: one row per distinct Firewall value,
# in order of first appearance in df
firewall_df = pd.DataFrame({"Firewall": df["Firewall"].unique()})
firewall_df.to_csv("firewall.csv", index=False)

# machine type table/csv: one row per distinct Census_MDC2FormFactor value,
# stored under the column name "machine_type"
machine_type_df = pd.DataFrame(
    {"machine_type": df["Census_MDC2FormFactor"].unique()}
)
machine_type_df.to_csv("machine_type.csv", index=False)

# MalwareDetected table/csv: one row per distinct HasDetections value,
# in order of first appearance in df
malware_df = pd.DataFrame({"HasDetections": df["HasDetections"].unique()})
malware_df.to_csv("malware.csv", index=False)

# output the sampled 40k dataset
# df.to_csv("train_40000.csv", index=False)

Wall time: 6.42 s
