# VPC Flow Analysis

## Dependencies

In [161]:
import csv
import json
from os import path
import pandas as pd
import requests

# AWS IP Address Ranges

AWS IP Address ranges are publicly available at this address: https://ip-ranges.amazonaws.com/ip-ranges.json

These ranges change over time. AWS publishes notifications whenever they change, which can be used to keep a local copy up to date. See more here: https://docs.aws.amazon.com/general/latest/gr/aws-ip-ranges.html#subscribe-notifications

The AWS IP ranges are downloaded only when no local copy (`aws_ip_ranges.json`) exists next to this notebook; otherwise they are read from that file.

In [154]:
def loadAwsIpRanges(json_path="aws_ip_ranges.json", url="https://ip-ranges.amazonaws.com/ip-ranges.json"):
    """Return the AWS IP ranges document as parsed JSON.

    The document is read from ``json_path`` when that file exists. Otherwise
    it is downloaded from ``url``, decoded, written to ``json_path`` so later
    runs can reuse it, and returned.

    Parameters
    ----------
    json_path : str
        Location of the local cache file.
    url : str
        Address the document is downloaded from when no cache file exists.

    Returns
    -------
    The decoded JSON document (the same shape on both the cached and the
    downloaded path).

    Raises
    ------
    requests.HTTPError
        If the download answers with an HTTP error status.
    """
    if path.exists(json_path):
        with open(json_path, "r") as fp:
            return json.load(fp)

    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of caching or returning an error page.
    response.raise_for_status()
    # Decode the body so callers get the same structure as on the cached path.
    awsIpRanges = response.json()
    # Persist the download; without this the cache branch above is never taken.
    with open(json_path, "w") as fp:
        json.dump(awsIpRanges, fp)
    return awsIpRanges

In [288]:
def extractCidrParts(df):
    """Split each ``ip_prefix`` CIDR string into its two parts.

    Adds two columns to ``df`` itself (the frame is modified in place):
    ``networkAddress`` holds the text before the ``/`` and
    ``subnetMaskLength`` holds the text after it, converted to ``int``.

    Returns the same ``df`` object so the function can be used with
    ``DataFrame.pipe``.
    """
    splitPrefixes = df["ip_prefix"].str.split("/")
    df["networkAddress"] = splitPrefixes.str[0]
    df["subnetMaskLength"] = splitPrefixes.str[1].astype(int)
    return df

In [366]:
def loadVpcFlowLogs(csv_path="vpc_flow_logs.csv"):
    """Read flow-log records from a CSV file, skipping its header row.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    list[list[str]]
        One list of field strings per non-blank data row. An empty file, or
        a file holding only a header, yields an empty list.
    """
    # newline="" lets the csv module do its own line-ending handling, as the
    # csv documentation requires for files passed to csv.reader.
    with open(csv_path, "r", newline="") as fp:
        csvReader = csv.reader(fp)
        # The None default keeps an empty file from raising StopIteration.
        next(csvReader, None)
        # Blank lines parse to an empty list; drop them so every record
        # actually has fields to index into.
        return [row for row in csvReader if row]

In [316]:
def map2NetworkAddress(subnetMaskLength, ipAddress):
    """Return the network address of ``ipAddress`` for a given prefix length.

    ``ipAddress`` is a dotted string whose parts are parsed as integers. The
    first four parts are AND-ed with the four mask octets produced by
    ``subnetMaskOctects(subnetMaskLength)`` and re-joined with dots.

    The mask length is the first parameter so the function can be applied
    directly to a column of mask lengths with the address passed as an extra
    argument.
    """
    addressOctets = [int(part) for part in ipAddress.split(".")]
    maskOctets = subnetMaskOctects(subnetMaskLength)
    # Combine address and mask position by position over the four octets.
    return ".".join(str(addressOctets[i] & maskOctets[i]) for i in range(4))
    

In [317]:
def subnetMaskOctects(subnetMaskLength):
    """Expand a prefix length into a list of mask octets.

    Each group of 8 remaining bits contributes a full ``255`` octet; a final
    group of fewer than 8 bits contributes an octet with only that many high
    bits set. The list is padded with zeros up to four entries.
    """
    mask = []
    bitsLeft = subnetMaskLength
    while bitsLeft > 0:
        # Whole octet when 8 or more bits remain, otherwise only the top bits.
        octet = 255 if bitsLeft >= 8 else (255 << (8 - bitsLeft)) & 255
        mask.append(octet)
        bitsLeft -= 8
    # Pad out to four octets; a non-positive count adds nothing.
    mask += [0] * (4 - len(mask))
    return mask

In [368]:
# Read the flow-log rows (header row skipped) from vpc_flow_logs.csv.
vpcFlowLogs = loadVpcFlowLogs()

In [319]:
# Load the ranges here so this cell does not depend on a variable left over
# from an earlier kernel session (Restart & Run All must work).
awsIpRanges = loadAwsIpRanges()
# One row per entry of the document's "prefixes" list.
awsIpRangesDf = pd.DataFrame.from_dict(awsIpRanges["prefixes"])

In [363]:
# extractCidrParts adds the networkAddress / subnetMaskLength columns to
# awsIpRangesDf in place; classifyIpAddress below relies on them being present.
awsIpRangesDf.pipe(extractCidrParts).head()

Unnamed: 0,ip_prefix,region,service,network_border_group,networkAddress,subnetMaskLength
0,3.2.34.0/26,af-south-1,AMAZON,af-south-1,3.2.34.0,26
1,3.5.140.0/22,ap-northeast-2,AMAZON,ap-northeast-2,3.5.140.0,22
2,13.34.37.64/27,ap-southeast-4,AMAZON,ap-southeast-4,13.34.37.64,27
3,13.34.65.64/27,il-central-1,AMAZON,il-central-1,13.34.65.64,27
4,13.34.66.0/27,us-east-1,AMAZON,us-east-1,13.34.66.0,27


In [364]:
def classifyIpAddress(ipAddress, awsIpRangesDf):
    """Describe every AWS prefix in ``awsIpRangesDf`` that contains ``ipAddress``.

    For each row, the address is masked with that row's ``subnetMaskLength``
    and compared with the row's ``networkAddress``. Every matching row
    contributes a ``region_service_ip_prefix`` label; the labels are joined
    with commas. Returns an empty string when nothing matches.
    """
    # Network address the IP would have under each row's prefix length.
    candidateNetworks = awsIpRangesDf["subnetMaskLength"].apply(
        map2NetworkAddress, args=(ipAddress,)
    )
    isMatch = awsIpRangesDf["networkAddress"] == candidateNetworks
    matches = awsIpRangesDf[isMatch]
    labels = matches["region"] + "_" + matches["service"] + "_" + matches["ip_prefix"]
    return ",".join(labels)

In [369]:
# Classify both endpoints of every flow record against the AWS prefixes.
# Column 0 of a record is used as the source IP and column 1 as the destination IP.
classifiedSourceIps = []
classifiedDestinationIps = []
for record in vpcFlowLogs:
    sourceIp = record[0]
    classifiedSourceIps.append((sourceIp, classifyIpAddress(sourceIp, awsIpRangesDf)))
    destinationIp = record[1]
    classifiedDestinationIps.append((destinationIp, classifyIpAddress(destinationIp, awsIpRangesDf)))

# Print all sources first, then all destinations, one "ip,classification" per line.
for ip, classification in classifiedSourceIps + classifiedDestinationIps:
    print(f"{ip},{classification}")

10.0.0.6,
3.2.34.5,af-south-1_AMAZON_3.2.34.0/26,af-south-1_EC2_3.2.34.0/26
172.32.0.10,
10.1.0.67,
172.31.0.3,
172.31.0.15,
13.34.37.75,ap-southeast-4_AMAZON_13.34.37.64/27
172.31.0.3,
