In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import matplotlib as mpl
import geoip2.database

In [2]:
# Daily capture files, in chronological order: every day of July 2016,
# followed by 29 August through 13 September 2016.
files = [
    "./data/cmds_sequence_2016-{:02d}-{:02d}.csv".format(month, day)
    for month, days in (
        (7, range(1, 32)),    # 2016-07-01 .. 2016-07-31
        (8, range(29, 32)),   # 2016-08-29 .. 2016-08-31
        (9, range(1, 14)),    # 2016-09-01 .. 2016-09-13
    )
    for day in days
]

# Open the GeoLite2 country database; `reader` is the module-level handle
# that get_country_code() queries.
# NOTE(review): no reader.close() is visible in this part of the notebook.
reader = geoip2.database.Reader('./data/GeoLite2-Country.mmdb')
# Load the country table and expose its 'IP addresses' column as a Series
# keyed by the 'ISO country code' column.
# NOTE(review): read_csv's default NA handling parses the literal string "NA"
# as NaN. If this file lists Namibia's ISO code ("NA"), that key becomes NaN
# here -- confirm, and consider keep_default_na=False if so.
ip_mapping_df = pd.read_csv('./data/ip_addresses_country.csv')
ip_mapping = ip_mapping_df['IP addresses']
ip_mapping.index = ip_mapping_df['ISO country code']

def get_country_code(ip):
    """
    Looks up `ip` in the GeoLite2 country database opened as `reader`
    and returns the ISO code of the matching country record.
    Returns the string "Not in Database" when the lookup raises
    AddressNotFoundError.
    """
    try:
        response = reader.country(ip)
    except geoip2.errors.AddressNotFoundError:
        return "Not in Database"
    return response.country.iso_code
    

def read_data(path: str) -> pd.DataFrame:
    """
    Reads the csv at `path` into a pandas dataframe.

    Fields in the file are separated by a literal `$$`. The file has no header
    row, so the column names are supplied here. After loading, the raw
    'Commandlist' string is parsed into a python list, and a 'country' column
    is added by looking up each 'Src IP' with get_country_code().

    Parameters:
        path: location of the csv file to load.

    Returns:
        DataFrame with columns Timestamp, Src IP, Src Port, Dest IP,
        Dest Port, Commandlist (list) and country.
    """
    df = pd.read_csv(path,
                     # Raw string: "\$" is not a valid escape in a normal string
                     # literal. The separator is a regex matching a literal "$$".
                     sep=r"\$\$",
                     # Multi-character (regex) separators are only supported by the
                     # python engine; requesting it explicitly avoids the
                     # ParserWarning pandas emits when it has to fall back.
                     engine="python",
                     header=None,
                     names=["Timestamp", "Src IP", "Src Port", "Dest IP", "Dest Port", "Commandlist"],
                     index_col=False)
    df['Commandlist'] = df['Commandlist'].apply(parse_command_string)
    df['country'] = df['Src IP'].apply(get_country_code)
    return df

def login_attempts(data: pd.DataFrame) -> int:
    """
    Returns the number of login attempts the honeypot got for a day,
    i.e. the number of rows in `data` (one row per attempt).
    """
    # The result is a plain integer row count, not a Series.
    return len(data)

def dest_ip_counts(data: pd.DataFrame) -> pd.Series:
    """ Counts how often each destination IP address occurs in `data` """
    destination_ips = data["Dest IP"]
    return destination_ips.value_counts()

def dest_port_counts(data: pd.DataFrame) -> pd.Series:
    """ Counts how often each destination port occurs in `data` """
    destination_ports = data["Dest Port"]
    return destination_ports.value_counts()

def src_ip_counts(data: pd.DataFrame) -> pd.Series:
    """ Counts how often each source IP address occurs in `data` """
    source_ips = data["Src IP"]
    return source_ips.value_counts()

def src_port_counts(data: pd.DataFrame) -> pd.Series:
    """ Counts how often each source port occurs in `data` """
    source_ports = data["Src Port"]
    return source_ports.value_counts()

def combine_value_counts(s1: pd.Series, s2: pd.Series) -> pd.Series:
    """
    Merges two value_counts series into one.
    Keys present in both series get the sum of their two counts;
    a key present in only one series keeps its count (the missing side counts as 0).
    """
    def _sum_pair(left, right):
        return left + right

    merged = s1.combine(s2, _sum_pair, fill_value=0)
    return merged

def parse_command_string(commandstring: str) -> list:
    """ Turns a commandstring from the dataset into a python list of strings """
    # Drop two characters on each side: the opening "['" and the closing "']"
    inner = commandstring[2:-2]

    # What remains is the entries joined by quote-comma-space-quote
    return inner.split(r"', '")

def login_credentials(commandlist: list) -> str:
    """
    Builds `username:password` from a command list, taking the first entry as
    the username and the second entry as the password (the first two entries
    are usually the login/pass).

    Returns "invalid" when the list has fewer than two entries, or when either
    of the first two entries is longer than 10 characters (such entries are
    usually not usernames/passwords).
    """
    # Not enough entries to contain both a username and a password
    if len(commandlist) < 2:
        return "invalid"

    # Check the username first, then the password; stop at the first one that is too long
    if any(len(entry) > 10 for entry in commandlist[:2]):
        return "invalid"

    username, password = commandlist[0], commandlist[1]
    return "{}:{}".format(username, password)

def command_frequencies(commandlist: list) -> pd.Series:
    """
    Counts how many times each distinct command appears in `commandlist`
    and returns the counts as a pandas series.
    """
    commands = pd.Series(commandlist)
    frequencies = commands.value_counts()
    return frequencies
    
def command_counts(data: pd.DataFrame) -> pd.Series:
    """
    Returns value counts of issued commands.

    Flattens every list in the 'Commandlist' column of `data` into a single
    list of commands and counts how often each command occurs.
    """
    # Iterate over the column's values rather than indexing with
    # data['Commandlist'][i]: that form is a label lookup and only works when
    # the frame still has its default 0..n-1 index.
    all_commands = [
        command
        for commandlist in data['Commandlist']
        for command in commandlist
    ]

    # Turn the flattened list into a pandas series, and compute the value counts
    return pd.Series(all_commands).value_counts()

def login_counts(data: pd.DataFrame) -> pd.Series:
    """
    Returns value counts of login attempts.

    Converts every list in the 'Commandlist' column of `data` to its
    `username:password` string (or "invalid") with login_credentials(),
    and counts how often each resulting string occurs.
    """
    # Iterate over the column's values rather than indexing with
    # data['Commandlist'][i]: that form is a label lookup and only works when
    # the frame still has its default 0..n-1 index.
    credentials = [
        login_credentials(commandlist)
        for commandlist in data['Commandlist']
    ]

    # Turn the credentials list into a pandas series, and compute the value counts
    return pd.Series(credentials).value_counts()

def country_counts(data: pd.DataFrame) -> pd.Series:
    """ Counts how often each value of the 'country' column (src IP country) occurs """
    source_countries = data['country']
    return source_countries.value_counts()

In [5]:
# Sanity check on a single day: load the first file in `files` and print
# how many rows (login attempts) it contains.
data = read_data(files[0])
print("Login attempts", login_attempts(data))

Index(['AF', 'AX', 'AL', 'DZ', 'AS', 'AD', 'AO', 'AI', 'AQ', 'AG',
       ...
       'UZ', 'VU', 'VA', 'VE', 'VN', 'WF', 'EH', 'YF', 'ZM', 'ZW'],
      dtype='object', name='ISO country code', length=248)

In [6]:
# Accumulators for the per-day aggregations. Only `country_activity` is filled
# by the loop below; the others belong to the aggregations that are currently
# commented out inside the loop.
logins = []
# An explicit dtype is given because a bare pd.Series() emits a deprecation
# warning about the default dtype of empty Series on pandas 1.x. These series
# are meant to hold counts.
src_ips = pd.Series(dtype="int64")
src_ports = pd.Series(dtype="int64")
dst_ips = pd.Series(dtype="int64")
dst_ports = pd.Series(dtype="int64")
creds = pd.Series(dtype="int64")
commands = pd.Series(dtype="int64")
countries = pd.Series(dtype="int64")

# One list of daily counts per country code known to the mapping table
country_activity = {k: [] for k in ip_mapping.index.unique()}

top_8_credentials = [
    "root:xc3511",
    "P!:root",
    "root:root",
    "admin:1234",
    "P!:admin",
    "root:vizxv",
    "root:admin",
    "admin:admin"
]

daily_credential_counts = {k: [] for k in top_8_credentials}

for f in files:
    print("Reading file:", f)
    
    # Read data
    data = read_data(f)
    
#     logins.append(login_attempts(data))                                  # append login attempts of this day
    
#     src_ips = combine_value_counts(src_ips, src_ip_counts(data))         # append src ip counts of this day
#     src_ports = combine_value_counts(src_ports, src_port_counts(data))   # append src port counts of this day
    
#     dst_ips = combine_value_counts(dst_ips, dest_ip_counts(data))        # append dst ip counts of this day
#     dst_ports = combine_value_counts(dst_ports, dest_port_counts(data))  # append dst port counts of this day
#     login_count = login_counts(data)
#     creds = combine_value_counts(creds, login_count)              # append login counts of this day
#     commands = combine_value_counts(commands, command_counts(data)).sort_values(ascending=False).head(250)  # append command counts of this day
#     countries = combine_value_counts(countries, country_counts(data))
#     country_activity.append(country_counts(data)[country_activity_label])
    cc = country_counts(data)
    for country in country_activity.keys():
        # Append this day's count for the country, or 0 when it did not occur,
        # so every list gains exactly one entry per file.
        country_activity[country].append(cc.get(country, 0))
    

Reading file: ./data/cmds_sequence_2016-07-01.csv




Reading file: ./data/cmds_sequence_2016-07-02.csv
Reading file: ./data/cmds_sequence_2016-07-03.csv
Reading file: ./data/cmds_sequence_2016-07-04.csv
Reading file: ./data/cmds_sequence_2016-07-05.csv
Reading file: ./data/cmds_sequence_2016-07-06.csv
Reading file: ./data/cmds_sequence_2016-07-07.csv
Reading file: ./data/cmds_sequence_2016-07-08.csv
Reading file: ./data/cmds_sequence_2016-07-09.csv
Reading file: ./data/cmds_sequence_2016-07-10.csv
Reading file: ./data/cmds_sequence_2016-07-11.csv
Reading file: ./data/cmds_sequence_2016-07-12.csv
Reading file: ./data/cmds_sequence_2016-07-13.csv
Reading file: ./data/cmds_sequence_2016-07-14.csv
Reading file: ./data/cmds_sequence_2016-07-15.csv
Reading file: ./data/cmds_sequence_2016-07-16.csv
Reading file: ./data/cmds_sequence_2016-07-17.csv
Reading file: ./data/cmds_sequence_2016-07-18.csv
Reading file: ./data/cmds_sequence_2016-07-19.csv
Reading file: ./data/cmds_sequence_2016-07-20.csv
Reading file: ./data/cmds_sequence_2016-07-21.csv


In [7]:
# Display the per-country lists built by the loop above: each list holds one
# count per file read, in the order of `files`.
country_activity

{'AF': [8,
  14,
  19,
  17,
  12,
  11,
  3,
  8,
  14,
  14,
  0,
  4,
  5,
  3,
  8,
  6,
  19,
  7,
  4,
  5,
  4,
  9,
  8,
  6,
  3,
  12,
  4,
  2,
  10,
  6,
  2,
  274,
  255,
  276,
  232,
  195,
  101,
  301,
  43,
  5,
  6,
  19,
  16,
  4,
  0,
  2,
  9],
 'AX': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'AL': [49,
  58,
  46,
  49,
  38,
  30,
  15,
  19,
  144,
  9,
  19,
  24,
  22,
  40,
  28,
  30,
  156,
  13,
  140,
  14,
  10,
  40,
  116,
  28,
  141,
  14,
  242,
  14,
  16,
  8,
  9,
  260,
  285,
  111,
  190,
  183,
  42,
  254,
  42,
  23,
  78,
  90,
  110,
  49,
  1,
  26,
  32],
 'DZ': [23,
  18,
  23,
  17,
  17,
  13,
  8,
  11,
  10,
  28,
  114,
  145,
  11,
  12,
  9,
  15,
  135,
  12,
  8,
  3,
  8,
  8,
  10,
  16,
  15,
  15,
  9,
  11,
  15,
  7,
  5,
  120