## Using Shodan and Farsight DNSDB for research on hacker groups
- starting with [Shodan](https://developer.shodan.io/api)
- [Farsight](https://docs.dnsdb.info/) is still a big TODO as of now.

In [1]:
import pandas as pd
import requests
import json
import time
from pathlib import Path
import datetime
import yaml
import sys

In [2]:
# Load the API credentials from the local YAML file.
with open('apikeys.yaml', 'r') as api_keys:
    load_keys = yaml.safe_load(api_keys)

# The parsed mapping is all we need from here on; pick out the Shodan key.
SHODAN_KEY = load_keys['apikeys']['shodan']['key']

In [3]:
# Shodan setup.
# Only SSL information is of interest for now, so the host search
# endpoint is all that is configured here.
# Full API reference: https://developer.shodan.io/api

HOST_SEARCH = '/shodan/host/search'
SHODAN_URL = "https://api.shodan.io"

# Fingerprint of an SSL certificate that was being used by APT32.
# Background: https://web.br.de/interaktiv/ocean-lotus/en/
SEARCH_QUERY = 'ssl:196113228a9c7dc615a43c4431dc2bb327c43b2c'

In [4]:
def main(query):
    """Search Shodan's host index and return the raw matches.

    Args:
        query: Shodan search string, e.g. ``'ssl:<certificate fingerprint>'``.

    Returns:
        A list with one dict per match when Shodan reports results,
        ``None`` when the request succeeded but nothing matched, or an
        error message (``str``) when the response status is not 200.
        Callers have to tell the error string apart from a result list.
    """
    # Send the caller's query rather than the module-level default, so the
    # function can be reused for other searches.
    params = {'key': SHODAN_KEY,
              'query': query}

    resp = requests.get(SHODAN_URL + HOST_SEARCH, params=params)

    # Anything other than 200 is reported back as a message.
    if resp.status_code != 200:
        return f'There was a problem with the request: The error code is: {resp.status_code}'

    # Shodan stores its results in a dict under the key 'matches'.
    matches = resp.json()['matches']

    # The request was successful, but there were no results.
    if not matches:
        return None

    return list(matches)

In [5]:
def process_results(search_results):
    """Reduce raw Shodan matches to the handful of fields we track.

    Args:
        search_results: Iterable of match dicts as returned by the Shodan
            host search.

    Returns:
        A list of flat dicts, one per match, with the keys ``hash``,
        ``hostnames``, ``ip_str``, ``asn``, ``isp``, ``domains``,
        ``timestamp``, ``ssl_fingerprint`` and ``ssl_serial`` (in that
        order).

    Raises:
        KeyError: If a match lacks one of the copied fields.
    """
    # Fields copied one-to-one from the top level of each match.
    top_level_fields = ('hash', 'hostnames', 'ip_str', 'asn', 'isp',
                        'domains', 'timestamp')

    processed_results = []

    # The Shodan API returns a lot of information; we only keep some of it.
    for result in search_results:
        record = {field: result[field] for field in top_level_fields}

        # The certificate details are nested, so they are flattened here.
        cert = result['ssl']['cert']
        record['ssl_fingerprint'] = cert['fingerprint']['sha1']
        record['ssl_serial'] = cert['serial']

        processed_results.append(record)

    return processed_results

In [6]:
def rename_files():
    """Return today's date formatted as ``YYYY-MM-DD``, e.g. ``2020-06-04``.

    The string is used as a prefix for the output file names.
    """
    today = datetime.datetime.today()
    return today.strftime('%Y-%m-%d')

In [23]:
def compare_dataframes(new_dataframe, old_dataframe):
    """Return the rows of ``new_dataframe`` whose IP was not seen before.

    For now only changes in IP are of interest; ISP and/or ASN changes
    might be looked at in the future.

    Args:
        new_dataframe: DataFrame with the current scan; needs an
            ``ip_str`` column unless it is empty.
        old_dataframe: DataFrame with the previous scan.

    Returns:
        A DataFrame holding every row of ``new_dataframe`` whose ``ip_str``
        does not occur in ``old_dataframe`` (original index labels are
        kept), or ``None`` when there is no such row.
    """
    # Nothing scanned means nothing new.
    if new_dataframe.empty:
        return None

    # An old frame without the column simply contributes no known IPs.
    if 'ip_str' in old_dataframe.columns:
        known_ips = set(old_dataframe['ip_str'])
    else:
        known_ips = set()

    # Exact membership test: a substring check would treat '3.3.3.3' as
    # already known if '13.3.3.33' had been seen.
    is_new = ~new_dataframe['ip_str'].isin(known_ips)

    # Every row is checked, not just the first one.
    final_df = new_dataframe[is_new]

    if final_df.empty:
        return None
    return final_df

In [17]:
# Example of what a results DataFrame looks like, loaded from a saved
# sample file.

test_df = pd.read_csv('testing.csv')
test_df

Unnamed: 0,hash,hostnames,ip_str,asn,isp,domains,timestamp,ssl_fingerprint,ssl_serial
0,-1846879901,[],141.98.212.23,AS206804,EstNOC,[],2020-06-27T08:09:52.961301,196113228a9c7dc615a43c4431dc2bb327c43b2c,-41833885383437277866864086923528812307
1,440567427,[],185.183.96.125,AS60117,Host Sailor Ltd.,[],2020-06-25T13:25:24.919870,196113228a9c7dc615a43c4431dc2bb327c43b2c,-41833885383437277866864086923528812307
2,-233915422,[],103.91.64.99,AS55720,Gigabit Hosting Sdn Bhd,[],2020-06-30T16:23:43.191273,196113228a9c7dc615a43c4431dc2bb327c43b2c,-41833885383437277866864086923528812307
3,1228471873,"[""li2078-235.members.linode.com""]",172.105.169.235,AS63949,Linode,"[""linode.com""]",2020-07-08T18:19:12.877844,196113228a9c7dc615a43c4431dc2bb327c43b2c,-41833885383437277866864086923528812307
4,1507128082,[],141.98.215.122,AS206804,EstNOC,[],2020-07-09T06:31:32.439329,196113228a9c7dc615a43c4431dc2bb327c43b2c,-41833885383437277866864086923528812307
5,784331945,[],204.16.247.183,AS20326,TeraSwitch Networks,[],2020-07-08T04:13:38.269750,196113228a9c7dc615a43c4431dc2bb327c43b2c,-41833885383437277866864086923528812307
6,901252180,"[""snkr.taxpayersdime.com""]",43.254.132.206,AS131447,POPIDC powered by CSLoxinfo,"[""taxpayersdime.com""]",2020-07-09T04:00:40.265991,196113228a9c7dc615a43c4431dc2bb327c43b2c,-41833885383437277866864086923528812307
7,822858718,"[""ip195.ip-198-50-191.net""]",198.50.191.195,AS16276,OVH SAS,"[""ip-198-50-191.net""]",2020-07-03T01:43:58.187197,196113228a9c7dc615a43c4431dc2bb327c43b2c,-41833885383437277866864086923528812307
8,-1203743915,"[""li1678-152.members.linode.com""]",172.104.68.152,AS63949,Linode,"[""linode.com""]",2020-07-02T23:51:56.693956,196113228a9c7dc615a43c4431dc2bb327c43b2c,-41833885383437277866864086923528812307


In [18]:
# Start the query, process the API results, generate a DataFrame.
start_search = main(SEARCH_QUERY)

# main() reports a failed request as an error string. A non-empty string is
# truthy, so it must be caught before the result check below; otherwise the
# message would be processed as if it were a list of matches.
if isinstance(start_search, str):
    # Passing the message to sys.exit prints it and exits with status 1.
    sys.exit(start_search)

# Check if we have some results to work with first.
if start_search:
    search_processed = process_results(start_search)
    new_df = pd.DataFrame(search_processed)
    new_df.head(1)

# There are no results to work with. We're creating an empty DataFrame,
# storing it as CSV and exiting the script.
else:
    empty_df = pd.DataFrame(columns=['hash', 'hostnames', 'ip_str', 'asn', 'isp', 'domains', 'timestamp',
                                 'ssl_fingerprint', 'ssl_serial'])
    get_time = rename_files()
    empty_df.to_csv(get_time + 'empty.csv',index=False)
    sys.exit(0)

In [19]:
# Read in the results from last time's scan.
# On the very first run there is no previous scan on disk yet; in that case
# start from an empty baseline with the same columns, so that every current
# result counts as new instead of the script stopping with an error.
try:
    old_df = pd.read_csv('old_df.csv')
except FileNotFoundError:
    old_df = pd.DataFrame(columns=['hash', 'hostnames', 'ip_str', 'asn', 'isp', 'domains', 'timestamp',
                                   'ssl_fingerprint', 'ssl_serial'])
old_df

Unnamed: 0,hash,hostnames,ip_str,asn,isp,domains,timestamp,ssl_fingerprint,ssl_serial
0,-1320823844,[],172.93.165.74,AS55720,Nexeon Technologies,[],2020-10-12T11:30:38.307972,196113228a9c7dc615a43c4431dc2bb327c43b2c,-41833885383437277866864086923528812307


In [20]:
# An empty DataFrame with the result columns; it is meant to be populated
# with our new findings.
query_df = pd.DataFrame(
    columns=[
        'hash',
        'hostnames',
        'ip_str',
        'asn',
        'isp',
        'domains',
        'timestamp',
        'ssl_fingerprint',
        'ssl_serial',
    ]
)
query_df

Unnamed: 0,hash,hostnames,ip_str,asn,isp,domains,timestamp,ssl_fingerprint,ssl_serial


In [24]:
# Compare the fresh scan against the previous one.
check_results = compare_dataframes(new_df, old_df)

# compare_dataframes returns either a DataFrame or None. The truth value of
# a DataFrame is ambiguous (pandas raises ValueError), so test explicitly.
# The rows to report are the returned frame itself; there is no module-level
# `final_df` to iterate over.
if check_results is not None and not check_results.empty:
    for index, row in check_results.iterrows():
        print('There are new results: {}'.format(row['ip_str']))
else:
    print('There are no new findings.')

There are no new findings.


In [22]:
# Timestamp used to label this run's output file.
get_time = rename_files()

# Archive the current results under a dated file name.
new_df.to_csv(f'{get_time}_results.csv', index=False)

# The next run opens 'old_df.csv' as its baseline at the beginning of the
# script, so the current results are written there as well.
old_df = new_df
old_df.to_csv('old_df.csv', index=False)

In [25]:
# TODO Shodan
# each new query gets its own folder
# ssl:111 -> str.replace(':', '_') -> ssl_111
# Deal with NameError: name 'final_df' is not defined

# TODO pDNS
# everything