In [1]:
import os
import time
import copy
import json
import boto.s3
import logging
import smtplib
import psycopg2
import numpy as np
import datetime
import StringIO

from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter
from avro import datafile, io, schema
from datetime import datetime, timedelta, date
from boto.s3.key import Key
from boto.s3.connection import S3Connection
from collections import defaultdict, Counter
from netaddr import IPNetwork, IPAddress

from agg_config import *

%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
class SenderModelReader(object):
  """Read a Sender Model build stored as an Avro file in S3.

  Args:
    aws_conf: keyword arguments forwarded unchanged to ``S3Connection``.
    bucket_name: name of the bucket that holds the model file.
    key: name of the key (object) inside that bucket.
  """

  def __init__(self, aws_conf, bucket_name, key):
    self.s3_conn = S3Connection(**aws_conf)
    self.bucket = self.s3_conn.get_bucket(bucket_name)
    self.key = key

  def fetch(self):
    """Download the model file and decode every Avro record in it.

    Returns:
      A list of ``(version, threshold, models)`` tuples, one per record.

    Raises:
      IOError: if the bucket has no key with the configured name.
    """
    key = self.bucket.get_key(self.key)
    if key is None:
      # Name the missing key instead of failing later with an
      # AttributeError on None.
      raise IOError('S3 key not found: {!r}'.format(self.key))
    data_file = StringIO.StringIO()
    key.get_file(data_file)
    reader = DataFileReader(data_file, DatumReader())
    try:
      return [(rec['version'], rec['threshold'], rec['models']) for rec in reader]
    finally:
      # Release the reader (and the in-memory buffer behind it) even if decoding fails.
      reader.close()

## Grab user feedback from Cousteau

In [None]:
# Load every 'authenticity' feedback row into `data`.
# Each row is (suggestion, context, organization_id, comments).
# `db_conf` is not defined in this notebook; presumably it comes from the
# `agg_config` star import -- confirm.
db_conn = psycopg2.connect(host=db_conf['host'], user=db_conf['user'], database=db_conf['database'])
try:
    cursor = db_conn.cursor()
    try:
        cursor.execute("select suggestion, context, organization_id, comments from user_response where suggestion_type = 'authenticity';")
        data = cursor.fetchall()
    finally:
        cursor.close()
finally:
    # The rows are fully in memory, so release the connection right away
    # rather than leaving it open for the rest of the kernel session.
    db_conn.close()


In [None]:
# Index the authenticity feedback by domain:
#   fb_auth[domain]     -> value of the row's 'ip' match
#   org_info[domain]    -> organization_id of the row
#   fb_comments[domain] -> comments of the row
# Only rows whose suggestion has input_type == 'ip_list' are kept. A later row
# for the same domain replaces the earlier one.
fb_auth = {}
org_info = {}
fb_comments = {}
count = 0
skipped = 0
for row in data:
    suggestion = json.loads(row[0])
    oid = row[2]
    comments = row[3]

    # Reset per row: otherwise a suggestion without a 'domain' or 'ip' match
    # would silently reuse the values left over from the previous row.
    domain = None
    ips = None
    for m in suggestion['matches']:
        if m['field'] == 'domain':
            domain = m['value'][0]
        elif m['field'] == 'ip':
            ips = m['value']

    if suggestion['input_type'] != 'ip_list':
        continue
    if domain is None or ips is None:
        skipped += 1
        continue

    count += 1
    fb_auth[domain] = ips
    org_info[domain] = oid
    fb_comments[domain] = comments

print(count)
print(len(data))
if skipped:
    print('{} ip_list rows skipped (no domain or ip match)'.format(skipped))

## Grab the latest Sender Model build

In [None]:
# Download one Sender Model build and keep the third field of its first
# record (the `models` entry of the tuples returned by fetch()).
smr = SenderModelReader(
    aws_conf,
    'agari-prod-ep-metadata',
    'models/sendermodels-2016-08-22_02:25:57.avro',
)
sm = smr.fetch()
first_build = sm[0]
sms = first_build[2]

In [None]:
# sm_auth[domain][cidr] -> score, taken from each model's 'cidrs' entries.
# Both levels are defaultdicts, so reading a missing key inserts it.
sm_auth = defaultdict(lambda: defaultdict(float))
for model in sms:
    model_domain = model['domain']
    for entry in model['cidrs']:
        sm_auth[model_domain][entry['cidr']] = entry['score']


In [None]:
# Spot-check one domain: the feedback IPs, the model's scores for it, and the mean score.
domain = 'pacificskymortgage.com'
print(fb_auth[domain])
model_scores = sm_auth[domain]
print(model_scores)
print(np.mean(model_scores.values()))

In [None]:
# Count how many feedback domains have any entry in the sender model.
# sm_auth is a defaultdict, so indexing it never raises and would insert an
# empty entry; .get() avoids both, and an empty mapping counts as a miss.
hit = 0
miss = 0
for domain in fb_auth:
    if sm_auth.get(domain):
        hit += 1
    else:
        miss += 1

print('For domains: {} hits, {} misses'.format(hit, miss))

In [None]:
# For each feedback (domain, ip-or-cidr) pair, collect the sender-model
# entries of that domain that fall inside the feedback network:
#   fb_scores[domain][fb_ip]  -> list of model scores
#   fb_ips_map[domain][fb_ip] -> list of the matching model IPs
fb_scores = defaultdict(lambda: defaultdict(list))
fb_ips_map = defaultdict(lambda: defaultdict(list))
for domain, fb_ips in fb_auth.items():
    # .get() keeps this read-only; indexing the defaultdict would add an
    # empty entry to sm_auth for every domain the model has never seen.
    model_scores = sm_auth.get(domain)
    if not model_scores:
        continue
    for fb_ip in fb_ips:
        fb_network = IPNetwork(fb_ip)  # parsed once per feedback entry, not once per model IP
        for ip, score in model_scores.items():
            if IPAddress(ip) in fb_network:  # only grab SM ips which have feedback for that domain
                fb_scores[domain][fb_ip].append(score)
                fb_ips_map[domain][fb_ip].append(ip)


## Individual domain-ip scores

In [None]:
# Flatten fb_scores into one list holding every matched model score.
individual_ip_scores = [
    score
    for per_cidr in fb_scores.values()
    for cidr_scores in per_cidr.values()
    for score in cidr_scores
]

In [None]:
# Histogram of the individual scores over 50 evenly spaced bin edges on [0, 1].
bins = np.linspace(0.0, 1.0, 50)
fig, ax = plt.subplots()
ax.hist(individual_ip_scores, bins=bins, color='b', alpha=0.5)
ax.set_title('Individual domain-IP auth scores')
plt.show()

## CIDR-averaged scores

In [None]:
# One mean score per (domain, feedback ip/cidr) pair.
averaged_ip_scores = [
    np.mean(cidr_scores)
    for per_cidr in fb_scores.values()
    for cidr_scores in per_cidr.values()
]

In [None]:
# Histogram of the per-pair mean scores over 50 evenly spaced bin edges on [0, 1].
bins = np.linspace(0.0, 1.0, 50)
fig, ax = plt.subplots()
ax.hist(averaged_ip_scores, bins=bins, color='b', alpha=0.5)
ax.set_title('CIDR-averaged auth scores')
plt.show()

In [None]:
# Collect, per organization, every (domain, feedback ip/cidr) pair whose mean
# model score is at or below the cutoff. Each record is
#   (domain, matching model IPs, min, mean, max, number of scores, comments).
auth_cutoff = 0.3
dom_ip_to_investigate = defaultdict(list)
for dom, per_cidr in fb_scores.items():
    dom_comments = fb_comments[dom]
    for cidr, cidr_scores in per_cidr.items():
        mean_score = np.mean(cidr_scores)
        if mean_score <= auth_cutoff:
            record = (
                dom,
                fb_ips_map[dom][cidr],
                min(cidr_scores),
                mean_score,
                max(cidr_scores),
                len(cidr_scores),
                dom_comments,
            )
            dom_ip_to_investigate[org_info[dom]].append(record)

# Per-organization summary, sorted by organization id.
for o in sorted(dom_ip_to_investigate):
    n_items = len(dom_ip_to_investigate[o])
    print('Org {} - {} items'.format(str(o).rjust(2), str(n_items).rjust(3)))

In [None]:
# Dump every flagged record for one organization:
# (domain, model IPs), then the (min, mean, max, count) stats, then the comments.
org = 54
for record in list(dom_ip_to_investigate[org]):
    print((record[0], record[1]))
    print(record[2:-1])
    print(record[-1])
    print('\n')

In [None]:
# Size statistics for fb_scores: feedback entries per domain, and matched scores per entry.
cidrs_per_domain = [len(per_cidr) for per_cidr in fb_scores.values()]
scores_per_cidr = [
    len(cidr_scores)
    for per_cidr in fb_scores.values()
    for cidr_scores in per_cidr.values()
]

# Things to improve for authenticity

Examples:

Org 1
- (u'theflyzikgroup.com', ['209.17.115.114','209.17.115.115','209.17.115.116','209.17.115.39','209.17.115.43','209.17.115.50'])
    - These are fixed - PTR neighborhood bug
- (u'cloud.sophos.com', [u'208.70.210.247'])
    - These are fixed - PTR neighborhood bug

Org 54
- (u'coloradocyclist.com', [u'198.2.138.130', u'198.2.131.34', u'198.2.182.209', u'198.2.139.204', u'198.2.183.30', u'198.2.129.201', u'198.2.183.196', u'198.2.129.196', u'198.2.129.194', u'198.2.182.130', u'198.2.182.133', u'198.2.129.93', u'198.2.130.85', u'198.2.129.99', u'198.2.183.122', u'198.2.181.74', u'198.2.130.66', u'198.2.131.16', u'198.2.182.142', u'198.2.130.91', u'198.2.131.240', u'198.2.187.223', u'198.2.183.53', u'198.2.190.234', u'198.2.183.112', u'198.2.183.119', u'198.2.139.167', u'198.2.190.16', u'198.2.129.152', u'198.2.139.154', u'198.2.190.207', u'198.2.138.147', u'198.2.129.175', u'198.2.181.223', u'198.2.190.214', u'198.2.190.215', u'198.2.181.28', u'198.2.130.188', u'198.2.138.168', u'198.2.138.164', u'198.2.130.51', u'198.2.183.9', u'198.2.130.9', u'198.2.182.181', u'198.2.183.93', u'198.2.190.44', u'198.2.181.82', u'198.2.130.4', u'198.2.129.225', u'198.2.129.223', u'198.2.138.159', u'198.2.190.173', u'198.2.138.33', u'198.2.131.162', u'198.2.183.209', u'198.2.182.97', u'198.2.138.227', u'198.2.182.99', u'198.2.139.194', u'198.2.182.7', u'198.2.129.134', u'198.2.190.244', u'198.2.131.172', u'198.2.131.173'])
    - ** Mean auth of these is 0.27. Jeez, can I just add a heuristic saying that if >= X PTR-neighborhood IPs send for a domain, it's authentic? **

Org 56
- https://ep.agari.com/messages?interval_days=54&size=200&sort_by=ts&start_date=2016-07-01&unicode_domain=valleycare.com
    - (u'valleycare.com', [u'68.232.129.206', u'68.232.135.60'])
    - MX & PTR mismatch - looks like a one-off. EP-1822
- https://ep.agari.com/receiver-ip/66.231.95.69?interval_days=60
    - (u'uhg.com', [u'66.231.95.69'])
    - ** This IP sends 10,000s of messages per day (usually) on behalf of uhg.com **
- https://ep.agari.com/messages?interval_days=30&ip=184.94.241.96&start_date=2016-07-25&unicode_domain=cerner.com
    - (u'cerner.com', [u'184.94.241.96'])
    - ** Comments: "Top two IPs are in the SPF record, with SPF record updates to sender models we will get these." - need to check SPF infra file and make sure this is true **
    - ** Also, periodicity for the IP as a whole: https://ep.agari.com/receiver-ip/184.94.241.96?interval_days=60 **
- https://ep.agari.com/receiver-domain/uhgrecruitmentservices.com?interval_days=30
    - ** Holy crap, why are the IPs 169.54.226.{68|69|70} not getting picked up in SM? 18k messages each over 30 days **

Org 61
- https://ep.agari.com/receiver-ip/174.37.239.162?interval_days=60
    - (u'pacbell.net', [u'174.37.239.162'])
    - ** sends 2 messages every month on the 11th (periodicity feature) **
- https://ep.agari.com/receiver-ip/208.87.208.8?interval_days=60
    - (u'united.com', [u'208.87.208.8'])
    - don't see this anymore
- https://ep.agari.com/receiver-ip/83.140.23.85?interval_days=60
    - (u'starwoodhotels.com', [u'83.140.23.85'])
    - only one message in last few months

Org 64:
- https://ep.agari.com/receiver-ip/69.63.131.201?interval_days=60&message_type=no_auth
    - (u'sru.org', [u'69.63.131.201'])
    - 6 messages over 2 days in last 60 days - ???
- https://ep.agari.com/messages?interval_days=60&ip=161.253.198.11&start_date=2016-06-24&unicode_domain=wakehealth.edu
    - (u'wakehealth.edu', [u'161.253.198.11'])
    - appears to be mailing list
- (u'acr.org', ['103.28.42.112','167.89.4.19','103.28.42.109','203.55.21.17'])
    - asked aarmstrong about these - comment is 'IP address that are part of 3rd parties with IP ranges in the Sender Inventory'
    - ** "If we've marked a known range of IP's from a 3rd party good, and another pops up in the same /24, that should be weighted so that it's not an instant spoof (assuming same sending characteristics as the marked good neighbor IP)" - Alex **
- https://ep.agari.com/receiver-domain/mdanderson.org?interval_days=60
    - (u'mdanderson.org', [u'161.253.198.23','38.105.65.115'])
    - ** mailing lists **
- https://ep.agari.com/receiver-domain/acep.org?interval_days=60&message_type=no_auth&size=100
    - (u'acep.org', [u'198.37.147.137','50.31.33.53','198.37.147.117'])
    - ** mailing lists in same PTR neighborhood, but low volume **
- https://ep.agari.com/messages?interval_days=63&ip=67.216.225.44&start_date=2016-06-21&unicode_domain=nasci.org
    - (u'nasci.org', [u'67.216.225.44'])
    - 9 messages, all on 1 day - looks pretty inauthentic
- https://ep.agari.com/messages?interval_days=60&ip=66.35.59.40&start_date=2016-06-24&unicode_domain=ochsner.org
    - (u'ochsner.org', [u'66.35.59.40'])
    - ** mailing list, single day/message **

Org 66:
- (u'identitydirect.co.uk', [u'103.245.145.220'])
    - Just a single IP, no neighborhood, low sending volume - nothing to see here

Org 67:
- (u'spencerstuart.com', [u'79.170.244.144'])
    - Doesn't show up in prod anymore
- https://ep.agari.com/messages?interval_days=60&ip=149.169.2.72&start_date=2016-06-25&unicode_domain=pps.agari
    - (u'pps.agari', [u'149.169.2.72'])
    - ** Marked as a forwarder (.edu host) -- potential forwarder IP hopping candidate **

Org 69:
- (u'ip-172-18-19-67.ec2.internal', [u'184.72.111.30']) AND (u'ip-172-18-1-73.ec2.internal', [u'52.90.100.134'])
    - ** These are EC2 instances sending alerts. They send messages to a lot of people, but only on a few days - is there potential for a breadth-type feature here? **


** - **
** Other things to note **
- When we see an absolutely new domain for the first time, and it's scored via incremental, it will get a 5.0 identity trust score. However, once we run a full model build, the reputation will be lower than this - usually around 4.3. This is because the full model build uses the consistency feature, while incremental does not. Should we change this? It looks weird for a domain to start at 5.0, drop to 4.something, and then potentially rise up again as we see more traffic.

In [None]:
# Spot-check one (domain, ip) pair in the sender model. sm_auth is a nested
# defaultdict(float), so a pair the model does not contain prints 0.0.
print(sm_auth['bankofamerica.com']['121.33.38.170'])

## Grab reputation feedback from database

In [None]:
# Load every 'reputation' feedback row into `data2`.
# Each row is (suggestion, context, organization_id, comments).
# `db_conf` is not defined in this notebook; presumably it comes from the
# `agg_config` star import -- confirm.
db_conn = psycopg2.connect(host=db_conf['host'], user=db_conf['user'], database=db_conf['database'])
try:
    cursor = db_conn.cursor()
    try:
        cursor.execute("select suggestion, context, organization_id, comments from user_response where suggestion_type = 'reputation';")
        data2 = cursor.fetchall()
    finally:
        cursor.close()
finally:
    # The rows are fully in memory, so release the connection right away
    # rather than leaving it open for the rest of the kernel session.
    db_conn.close()


In [None]:
# fb_rep[domain] -> organization id (as a string) for every reputation rule
# labelled 'trusted'. The domain is the first value of the rule's first match;
# a later row for the same domain replaces the earlier one.
fb_rep = {}
for row in data2:
    rule = json.loads(row[0])
    rule_domain = rule['matches'][0]['value'][0]
    org_id = int(row[2])
    if rule['label'] != 'trusted':
        continue
    fb_rep[rule_domain] = str(org_id)


In [None]:
# Number of trusted domains per organization, in sorted key order,
# followed by the type of the last key printed.
orgs = Counter(fb_rep.values())
for x, n_domains in sorted(orgs.items()):
    print('{} {}'.format(x, n_domains))
print(type(x))

## Grab reputation model

In [None]:
# Fetch one reputation model file from S3 and parse it as JSON.
bucket_name = 'agari-prod-ep-metadata'
rep_key = 'activereputation/reputation-1472029516.15.json'
rep_bucket = S3Connection(**aws_conf).get_bucket(bucket_name)
rep = json.load(rep_bucket.get_key(rep_key))

In [None]:
# model_rep[domain][org as string] -> the model's value for that pair.
# Assumes each entry of `rep` is [domain, {org: value}] -- confirm against the file format.
model_rep = defaultdict(lambda: defaultdict(list))
for entry in rep:
    entry_domain, per_org = entry[0], entry[1]
    for org_id, value in per_org.items():
        model_rep[entry_domain][str(org_id)] = value

In [None]:
# Look up the model entry for every trusted (domain, org) feedback pair.
# model_rep is a nested defaultdict(list), so an unknown pair yields [].
rep_scores = defaultdict(lambda: defaultdict(list))
for fb_domain, fb_org in fb_rep.items():
    org_key = str(fb_org)
    rep_scores[fb_domain][org_key] = model_rep[fb_domain][org_key]

In [None]:
# Collect 1 - (first element of the model entry) for every trusted pair.
# Pairs with no usable entry are counted as misses instead of aborting.
rep_scores_hist = []
miss = 0
for domain, scores in rep_scores.items():
    for org in scores:
        try:
            rep_scores_hist.append(1 - scores[org][0])
        except (IndexError, KeyError, TypeError):
            # Empty or malformed entry. Catching only these lets real
            # failures (and Ctrl-C) propagate, unlike a bare `except:`.
            miss += 1

print('{} scores missed'.format(miss))

In [None]:
# Histogram of the collected reputation scores over 50 evenly spaced bin edges on [0, 1].
bins = np.linspace(0.0, 1.0, 50)
fig, ax = plt.subplots()
ax.hist(rep_scores_hist, bins=bins, color='b', alpha=0.5)
ax.set_title('Reputation scores - trusted feedback')
plt.show()

In [None]:
# List the trusted (domain, org) pairs whose 1 - (first element) value is
# below 0.3, together with the full model entry for manual inspection.
score_reasons = []
miss = 0
for domain, scores in rep_scores.items():
    for org in scores:
        try:
            trusted_score = 1 - scores[org][0]
        except (IndexError, KeyError, TypeError):
            # Empty or malformed entry. Catching only these lets real
            # failures (and Ctrl-C) propagate, unlike a bare `except:`.
            miss += 1
            continue
        if trusted_score < 0.3:
            score_reasons.append((domain, org, scores[org]))

print('{} scores missed\n\n'.format(miss))

for s in score_reasons:
    print(s[0])
    print(s[1])
    print('{} \n'.format(s[2]))

# Things to improve for reputation

* ** Whitelist AWS? - "ec2.internal" hdr_from domain suffix? Or a domain suffix rule for reputation? e.g. we've seen a lot of XXXX.ec2.internal domains, can we improve reputation for them all? **
* ** Removing cousin domain hit if registration age is old enough? **
* ** Should .gov domains be considered as pop-up? **
