In [72]:
# Application names that are dropped before the per-application statistics are
# computed (matched against the `normalized_app` column). Note that 'None' is
# the literal string, not the Python None object.
APP_NAMES_EXCLUDE_LIST = [
    'None',
    'SSH',
    'SSL',
    'HTTP',
    'DNS',
    'HTTP2',
    'junos-https',
    'junos-http',
]

In [73]:
# Load the flow-event parquet files from every hr=* partition under dt=2020-10-28.
df = spark.read.parquet(
    's3://mist-secorapp-production/srx-flow-events-analytics/srx-flow-events-analytics-production/dt=2020-10-28/hr=*/*'
)

# Total number of rows read (this triggers the actual scan).
df.count()




22492243

In [74]:
# Keep only rows with event_type == 2 and cache them, since every later cell
# starts from this DataFrame.
# NOTE(review): event_type 2 is presumably the "session completed" event,
# judging by the variable name -- confirm against the event schema.
df_completed_session = (
    df
    .where('event_type == 2')
    .persist()
)

# Materialise the cache and report how many rows survived the filter.
df_completed_session.count()

14175454

In [75]:
# Show the column names and types available on the filtered flow events.
df_completed_session.printSchema()

root
 |-- chassis_mac: string (nullable = true)
 |-- org_id: string (nullable = true)
 |-- site_id: string (nullable = true)
 |-- srxipaddress: string (nullable = true)
 |-- event_type: integer (nullable = true)
 |-- event_occur_time: long (nullable = true)
 |-- flow_session_id: integer (nullable = true)
 |-- src_addr: string (nullable = true)
 |-- dst_addr: string (nullable = true)
 |-- src_port: integer (nullable = true)
 |-- dst_port: integer (nullable = true)
 |-- protocol_id: integer (nullable = true)
 |-- session_close_reason: string (nullable = true)
 |-- session_total_time: integer (nullable = true)
 |-- secuirty_policy_name: string (nullable = true)
 |-- src_zone_name: string (nullable = true)
 |-- dst_zone_name: string (nullable = true)
 |-- out_interface_name: string (nullable = true)
 |-- routing_instance: string (nullable = true)
 |-- pkts_from_client: long (nullable = true)
 |-- bytes_from_client: long (nullable = true)
 |-- pkts_from_server: long (nullable = true)
 |-- b

In [76]:
# Number of completed-session rows per SRX box (one result row per chassis_mac).
srx_session_counts = (
    df_completed_session
    .groupBy('chassis_mac')
    .count()
    .collect()
)

for row in srx_session_counts:
    print('srx_mac {}: session_count {}'.format(row['chassis_mac'], row['count']))

srx_mac 5800bba79000: session_count 25863
srx_mac d8b122d38781: session_count 14325
srx_mac d0dd49ebba7a: session_count 18588
srx_mac fc334262af00: session_count 3850
srx_mac 4c96144a3200: session_count 20086
srx_mac 4c96144ff300: session_count 422
srx_mac 30b64f25ce40: session_count 129836
srx_mac 4c9614067d00: session_count 21168
srx_mac fc33426df201: session_count 29218
srx_mac f07cc745e489: session_count 133302
srx_mac fc334263c380: session_count 871947
srx_mac c8e7f0c878e9: session_count 63213
srx_mac ec13dbdcd600: session_count 554
srx_mac d8b122ae8b00: session_count 2019
srx_mac 4c9614b2b000: session_count 21453
srx_mac 4c9614abae00: session_count 21474
srx_mac 0c81262d7e84: session_count 83984
srx_mac fc3342640600: session_count 448083
srx_mac 4c961411b900: session_count 284517
srx_mac 94f7ad22c581: session_count 2909
srx_mac 5800bbdca0bc: session_count 139
srx_mac 5800bbdc5fbc: session_count 10648
srx_mac 4c9614334100: session_count 20129
srx_mac 0c81262dfa04: session_count 73

In [77]:
# Number of distinct source addresses seen by each SRX box.
# The (chassis_mac, src_addr) pairs are de-duplicated first, so counting rows
# per chassis_mac afterwards yields the number of unique src_addr values.
srx_src_ip_count = (
    df_completed_session
    .select('chassis_mac', 'src_addr')
    .distinct()
    .groupBy('chassis_mac')
    .count()
    .collect()
)

for row in srx_src_ip_count:
    print('srx_mac {}: src_addr_count {}'.format(row['chassis_mac'], row['count']))

srx_mac 5800bba79000: src_addr_count 5
srx_mac d8b122d38781: src_addr_count 7
srx_mac d0dd49ebba7a: src_addr_count 23
srx_mac fc334262af00: src_addr_count 4
srx_mac 4c96144a3200: src_addr_count 1
srx_mac 4c96144ff300: src_addr_count 1
srx_mac 30b64f25ce40: src_addr_count 20
srx_mac fc33426df201: src_addr_count 12
srx_mac 4c9614067d00: src_addr_count 1
srx_mac f07cc745e489: src_addr_count 75
srx_mac fc334263c380: src_addr_count 80860
srx_mac ec13dbdcd600: src_addr_count 41
srx_mac c8e7f0c878e9: src_addr_count 16
srx_mac d8b122ae8b00: src_addr_count 6
srx_mac 4c9614b2b000: src_addr_count 1
srx_mac 4c9614abae00: src_addr_count 1
srx_mac fc3342640600: src_addr_count 36
srx_mac 0c81262d7e84: src_addr_count 456
srx_mac 4c961411b900: src_addr_count 1
srx_mac 94f7ad22c581: src_addr_count 261
srx_mac 5800bbdca0bc: src_addr_count 3
srx_mac 5800bbdc5fbc: src_addr_count 4
srx_mac 4c9614334100: src_addr_count 1
srx_mac 0c81262dfa04: src_addr_count 2
srx_mac 2c2131521680: src_addr_count 483
srx_mac 

In [78]:

# For each (org_id, site_id, normalized_app): average bytes in each direction,
# approximate number of distinct source addresses, number of sessions and
# average session duration.
from pyspark.sql import functions as F, Window

srx_app_stats_df = (
    df_completed_session
    # Keep only the columns needed by the grouping keys and aggregates below.
    .select('org_id',
            'site_id',
            'bytes_from_client',
            'bytes_from_server',
            'src_addr',
            'normalized_app',
            'session_total_time')
    # Drop the applications listed in APP_NAMES_EXCLUDE_LIST. Rows whose
    # normalized_app is NULL are dropped too, because the predicate evaluates
    # to NULL for them.
    .filter(~F.col('normalized_app').isin(APP_NAMES_EXCLUDE_LIST))
    .groupby(['org_id', 'site_id', 'normalized_app'])
    .agg(F.avg('bytes_from_client').alias('avg_bytes_from_client'),
         F.avg('bytes_from_server').alias('avg_bytes_from_server'),
         # approx_count_distinct is the supported name of the deprecated
         # approxCountDistinct; the result is an estimate, not an exact count.
         F.approx_count_distinct('src_addr').alias('src_addr_count'),
         # Counts the rows of the group whose site_id is not NULL.
         F.count('site_id').alias('session_count'),
         F.avg('session_total_time').alias('avg_session_time'))
)

# Spot-check the per-application statistics for one sample site.
srx_app_stats_df.filter('org_id == "fe0ea1e2-2d95-43f2-9c6b-49d57eb01363" AND site_id == "494a84e3-5ca8-4ff6-ba66-07a4f59b8ec0"')\
.select('normalized_app',
        'avg_bytes_from_client',
        'avg_bytes_from_server',
        'src_addr_count',
        'session_count',
        'avg_session_time').show()


+--------------------+---------------------+---------------------+--------------+-------------+------------------+
|      normalized_app|avg_bytes_from_client|avg_bytes_from_server|src_addr_count|session_count|  avg_session_time|
+--------------------+---------------------+---------------------+--------------+-------------+------------------+
|              SMAATO|               2914.0|     8789.42857142857|             3|            7|54.857142857142854|
|       GOOGLE-UPDATE|    3149.076923076923|   163566.92307692306|             4|           13|152.15384615384616|
|    GOOGLE-APPENGINE|   1954.7777777777778|   3713.1111111111113|             3|            9|19.666666666666668|
|                STUN|                120.0|                 60.0|             1|            2|              66.0|
|     FACEBOOK-ACCESS|     21628.5105377907|   1018894.4672965116|            10|         2752| 88.97093023255815|
|               IMGUR|   2500.4285714285716|    8190.714285714285|             3

In [79]:

# Per-site population standard deviation ("sd_*") and mean ("av_*") of every
# per-application metric in srx_app_stats_df.
# Each pair is (input column, suffix of the sd_/av_ output columns).
site_metric_columns = [
    ('avg_bytes_from_client', 'bytes_from_client'),
    ('avg_bytes_from_server', 'bytes_from_server'),
    ('src_addr_count',        'src_addr_count'),
    ('session_count',         'session_count'),
    ('avg_session_time',      'session_time'),
]

site_metric_aggs = []
for source_col, suffix in site_metric_columns:
    site_metric_aggs.append(F.stddev_pop(source_col).alias('sd_' + suffix))
    site_metric_aggs.append(F.avg(source_col).alias('av_' + suffix))

# One row per (org_id, site_id); cached because it is joined back onto the
# per-application statistics later.
stats = (
    srx_app_stats_df
    .groupby(['org_id', 'site_id'])
    .agg(*site_metric_aggs)
    .persist()
)

# Spot-check the site-level statistics for one sample site.
sample_site_stats = stats.filter('org_id == "fe0ea1e2-2d95-43f2-9c6b-49d57eb01363" AND site_id == "494a84e3-5ca8-4ff6-ba66-07a4f59b8ec0"')
sample_site_stats.select(
    'sd_bytes_from_client', 'av_bytes_from_client',
    'sd_bytes_from_server', 'av_bytes_from_server',
    'sd_src_addr_count',    'av_src_addr_count',
    'sd_session_count',     'av_session_count',
    'sd_session_time',      'av_session_time',
).show()




+--------------------+--------------------+--------------------+--------------------+-----------------+------------------+-----------------+-----------------+-----------------+------------------+
|sd_bytes_from_client|av_bytes_from_client|sd_bytes_from_server|av_bytes_from_server|sd_src_addr_count| av_src_addr_count| sd_session_count| av_session_count|  sd_session_time|   av_session_time|
+--------------------+--------------------+--------------------+--------------------+-----------------+------------------+-----------------+-----------------+-----------------+------------------+
|  1723460.2444405905|  197202.67735670044| 4.043639221119496E7|  2924593.7478275336|2.729066889941449|3.3656387665198237|907.8361653378562|263.8546255506608|1085.602558241837|209.12962997140698|
+--------------------+--------------------+--------------------+--------------------+-----------------+------------------+-----------------+-----------------+-----------------+------------------+

In [80]:
def _z_score(value_col, mean_col, sd_col):
    """Build the z-score Column ``(value - mean) / sd`` for one metric.

    Args:
        value_col: name of the per-application metric column.
        mean_col: name of the per-site mean column for that metric.
        sd_col: name of the per-site population standard deviation column.

    Returns:
        A Column holding the z-score. When the standard deviation is exactly 0
        (every application of the site has the same value), the z-score is
        defined as 0.0 instead of the NULL that a division by zero would give,
        so that one flat metric does not turn the whole average into NULL.
    """
    deviation = F.col(value_col) - F.col(mean_col)
    return F.when(F.col(sd_col) == 0, F.lit(0.0)).otherwise(deviation / F.col(sd_col))


# The five per-metric z-score columns that are averaged into avg_z_score.
marksColumns = [F.col('z_score_bfc'),
                F.col('z_score_bfs'),
                F.col('z_score_sac'),
                F.col('z_score_sc'),
                F.col('z_score_st')]

# Attach the site-level mean/sd to every per-application row (stats has one row
# per site, hence the broadcast hint), turn each metric into a z-score and
# average the five z-scores with equal weight.
final_df = (
    srx_app_stats_df
    .join(F.broadcast(stats), ['org_id', 'site_id'])
    .select('org_id', 'site_id', 'normalized_app',
            _z_score('avg_bytes_from_client', 'av_bytes_from_client', 'sd_bytes_from_client').alias('z_score_bfc'),
            _z_score('avg_bytes_from_server', 'av_bytes_from_server', 'sd_bytes_from_server').alias('z_score_bfs'),
            _z_score('src_addr_count', 'av_src_addr_count', 'sd_src_addr_count').alias('z_score_sac'),
            _z_score('session_count', 'av_session_count', 'sd_session_count').alias('z_score_sc'),
            _z_score('avg_session_time', 'av_session_time', 'sd_session_time').alias('z_score_st'))
    # Python's built-in sum() starts from 0 and adds the Column objects one by one.
    .withColumn('avg_z_score', sum(marksColumns) / len(marksColumns))
)

# Spot-check: applications of one sample site, highest average z-score first.
final_df.filter('org_id == "fe0ea1e2-2d95-43f2-9c6b-49d57eb01363" AND site_id == "494a84e3-5ca8-4ff6-ba66-07a4f59b8ec0"')\
.select('normalized_app', 'avg_z_score').orderBy('avg_z_score', ascending=False).show()




+--------------------+------------------+
|      normalized_app|       avg_z_score|
+--------------------+------------------+
|      IOS-OTA-UPDATE| 4.235249694769532|
|              JABBER|2.7054233272625883|
|          GOOGLE-GEN| 2.553222440710667|
|                 GCS|2.4845102518729716|
|            SNAPCHAT|2.0211078936998845|
|      APPLE-FACETIME| 1.208462645539633|
|              AMAZON|1.1429587256747988|
|     FACEBOOK-ACCESS| 0.982411468039831|
|          APPLE-PUSH|0.8130407936823831|
|              ICLOUD|0.7913855484232588|
|      NETFLIX-STREAM|0.7723771899915169|
|       GOOGLE-STATIC|0.7560705454096723|
|APPLE-IOS-UPDATE-SSL|0.7485653762810897|
|           ICMP-ECHO|0.7445356297950931|
|          GOOGLE-ADS|0.7205847619268234|
|             YOUTUBE|0.7052678192870554|
|          GOOGLE-API|0.5965251453690019|
|           MICROSOFT|0.5789933719637672|
|             OUTLOOK|0.5585192748777474|
|           INSTAGRAM|0.5398477193113699|
+--------------------+------------

In [81]:
# Order the applications of each (org_id, site_id) by descending avg_z_score.
window_def = Window.partitionBy('org_id', 'site_id').orderBy(F.col('avg_z_score').desc())

# Keep the rows ranked 1..10 per site. rank() assigns the same rank to tied
# scores, so a site can contribute more than 10 rows.
site_top_app_df = (
    final_df
    .withColumn('rank', F.rank().over(window_def))
    .where('rank <= 10')
    .persist()
)

site_top_app_df.select('site_id', 'normalized_app', 'avg_z_score', 'rank').show(80)
site_top_app_df.count()


+--------------------+--------------------+--------------------+----+
|             site_id|      normalized_app|         avg_z_score|rank|
+--------------------+--------------------+--------------------+----+
|99a28ff4-3d06-476...|       junos-ike-nat|   4.378464715487214|   1|
|99a28ff4-3d06-476...|           ICMP-ECHO|  2.4790751275470058|   2|
|99a28ff4-3d06-476...|                 IKE|  1.7765438281834858|   3|
|99a28ff4-3d06-476...|                 GCM|   1.374975357633944|   4|
|99a28ff4-3d06-476...|          GOOGLE-GEN|  0.7020221742244768|   5|
|99a28ff4-3d06-476...|       GOOGLE-STATIC|  0.6929262199829581|   6|
|99a28ff4-3d06-476...|          GOOGLE-API|  0.6496831386722877|   7|
|99a28ff4-3d06-476...|          APPLE-PUSH|  0.5031231178087767|   8|
|99a28ff4-3d06-476...|             YOUTUBE|  0.4131054575365168|   9|
|99a28ff4-3d06-476...|ANDROID-MARKETPLA...|  0.3763857932229509|  10|
|33584048-90e4-40d...|          APPLE-PUSH|  0.8025754386006965|   1|
|33584048-90e4-40d..