In [96]:
# Application names that are filtered out before the top-application statistics
# are computed. Kept as a list because it is handed to Column.isin().
APP_NAMES_EXCLUDE_LIST = [
    'None',
    'SSH',
    'SSL',
    'HTTP',
    'DNS',
    'HTTP2',
    'junos-https',
    'junos-http',
]

In [97]:
# Read every hr=* partition under dt=2020-10-28 of the SRX flow-event parquet
# data set, then show the total number of rows.
flow_events_path = (
    's3://mist-secorapp-production/srx-flow-events-analytics/'
    'srx-flow-events-analytics-production/dt=2020-10-28/hr=*/*'
)
df = spark.read.parquet(flow_events_path)
df.count()




22492243

In [98]:
# Keep only rows with event_type 2 (treated as completed sessions, going by the
# variable name -- TODO confirm against the event schema). The result is
# persisted because the cells below query it repeatedly.
df_completed_session = (
    df
    .where('event_type == 2')
    .persist()
)
df_completed_session.count()

14175454

In [99]:
# Print the column names, types and nullability of the completed-session frame.
df_completed_session.printSchema()

root
 |-- chassis_mac: string (nullable = true)
 |-- org_id: string (nullable = true)
 |-- site_id: string (nullable = true)
 |-- srxipaddress: string (nullable = true)
 |-- event_type: integer (nullable = true)
 |-- event_occur_time: long (nullable = true)
 |-- flow_session_id: integer (nullable = true)
 |-- src_addr: string (nullable = true)
 |-- dst_addr: string (nullable = true)
 |-- src_port: integer (nullable = true)
 |-- dst_port: integer (nullable = true)
 |-- protocol_id: integer (nullable = true)
 |-- session_close_reason: string (nullable = true)
 |-- session_total_time: integer (nullable = true)
 |-- secuirty_policy_name: string (nullable = true)
 |-- src_zone_name: string (nullable = true)
 |-- dst_zone_name: string (nullable = true)
 |-- out_interface_name: string (nullable = true)
 |-- routing_instance: string (nullable = true)
 |-- pkts_from_client: long (nullable = true)
 |-- bytes_from_client: long (nullable = true)
 |-- pkts_from_server: long (nullable = true)
 |-- b

In [100]:
# Number of completed-session rows per SRX box (one output row per chassis_mac).
(
    df_completed_session
    .groupBy('chassis_mac')
    .count()
    .show()
)


+------------+------+
| chassis_mac| count|
+------------+------+
|5800bba79000| 25863|
|d8b122d38781| 14325|
|d0dd49ebba7a| 18588|
|fc334262af00|  3850|
|4c96144a3200| 20086|
|4c96144ff300|   422|
|30b64f25ce40|129836|
|4c9614067d00| 21168|
|fc33426df201| 29218|
|f07cc745e489|133302|
|c8e7f0c878e9| 63213|
|fc334263c380|871947|
|ec13dbdcd600|   554|
|d8b122ae8b00|  2019|
|4c9614b2b000| 21453|
|4c9614abae00| 21474|
|fc3342640600|448083|
|0c81262d7e84| 83984|
|4c961411b900|284517|
|94f7ad22c581|  2909|
+------------+------+
only showing top 20 rows

In [101]:
# Number of distinct source addresses seen by each SRX box: de-duplicate the
# (chassis_mac, src_addr) pairs first, then count the pairs per chassis.
unique_src_pairs = df_completed_session.select('chassis_mac', 'src_addr').distinct()
unique_src_pairs.groupBy('chassis_mac').count().show()


+------------+-----+
| chassis_mac|count|
+------------+-----+
|5800bba79000|    5|
|d8b122d38781|    7|
|d0dd49ebba7a|   23|
|fc334262af00|    4|
|4c96144a3200|    1|
|4c96144ff300|    1|
|30b64f25ce40|   20|
|fc33426df201|   12|
|4c9614067d00|    1|
|f07cc745e489|   75|
|fc334263c380|80860|
|ec13dbdcd600|   41|
|c8e7f0c878e9|   16|
|d8b122ae8b00|    6|
|4c9614b2b000|    1|
|4c9614abae00|    1|
|0c81262d7e84|  456|
|fc3342640600|   36|
|4c961411b900|    1|
|94f7ad22c581|  261|
+------------+-----+
only showing top 20 rows

In [102]:

from pyspark.sql import functions as F, Window

# Per-site application statistics.
#
# Applications named in APP_NAMES_EXCLUDE_LIST are dropped first. The features
# used to determine the top applications are computed for every
# (org_id, site_id, normalized_app) group:
#     average 'bytes_from_client',
#     average 'bytes_from_server',
#     approximate number of unique src_addr,
#     number of sessions,
#     average 'session_total_time'

srx_app_stats_df = (
    df_completed_session
    # Only the grouping keys and the aggregated columns are needed.
    .select('org_id',
            'site_id',
            'bytes_from_client',
            'bytes_from_server',
            'src_addr',
            'normalized_app',
            'session_total_time')
    .filter(~F.col('normalized_app').isin(APP_NAMES_EXCLUDE_LIST))
    .groupby(['org_id', 'site_id', 'normalized_app'])
    .agg(F.avg('bytes_from_client').alias('avg_bytes_from_client'),
         F.avg('bytes_from_server').alias('avg_bytes_from_server'),
         # approx_count_distinct is the supported name; approxCountDistinct
         # has been deprecated since Spark 2.1.
         F.approx_count_distinct('src_addr').alias('src_addr_count'),
         F.count('site_id').alias('session_count'),
         F.avg('session_total_time').alias('avg_session_time'))
)

# Sanity check: show the per-application numbers for one site.
srx_app_stats_df.filter('site_id == "494a84e3-5ca8-4ff6-ba66-07a4f59b8ec0"')\
.select('normalized_app',
        'avg_bytes_from_client',
        'avg_bytes_from_server',
        'src_addr_count',
        'session_count',
        'avg_session_time').show()


+--------------------+---------------------+---------------------+--------------+-------------+------------------+
|      normalized_app|avg_bytes_from_client|avg_bytes_from_server|src_addr_count|session_count|  avg_session_time|
+--------------------+---------------------+---------------------+--------------+-------------+------------------+
|              SMAATO|               2914.0|     8789.42857142857|             3|            7|54.857142857142854|
|    GOOGLE-APPENGINE|   1954.7777777777778|   3713.1111111111113|             3|            9|19.666666666666668|
|       GOOGLE-UPDATE|    3149.076923076923|   163566.92307692306|             4|           13|152.15384615384616|
|                STUN|                120.0|                 60.0|             1|            2|              66.0|
|     FACEBOOK-ACCESS|     21628.5105377907|   1018894.4672965116|            10|         2752| 88.97093023255815|
|               IMGUR|   2500.4285714285716|    8190.714285714285|             3

In [103]:

# Per-site population standard deviation ("sd_") and mean ("av_") of every
# per-application feature. Each pair is (source column in srx_app_stats_df,
# suffix used for the two output columns).
_stat_sources = [
    ('avg_bytes_from_client', 'bytes_from_client'),
    ('avg_bytes_from_server', 'bytes_from_server'),
    ('src_addr_count', 'src_addr_count'),
    ('session_count', 'session_count'),
    ('avg_session_time', 'session_time'),
]

# Emit sd_<suffix> followed by av_<suffix> for each feature, in list order.
_stat_exprs = []
for _source_col, _suffix in _stat_sources:
    _stat_exprs.append(F.stddev_pop(_source_col).alias('sd_' + _suffix))
    _stat_exprs.append(F.avg(_source_col).alias('av_' + _suffix))

stats = (
    srx_app_stats_df
    .groupby(['org_id', 'site_id'])
    .agg(*_stat_exprs)
    .persist()
)

# Sanity check: show the site-level statistics for one site.
_stat_columns = [_prefix + _suffix
                 for _, _suffix in _stat_sources
                 for _prefix in ('sd_', 'av_')]
stats.filter('site_id == "494a84e3-5ca8-4ff6-ba66-07a4f59b8ec0"')\
.select(*_stat_columns).show()


+--------------------+--------------------+--------------------+--------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+
|sd_bytes_from_client|av_bytes_from_client|sd_bytes_from_server|av_bytes_from_server| sd_src_addr_count| av_src_addr_count| sd_session_count| av_session_count|   sd_session_time|   av_session_time|
+--------------------+--------------------+--------------------+--------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+
|  1719725.8440579223|  196338.65627567435| 4.034808129074061E7|  2911766.8804662726|2.8896952375142484|3.4298245614035086|906.4338496927851|266.0263157894737|1083.2623148458665|208.48841583260298|
+--------------------+--------------------+--------------------+--------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+

In [104]:
# Per-feature z-score columns that are averaged into the overall score.
marksColumns = [F.col('z_score_bfc'),
                F.col('z_score_bfs'),
                F.col('z_score_sac'),
                F.col('z_score_sc'),
                F.col('z_score_st')]


def _z_score(value_col, mean_col, sd_col, alias):
    """Build the z-score Column ``(value - mean) / sd`` named ``alias``.

    When the standard deviation is exactly zero the feature is constant within
    the site, so the z-score is defined as 0.0 instead of dividing by zero
    (which yields NULL in Spark, or an error in ANSI mode). A NULL in any
    input still produces NULL.

    Args:
        value_col: name of the per-application feature column.
        mean_col: name of the per-site mean column for that feature.
        sd_col: name of the per-site standard-deviation column for that feature.
        alias: name of the resulting z-score column.

    Returns:
        A pyspark Column holding the z-score.
    """
    spread = F.col(sd_col)
    return (
        F.when(spread == 0, F.lit(0.0))
        .otherwise((F.col(value_col) - F.col(mean_col)) / spread)
        .alias(alias)
    )


# Attach the per-site statistics to every application row (stats has one row
# per site, so it is broadcast), turn each feature into a z-score and average
# the five z-scores into avg_z_score.
final_df = (
    srx_app_stats_df
    .join(F.broadcast(stats), ['org_id', 'site_id'])
    .select('org_id', 'site_id', 'normalized_app',
            _z_score('avg_bytes_from_client', 'av_bytes_from_client', 'sd_bytes_from_client', 'z_score_bfc'),
            _z_score('avg_bytes_from_server', 'av_bytes_from_server', 'sd_bytes_from_server', 'z_score_bfs'),
            _z_score('src_addr_count', 'av_src_addr_count', 'sd_src_addr_count', 'z_score_sac'),
            _z_score('session_count', 'av_session_count', 'sd_session_count', 'z_score_sc'),
            _z_score('avg_session_time', 'av_session_time', 'sd_session_time', 'z_score_st'))
    # Python's built-in sum() starts from 0 and adds the Columns one by one.
    .withColumn('avg_z_score', sum(marksColumns) / len(marksColumns))
)

# Sanity check: applications of one site, highest average z-score first.
final_df.filter('site_id == "494a84e3-5ca8-4ff6-ba66-07a4f59b8ec0"')\
.select('normalized_app', 'avg_z_score').orderBy('avg_z_score', ascending=False).show()




+--------------------+------------------+
|      normalized_app|       avg_z_score|
+--------------------+------------------+
|      IOS-OTA-UPDATE| 4.245675531881504|
|              JABBER| 2.712439512660281|
|          GOOGLE-GEN|2.5079746580814026|
|                 GCS|2.4783822182366024|
|            SNAPCHAT|2.0085301277087053|
|      APPLE-FACETIME| 1.199483361872486|
|              AMAZON|1.0913469132307998|
|                 NTP|1.0530788686600319|
|     FACEBOOK-ACCESS| 0.951481474473398|
|          APPLE-PUSH|0.7863230666548091|
|              ICLOUD|0.7643415116367162|
|      NETFLIX-STREAM|0.7412485191145033|
|APPLE-IOS-UPDATE-SSL|0.7214599005420484|
|       GOOGLE-STATIC|0.7206061388829973|
|           ICMP-ECHO|0.7090377857689087|
|          GOOGLE-ADS|0.6766952307568638|
|             YOUTUBE|0.6739164433051222|
|          GOOGLE-API|0.5650104783934278|
|             OUTLOOK|0.5478654785332253|
|           MICROSOFT|0.5474332665872911|
+--------------------+------------

In [105]:
# Rank the applications of every (org_id, site_id) by descending avg_z_score
# and keep ranks 1-10. rank() gives tied scores the same rank, so a site can
# contribute more than 10 rows when scores tie.
site_partition = Window.partitionBy('org_id', 'site_id')
window_def = site_partition.orderBy(F.desc('avg_z_score'))

ranked_apps = final_df.withColumn('rank', F.rank().over(window_def))
site_top_app_df = ranked_apps.filter('rank <= 10').persist()

site_top_app_df.select('site_id', 'normalized_app', 'avg_z_score', 'rank').show(80)
site_top_app_df.count()


+--------------------+--------------------+--------------------+----+
|             site_id|      normalized_app|         avg_z_score|rank|
+--------------------+--------------------+--------------------+----+
|99a28ff4-3d06-476...|       junos-ike-nat|   4.400441765963572|   1|
|99a28ff4-3d06-476...|           ICMP-ECHO|  2.4381338126747516|   2|
|99a28ff4-3d06-476...|                 IKE|  1.7869239025994872|   3|
|99a28ff4-3d06-476...|                 GCM|  1.3016769885914887|   4|
|99a28ff4-3d06-476...|                 NTP|  0.8310101203469655|   5|
|99a28ff4-3d06-476...|          GOOGLE-GEN|  0.6258209615651878|   6|
|99a28ff4-3d06-476...|       GOOGLE-STATIC|  0.6166792081824857|   7|
|99a28ff4-3d06-476...|          GOOGLE-API|  0.5732488612357116|   8|
|99a28ff4-3d06-476...|          APPLE-PUSH|  0.4670239675965691|   9|
|99a28ff4-3d06-476...|             YOUTUBE|  0.3629638861976596|  10|
|33584048-90e4-40d...|     FACEBOOK-ACCESS|  0.5427626378035869|   1|
|33584048-90e4-40d..