## 5.0. Used Libraries

In [2]:
import pandas as pd #more info at http://pandas.pydata.org/
import numpy as np #more info at http://www.numpy.org/
import matplotlib.pyplot as plt #some examples for you at http://matplotlib.org/gallery.html 
from matplotlib import gridspec #more info at http://matplotlib.org/api/gridspec_api.html
import matplotlib.dates as dt
from matplotlib.patches import Rectangle
from matplotlib import ticker

from matplotlib_venn import *
%matplotlib inline 

import seaborn as sns
plt.style.use('ggplot') #For improving the visualization style (options: grayscale, bmh, dark_background, ggplot, and fivethirtyeight)
# plt.style.use('seaborn-muted')

import re

import warnings
warnings.filterwarnings('ignore') #To avoid showing annoying warns

from IPython.display import display

## 5.1. Attacks per day 

In [None]:
print('1. Attacks per day')

In [None]:
# Count the attack records per calendar month and summarise those counts.
# NOTE(review): the summary names say "perday" although pd.Grouper(freq='M')
# bins by month; the names are kept because other cells may rely on them.
if len(df_attacks)>0:
    attack_timeseries=df_attacks.set_index(['date']).groupby(pd.Grouper(freq='M')).agg(['count'])['targetip']
    attack_mean_perday=attack_timeseries.mean()
    attack_median_perday=attack_timeseries.median()
else:
    # Empty placeholder; the plotting cell only tests len(attack_timeseries).
    attack_timeseries=""
    # Define the summary statistics on this path too, so that a later
    # reference does not fail with NameError when there are no attacks.
    attack_mean_perday=np.nan
    attack_median_perday=np.nan

In [None]:
# Bar chart of the attack time series, exported to figs/attacks_day.eps.
if len(attack_timeseries)==0:
    print("Unfortunately, there is no data available!\n")
else:
    fig = plt.figure(figsize=(10,4))
    ax1 = plt.subplot2grid((1,1), (0,0), rowspan=2)

    attack_timeseries.plot(kind='bar', ax=ax1, legend=False)

    # Axis labels
    ax1.set_xlabel("Time (bin=month)")
    ax1.set_ylabel("# attacks")

    fig.savefig('figs/attacks_day.eps', format='eps', dpi=1200, bbox_inches='tight')

## 5.2. Attacks performed by users

In [None]:
print('2. Number of attacks performed by users')

In [None]:
# Number of attack records per username ...
user_attacks = df_attacks['username'].value_counts()
# ... and, for every such attack count, how many usernames have it
# (ordered by the attack count itself).
freq_user_attacks = (
    user_attacks
    .value_counts()
    .sort_index()
)

In [None]:
# One stacked bar: each segment is an attack count, its height the number of
# usernames with that count. Exported to figs/user_attacks.eps.
if len(user_attacks)==0:
    print("Unfortunately, there is no data available!\n")
else:
    fig = plt.figure(figsize=(1, 4))
    fig.subplots_adjust(wspace=0.3, hspace=0.5)
    ax2 = plt.subplot2grid((1,1), (0,0))

    # Twin y-axis whose tick labels are formatted as percentages.
    ax1 = ax2.twinx()
    ax1.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda frac, _pos: '{:.0%}'.format(frac))
    )

    # Transposing the one-column frame makes every index value its own
    # column, so the whole distribution is drawn as a single stacked bar.
    freq_user_attacks.to_frame().T.plot(
        ax=ax2, kind='bar', stacked=True, legend=True, title="", fontsize=12
    ).set_zorder(1)
    ax2.set_xlabel("")
    ax2.set_ylabel("# distinct user (attackers)")
    ax2.set_ylim(0, len(user_attacks))

    # Legend restricted to the first `legend_show_top` segments.
    legend_show_top = 10
    labels = list(freq_user_attacks.index.values)
    ax2.legend(
        ax2.patches[:legend_show_top],
        labels[:legend_show_top],
        bbox_to_anchor=(2.9, 1.05),
        fontsize=11,
    )

    plt.setp(ax2.get_xticklabels(), visible=False)

    fig.show()
    fig.savefig('figs/user_attacks.eps', format='eps', dpi=1200, bbox_inches='tight')

## 5.3 Attacks on the same target

In [None]:
print('3. Attacks on the same target')

In [None]:
# Number of attack records per target IP ...
num_attacks_on_sametarget = df_attacks['targetip'].value_counts()
# ... and how many targets share each attack count (most common count first,
# which is the default ordering of value_counts).
freq_num_attacks_on_sametarget = num_attacks_on_sametarget.value_counts()

In [None]:
# One stacked bar: each segment is an attack count, its height the number of
# target IPs hit that many times. Exported to figs/attacks_same_target.eps.
if len(num_attacks_on_sametarget)==0:
    print("Unfortunately, there is no data available!\n")
else:
    fig = plt.figure(figsize=(1, 4))
    fig.subplots_adjust(wspace=0.3, hspace=0.5)
    ax2 = plt.subplot2grid((1,1), (0,0))

    # Twin y-axis whose tick labels are formatted as percentages.
    ax1 = ax2.twinx()
    ax1.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda frac, _pos: '{:.0%}'.format(frac))
    )

    # Transposed one-column frame -> a single stacked bar.
    freq_num_attacks_on_sametarget.to_frame().T.plot(
        ax=ax2, kind='bar', stacked=True, legend=True, title="", fontsize=12
    ).set_zorder(1)
    ax2.set_xlabel("")
    ax2.set_ylabel("# distinct target")
    ax2.set_ylim(0, len(num_attacks_on_sametarget))

    # Legend restricted to the first `legend_show_top` segments.
    legend_show_top = 10
    labels = list(freq_num_attacks_on_sametarget.index.values)
    ax2.legend(
        ax2.patches[:legend_show_top],
        labels[:legend_show_top],
        bbox_to_anchor=(2.9, 1.05),
        fontsize=11,
    )

    plt.setp(ax2.get_xticklabels(), visible=False)

    fig.show()
    fig.savefig('figs/attacks_same_target.eps', format='eps', dpi=1200, bbox_inches='tight')

## 5.4. Intersection between users, customers, and attackers (venn diagram)

In [None]:
print('4. Intersection between users, customers and attackers.')

In [None]:
# Distinct (non-null) userids seen in each table.
users_set = set(df_users['userid'].dropna().unique())
customers_set = set(df_payments['userid'].dropna().unique())
attackers_set = set(df_attacks['userid'].dropna().unique())

# Pairwise overlaps, stored as Series.
intersec_customers_attacker = pd.Series(list(customers_set & attackers_set))
intersec_users_customers = pd.Series(list(users_set & customers_set))
intersec_users_attackers = pd.Series(list(users_set & attackers_set))

# Userids present in all three tables.
intersec_users_customers_attackers = pd.Series(list(users_set & customers_set & attackers_set))

In [None]:
# Venn diagram of the userid sets of users, customers and attackers,
# exported to figs/user_customers_attackers.eps.
fig = plt.figure(figsize=(8,8))
ax = plt.subplot2grid((1,1), (0,0))

# Region keys are bit strings in (Users, Customers, Attackers) order; each
# value is the number of userids lying in exactly that region. The sizes are
# taken directly from set algebra, which yields the same numbers as the
# inclusion-exclusion arithmetic on the intersection lengths.
venn = venn3(
    ax=ax,
    subsets={
        '100': len(users_set - customers_set - attackers_set),
        '010': len(customers_set - users_set - attackers_set),
        '001': len(attackers_set - users_set - customers_set),
        '110': len((users_set & customers_set) - attackers_set),
        '101': len((users_set & attackers_set) - customers_set),
        '011': len((customers_set & attackers_set) - users_set),
        '111': len(users_set & customers_set & attackers_set),
    },
    set_labels=('Users', 'Customers','Attackers'),
    alpha=1,
)
fig.show()
fig.savefig('figs/user_customers_attackers.eps', format='eps', dpi=1200)

## 5.5. Distribution of login times per user

In [None]:
print('5. User login times')

In [None]:
# Number of login records per userid ...
num_distinct_logins_per_user = df_logins['userid'].value_counts()
# ... and how many userids share each login count (ordered by login count).
freq_distinct_logins_per_user = (
    num_distinct_logins_per_user
    .value_counts()
    .sort_index()
)

In [None]:
# One stacked bar: each segment is a login count, its height the number of
# userids with that count. Exported to figs/user_login_times.eps.
if len(num_distinct_logins_per_user)==0:
    print("There is NO data available for plotting 'User login times'")
else:
    fig = plt.figure(figsize=(1, 4))
    fig.subplots_adjust(wspace=0.3, hspace=0.5)
    ax2 = plt.subplot2grid((1,1), (0,0))

    # Twin y-axis whose tick labels are formatted as percentages.
    ax1 = ax2.twinx()
    ax1.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda frac, _pos: '{:.0%}'.format(frac))
    )

    # Transposed one-column frame -> a single stacked bar.
    freq_distinct_logins_per_user.to_frame().T.plot(
        ax=ax2, kind='bar', stacked=True, legend=True, title="", fontsize=12
    ).set_zorder(1)
    ax2.set_xlabel("")
    ax2.set_ylabel("# distinct users")
    ax2.set_ylim(0, len(num_distinct_logins_per_user))

    # Move the automatically generated legend outside the narrow plot.
    ax2.legend(fontsize=11, bbox_to_anchor=(2.9, 1.05))

    plt.setp(ax2.get_xticklabels(), visible=False)

    fig.suptitle('')
    fig.show()

    fig.savefig('figs/user_login_times.eps', format='eps', dpi=1200, bbox_inches='tight')

## 5.6. The time difference between an attack and the closest (previous) login of a user

In [None]:
def nearestDate(base_date, date_list):
    """Return the latest date in ``date_list`` that is not after ``base_date``.

    Dates are compared through their ``.timestamp()`` value, so ``base_date``
    and the items of ``date_list`` must provide that method (for example
    ``datetime.datetime`` or ``pandas.Timestamp``).

    Parameters
    ----------
    base_date : datetime-like
        Reference moment.
    date_list : iterable of datetime-like
        Candidate dates. Missing entries (NaT / NaN / None) are skipped
        instead of raising.

    Returns
    -------
    datetime-like or float
        The candidate with the smallest non-negative distance to
        ``base_date`` (the last such candidate on ties), or ``np.nan`` when
        ``base_date`` is missing or no candidate lies at or before it.
    """
    # A missing reference date cannot be compared (NaT has no timestamp).
    if pd.isna(base_date):
        return np.nan

    base_ts = base_date.timestamp()
    closest = np.nan
    closest_gap = None
    for date in date_list:
        if pd.isna(date):
            continue
        gap = base_ts - date.timestamp()
        # '<=' lets a later candidate with the same gap win, which matches the
        # overwrite behaviour of the dictionary this loop replaces.
        if gap >= 0 and (closest_gap is None or gap <= closest_gap):
            closest, closest_gap = date, gap
    return closest

In [None]:
# Column holding, for every attack, the attacking user's closest previous
# login. It starts as all-NaT with a datetime dtype and is created once
# (the initialisation used to be repeated inside the `if` below).
df_attacks['nearestlogin']=pd.Series(pd.NaT, index=df_attacks.index, dtype='datetime64[ns]')

if len(df_attacks)>0 and len(df_logins)>0:
    # When was the last login of the user that performed attacks.
    # The login dates are grouped per userid once, so df_logins does not have
    # to be filtered again for every single attack row.
    logins_by_user={userid: dates for userid, dates in df_logins.groupby('userid')['date']}

    for index, row in df_attacks.iterrows():
        # Users without any login record get an empty candidate list, for
        # which nearestDate returns NaN (stored as NaT in this column).
        nearestlogindate= nearestDate(row['date'], logins_by_user.get(row['userid'], ()))
        df_attacks.at[index, 'nearestlogin'] = nearestlogindate

In [None]:
# Elapsed time between each attack and the attacker's closest previous login;
# NaT where no such login was found.
time_difference_login_attack = df_attacks['date'].sub(df_attacks['nearestlogin'])

In [None]:
# Report how many attacks could be paired with a login, plus the mean and
# median of the attack-to-login time difference over those pairs.
print(
    '\n\n6. From {} total attack records, {} have a login time to be correlated with. '
    'The difference between the attack and the closest login time is on average {}'
    ', while the median is equal to {}'.format(
        str(len(time_difference_login_attack)),
        str(len(time_difference_login_attack.dropna())),
        str(time_difference_login_attack.dropna().mean()),
        str(time_difference_login_attack.dropna().median()),
    )
)

## 5.7 Users that login via Tor

In [None]:
# Occurrences of every login IP flagged as Tor, and the number of such IPs.
# NOTE(review): this counts distinct 'userip' values, while the message
# printed afterwards speaks of distinct users - confirm which one is intended.
tor_users = df_logins_torcheck.loc[df_logins_torcheck['tor']==True, 'userip'].value_counts()
num_tor_users = len(tor_users)

In [None]:
print('\n\n7. The number of distinct users that access this Booter via TOR is equal to '+str(num_tor_users)+'\n\n')

## 5.8. Distribution of IP addresses per user

In [None]:
print('8. User distinct IP addresses')

In [None]:
# One row per distinct (userid, userip) pair, then count the rows per userid:
# the number of distinct IPs each user logged in from.
num_distinct_ips_per_user = (
    df_logins
    .groupby(['userid','userip'])
    .size()
    .reset_index()['userid']
    .value_counts()
)
# How many users share each distinct-IP count (ordered by that count).
freq_distinct_ips_per_user = (
    num_distinct_ips_per_user
    .value_counts()
    .sort_index()
)

In [None]:
# One stacked bar: each segment is a distinct-IP count, its height the number
# of users with that count. Exported to figs/user_distinct_ips.eps.
if len(num_distinct_ips_per_user)==0:
    print("There is NO data available for plotting 'User distinct IP addresses'")
else:
    fig = plt.figure(figsize=(1, 4))
    fig.subplots_adjust(wspace=0.3, hspace=0.5)
    ax2 = plt.subplot2grid((1,1), (0,0))

    # Twin y-axis whose tick labels are formatted as percentages.
    ax1 = ax2.twinx()
    ax1.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda frac, _pos: '{:.0%}'.format(frac))
    )

    # Transposed one-column frame -> a single stacked bar.
    freq_distinct_ips_per_user.to_frame().T.plot(
        ax=ax2, kind='bar', stacked=True, legend=True, title="", fontsize=12
    ).set_zorder(1)
    ax2.set_xlabel("")
    ax2.set_ylabel("# distinct users")
    ax2.set_ylim(0, len(num_distinct_ips_per_user))

    # Move the automatically generated legend outside the narrow plot.
    ax2.legend(fontsize=11, bbox_to_anchor=(2.9, 1.05))

    plt.setp(ax2.get_xticklabels(), visible=False)

    fig.show()
    fig.savefig('figs/user_distinct_ips.eps', format='eps', dpi=1200, bbox_inches='tight')

## 5.9. Distribution of Number of Payments by Users

In [None]:
print('9. User distinct payments')

In [None]:
# Number of payment records per userid ...
num_distinct_payments_per_user = df_payments['userid'].value_counts()
# ... and how many users share each payment count (ordered by that count).
freq_distinct_payments_per_user = (
    num_distinct_payments_per_user
    .value_counts()
    .sort_index()
)

In [None]:
# One stacked bar: each segment is a payment count, its height the number of
# users with that count. Exported to figs/user_distinct_payments.eps.
if len(num_distinct_payments_per_user)==0:
    print("There is NO data available for plotting 'User distinct payments'")
else:
    fig = plt.figure(figsize=(1, 4))
    fig.subplots_adjust(wspace=0.3, hspace=0.5)
    ax2 = plt.subplot2grid((1,1), (0,0))

    # Twin y-axis whose tick labels are formatted as percentages.
    ax1 = ax2.twinx()
    ax1.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda frac, _pos: '{:.0%}'.format(frac))
    )

    # Transposed one-column frame -> a single stacked bar.
    freq_distinct_payments_per_user.to_frame().T.plot(
        ax=ax2, kind='bar', stacked=True, legend=True, title="", fontsize=12
    ).set_zorder(1)
    ax2.set_xlabel("")
    ax2.set_ylabel("# distinct users")
    ax2.set_ylim(0, len(num_distinct_payments_per_user))

    # Move the automatically generated legend outside the narrow plot.
    ax2.legend(fontsize=11, bbox_to_anchor=(2.9, 1.05))

    plt.setp(ax2.get_xticklabels(), visible=False)

    fig.show()
    fig.savefig('figs/user_distinct_payments.eps', format='eps', dpi=1200, bbox_inches='tight')

## 5.10. Distribution of the money paid by Users

In [None]:
print('10. Amount of money paid.')

In [None]:
# Number of payment records per paid amount, ordered by amount. Values that
# cannot be parsed as numbers become NaN and are left out by value_counts.
num_distinct_payments_money_per_user = (
    pd.to_numeric(df_payments['amountpaid'], errors='coerce')
    .value_counts()
    .sort_index()
)

In [None]:
# One stacked bar: each segment is a paid amount, its height the number of
# payment records with that amount. Exported to figs/amount_paid.eps.
if len(num_distinct_payments_money_per_user)>0:
    fig = plt.figure(figsize=(1, 4))
    fig.subplots_adjust(hspace=0.5,wspace=0.3)

    ax2 = plt.subplot2grid((1,1), (0,0))

    # Twin y-axis whose tick labels are formatted as percentages.
    ax1 = ax2.twinx()
    ax1.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

    # Transposed one-column frame -> a single stacked bar.
    num_distinct_payments_money_per_user.to_frame().T.plot(kind='bar',ax=ax2, legend=True, stacked=True, title="",fontsize=12).set_zorder(1)
    ax2.set_xlabel("")
    # The upper limit is the number of records actually drawn. Using
    # len(df_payments['amountpaid']) also counted amounts that were dropped as
    # non-numeric, so the bar no longer spanned the axis and no longer lined
    # up with the percentage scale.
    ax2.set_ylim(0,num_distinct_payments_money_per_user.sum())
    ax2.set_ylabel("# records")

    plt.setp( ax2.get_xticklabels(), visible=False)

    # Legend restricted to the first `legend_show_top` segments.
    labels = list(num_distinct_payments_money_per_user.index.values)
    legend_show_top=10
    ax2.legend(ax2.patches[0:legend_show_top],
           labels[0:legend_show_top],
           bbox_to_anchor=(2.9, 1.05),
           fontsize=11)

    fig.show()
    fig.savefig('figs/amount_paid.eps', bbox_inches='tight',format='eps', dpi=1200)
else:
    print("There is NO data available for plotting 'Amount of money paid'.")

## 5.11. Total amount of money earned

In [None]:
# Sum of all paid amounts; entries that are not numeric are coerced to NaN
# and therefore ignored by the sum.
total_earned = pd.to_numeric(df_payments['amountpaid'], errors='coerce').sum()
print(
    '\n\n11. The (estimated) amount of money earned by this Booter is equal to US$ {:,.2f}\n\n'
    .format(float(total_earned))
)

## 5.12. Distribution of countries from which users access the Booter

In [None]:
print('12. Countries from where users (IPs) accessed the Booter.')

In [None]:
# Login records per source country; records without a country are kept as
# their own NaN bucket (dropna=False).
logins_country_distribution = (
    df_logins_extended['srccountry']
    .value_counts(dropna=False)
)

In [None]:
# One stacked bar: each segment is a source country, its height the number of
# login records from it. Exported to figs/user_countries.eps.
if len(logins_country_distribution)==0:
    print("Unfortunately, there is no data available!\n")
else:
    fig = plt.figure(figsize=(1, 4))
    fig.subplots_adjust(wspace=0.3, hspace=0.5)
    ax2 = plt.subplot2grid((1,1), (0,0))

    # Twin y-axis whose tick labels are formatted as percentages.
    ax1 = ax2.twinx()
    ax1.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda frac, _pos: '{:.0%}'.format(frac))
    )

    # Transposed one-column frame -> a single stacked bar.
    logins_country_distribution.to_frame().T.plot(
        ax=ax2, kind='bar', stacked=True, legend=True, title="", fontsize=12
    ).set_zorder(1)
    ax2.set_xlabel("")
    ax2.set_ylabel("# records")
    ax2.set_ylim(0, len(df_logins_extended['srccountry']))

    # Legend restricted to the first `legend_show_top` segments.
    legend_show_top = 10
    labels = list(logins_country_distribution.index.values)
    ax2.legend(
        ax2.patches[:legend_show_top],
        labels[:legend_show_top],
        bbox_to_anchor=(2.9, 1.05),
        fontsize=11,
    )

    plt.setp(ax2.get_xticklabels(), visible=False)

    fig.show()
    fig.savefig('figs/user_countries.eps', format='eps', dpi=1200, bbox_inches='tight')

## 5.13. Countries of blacklisted IPs

In [None]:
print('13. Countries of blacklisted IPs.')

In [None]:
# Blacklist records per country; records without a country are kept as their
# own NaN bucket (dropna=False).
blacklist_country_distribution = (
    df_blacklist_extended['blacklistcountry']
    .value_counts(dropna=False)
)

In [None]:
# One stacked bar: each segment is a country, its height the number of
# blacklist records for it. Exported to figs/blacklisted_countries.eps.
if len(blacklist_country_distribution)==0:
    print("Unfortunately, there is no data available!\n")
else:
    fig = plt.figure(figsize=(1, 4))
    fig.subplots_adjust(wspace=0.3, hspace=0.5)
    ax2 = plt.subplot2grid((1,1), (0,0))

    # Twin y-axis whose tick labels are formatted as percentages.
    ax1 = ax2.twinx()
    ax1.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda frac, _pos: '{:.0%}'.format(frac))
    )

    # Transposed one-column frame -> a single stacked bar.
    blacklist_country_distribution.to_frame().T.plot(
        ax=ax2, kind='bar', stacked=True, legend=True, title="", fontsize=12
    ).set_zorder(1)
    ax2.set_xlabel("")
    ax2.set_ylabel("# blacklisted IP")
    ax2.set_ylim(0, len(df_blacklist_extended['blacklistcountry']))

    # Legend restricted to the first `legend_show_top` segments.
    legend_show_top = 10
    labels = list(blacklist_country_distribution.index.values)
    ax2.legend(
        ax2.patches[:legend_show_top],
        labels[:legend_show_top],
        bbox_to_anchor=(2.9, 1.05),
        fontsize=11,
    )

    plt.setp(ax2.get_xticklabels(), visible=False)

    fig.show()
    fig.savefig('figs/blacklisted_countries.eps', format='eps', dpi=1200, bbox_inches='tight')

## 5.14. Target Countries

In [None]:
print('14. Target (IP) countries.')

In [None]:
# Attack records per target country; records without a country are kept as
# their own NaN bucket (dropna=False).
attacks_country_distribution = (
    df_attacks_extended['country']
    .value_counts(dropna=False)
)

In [None]:
# One stacked bar: each segment is a target country, its height the number of
# attack records against it. Exported to figs/target_countries.eps.
if len(attacks_country_distribution)==0:
    print("Unfortunately, there is no data available!\n")
else:
    fig = plt.figure(figsize=(1, 4))
    fig.subplots_adjust(wspace=0.3, hspace=0.5)
    ax2 = plt.subplot2grid((1,1), (0,0))

    # Twin y-axis whose tick labels are formatted as percentages.
    ax1 = ax2.twinx()
    ax1.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda frac, _pos: '{:.0%}'.format(frac))
    )

    # Transposed one-column frame -> a single stacked bar.
    attacks_country_distribution.to_frame().T.plot(
        ax=ax2, kind='bar', stacked=True, legend=True, title="", fontsize=12
    ).set_zorder(1)
    ax2.set_xlabel("")
    ax2.set_ylabel("# attacks")
    ax2.set_ylim(0, len(df_attacks_extended['country']))

    # Legend restricted to the first `legend_show_top` segments.
    legend_show_top = 10
    labels = list(attacks_country_distribution.index.values)
    ax2.legend(
        ax2.patches[:legend_show_top],
        labels[:legend_show_top],
        bbox_to_anchor=(2.9, 1.05),
        fontsize=11,
    )

    plt.setp(ax2.get_xticklabels(), visible=False)

    fig.show()
    fig.savefig('figs/target_countries.eps', format='eps', dpi=1200, bbox_inches='tight')

## 5.16. Statistics on which users attacked which targets, in the country level

In [None]:
print('16. Who attacked whom in the country level.')

In [None]:
def nearestDate(base_date, date_list):
    """Return the latest date in ``date_list`` that is not after ``base_date``.

    NOTE(review): a function with this name is already defined in section
    5.6; this later definition shadows it and should stay in sync with it.

    Dates are compared through their ``.timestamp()`` value, so ``base_date``
    and the items of ``date_list`` must provide that method (for example
    ``datetime.datetime`` or ``pandas.Timestamp``).

    Parameters
    ----------
    base_date : datetime-like
        Reference moment.
    date_list : iterable of datetime-like
        Candidate dates. Missing entries (NaT / NaN / None) are skipped
        instead of raising.

    Returns
    -------
    datetime-like or float
        The candidate with the smallest non-negative distance to
        ``base_date`` (the last such candidate on ties), or ``np.nan`` when
        ``base_date`` is missing or no candidate lies at or before it.
    """
    # A missing reference date cannot be compared (NaT has no timestamp).
    if pd.isna(base_date):
        return np.nan

    base_ts = base_date.timestamp()
    closest = np.nan
    closest_gap = None
    for date in date_list:
        if pd.isna(date):
            continue
        gap = base_ts - date.timestamp()
        # '<=' lets a later candidate with the same gap win, which matches the
        # overwrite behaviour of the dictionary this loop replaces.
        if gap >= 0 and (closest_gap is None or gap <= closest_gap):
            closest, closest_gap = date, gap
    return closest

In [None]:
# Column holding, for every attack, the attacking user's closest previous
# login. It starts as all-NaT with a datetime dtype and is created once
# (the initialisation used to be repeated inside the `if` below).
df_attacks_extended['nearestlogin']=pd.Series(pd.NaT, index=df_attacks_extended.index, dtype='datetime64[ns]')

if len(df_attacks_extended)>0 and len(df_logins_extended)>0:
    # When was the last login of the user that performed attacks.
    # The login dates are grouped per userid once, so df_logins_extended does
    # not have to be filtered again for every single attack row.
    logins_by_user={userid: dates for userid, dates in df_logins_extended.groupby('userid')['date']}

    for index, row in df_attacks_extended.iterrows():
        # Users without any login record get an empty candidate list, for
        # which nearestDate returns NaN (stored as NaT in this column).
        nearestlogindate= nearestDate(row['date'], logins_by_user.get(row['userid'], ()))
        df_attacks_extended.at[index, 'nearestlogin'] = nearestlogindate

In [None]:
# Attach to every attack the login record whose timestamp equals the attack's
# 'nearestlogin'. Both frames carry a 'userid' column, so after the merge they
# appear as 'userid_x' (attack side) and 'userid_y' (login side).
df_logins_and_attacks = pd.merge(df_attacks_extended, df_logins_extended, how='left', left_on='nearestlogin', right_on='date')

# Keep only attacks that have a login AND whose matched login belongs to the
# attacking user. The join key is the timestamp alone, so a login of another
# user at the very same instant would otherwise be paired with the attack and
# contribute the wrong source country.
attacker_target_country = df_logins_and_attacks[
    df_logins_and_attacks['nearestlogin'].notnull()
    & (df_logins_and_attacks['userid_x'] == df_logins_and_attacks['userid_y'])
][['srccountry','country']]

# Attacker-country x target-country matrix of attack counts. pivot() is called
# with keyword arguments because its parameters are keyword-only in
# pandas >= 2.0; column 0 is the unnamed size() column after reset_index().
who_against_whom=attacker_target_country.groupby(['srccountry','country']).size().reset_index().pivot(index='srccountry', columns='country', values=0)

In [None]:
# Annotated heatmap of attack counts: rows are attacker countries, columns
# are target countries. Exported to figs/who_attack_whom.eps.
if len(who_against_whom)==0:
    print("Unfortunately, there is no data available!\n")
else:
    fig = plt.figure(figsize=(32,32))
    ax1 = plt.subplot2grid((1,1), (0,0))

    sns.set()
    sns.heatmap(
        who_against_whom,
        ax=ax1,
        annot=True,
        fmt='g',
        cmap="YlOrRd",
        linewidths=.5,
    )

    ax1.set_ylabel("Attacker Country")
    ax1.set_xlabel("Target Country")

    fig.show()
    fig.savefig('figs/who_attack_whom.eps', format='eps', dpi=1200, bbox_inches='tight')