<a href="https://colab.research.google.com/github/jakhin03/PROJECT_ASnED_HUST/blob/main/src/pynote/ANOMALY_NETWORK_TRAFFIC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PROJECT - ABNORMAL NETWORK TRAFFIC DETECTION

## Goals:
  * Detecting abnormal network traffic

## Folder structure:
```
.
├── Datasets
│   └── ...
├── Figure
│   └── ...
├── README.md
├── requirements.txt
└── src
    ├── app.py
    └── pynote
        └── ANOMALY_NETWORK_TRAFFIC.ipynb
```


## Usage:

In [None]:
# Clone the project repository (Datasets, requirements.txt and src, per the folder structure above) into the current directory.
!git clone "https://github.com/jakhin03/PROJECT_ASnED_HUST"

In [None]:
# Enter the cloned repository root, where requirements.txt lives.
%cd PROJECT_ASnED_HUST

In [None]:
!pip install -r requirements.txt

In [None]:
# Work from the notebook's own directory so relative paths such as ../../Datasets/network_data.csv resolve.
%cd src/pynote

## 1. Import modules and libraries

In [16]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from sklearn.datasets import load_files
from pyvi import ViTokenizer
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import learning_curve

%matplotlib inline

## 2. Preprocessing and data exploration:



### a. The dataset can be found at: [source](https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup99.html)

In [None]:
# Load the raw traffic records; the path is relative to the notebook directory.
data = pd.read_csv("../../Datasets/network_data.csv")
print(data.shape)
print(data.columns)

# (label template, value) pairs for the quick summary printed below.
summary_rows = [
    ('Number of days for which data is available {:d}', data['date'].nunique()),
    ('Unique local ip {:d}', data['l_ipn'].nunique()),
    ('Unique remote ASN {:d}', data['r_asn'].nunique()),
    ('Minimum flow count per day {:d}', data['f'].min()),
    ('Maximum flow count per day {:d}', data['f'].max()),
]
for template, value in summary_rows:
    print(template.format(value))

print(data.head())



### b. Exploring datasets:
* Create dataframe for visualization

In [None]:
# Known anomalies as (date, l_ipn) records. A list is used instead of a dict
# keyed by date because two local IPs share 2006-09-26, and a dict literal
# silently keeps only the last value for a repeated key.
recorded_anomalies = [
    ('2006-08-24', 1),
    ('2006-09-04', 5),
    ('2006-09-18', 4),
    ('2006-09-26', 3),
    ('2006-09-26', 6),
]
marked_anomalies = pd.DataFrame(recorded_anomalies, columns=['date', 'l_ipn'])
print(marked_anomalies)

* Aggregating daily connections

In [None]:
# Sum the flow counts of all records sharing a date, giving one row per date
# with 'date' restored as an ordinary column.
daily_aggregate = data.groupby('date')[['f']].sum().reset_index()
daily_aggregate[['f']].describe()

In [None]:
# Plot the per-day totals. The series drawn is `daily_aggregate` (one row per
# date), so the mean line, the 7-row rolling window and the title all refer to
# the same per-day quantity rather than to the raw per-record rows in `data`.
daily_mean = round(daily_aggregate['f'].mean(),2)
plt.figure(figsize=(15,5))
plt.plot(daily_aggregate['date'],daily_aggregate['f'])
# One vertical marker per recorded anomaly, labelled with the local IP involved.
for _x, ip in marked_anomalies[['date','l_ipn']].to_records(index=False):
    plt.axvline(x=_x, color='r' , label = 'Recorded Anomoly {}'.format(ip))
plt.axhline(y= daily_mean, color='g', label = 'Mean Connections')
plt.plot(daily_aggregate['date'],daily_aggregate['f'].rolling(7).mean(), label = '7 days Rolling average')
# Label every second date to keep the axis readable.
plt.xticks(daily_aggregate['date'][::2],  rotation='vertical')
plt.yscale('log')
plt.xlabel('date')
plt.ylabel('Connection')
plt.title('Daily Aggregate Connections')
plt.fill_between(daily_aggregate['date'],daily_aggregate['f'],color='aqua')
plt.legend()
plt.show()

In [None]:
# Total flow count per (local IP, date) pair, with both keys as ordinary columns.
daily_aggregate_l_ipn = data.groupby(['l_ipn','date'])[['f']].sum().reset_index()

In [None]:
import matplotlib.dates as mdates

# One panel per local IP that actually occurs in the data. The grid height is
# derived from the number of IPs instead of being fixed at 5 rows, so the cell
# neither crashes with more than 10 IPs nor assumes the IDs are exactly 0..n-1.
l_ipn_values = sorted(data['l_ipn'].unique())
n_cols = 2
n_rows = max(1, (len(l_ipn_values) + n_cols - 1) // n_cols)  # ceiling division

# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 3 * n_rows), squeeze=False)
panels = list(axes.flat)  # row-major order: left to right, then top to bottom

for ax, i in zip(panels, l_ipn_values):
    temp = daily_aggregate_l_ipn[daily_aggregate_l_ipn['l_ipn'] == i]
    ax.set_title(i)
    ax.set_xlabel('date')
    ax.set_ylabel('connections')

    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

    ax.plot(temp['date'],temp['f'], color = 'salmon')
    # The x-axis is hidden, so the label and formatter set above are not drawn.
    ax.get_xaxis().set_visible(False)
    ax.fill_between(temp['date'],temp['f'], color='peachpuff')

# Blank out any panel left over when the number of IPs does not fill the grid.
for ax in panels[len(l_ipn_values):]:
    ax.set_visible(False)

plt.show()

* Aggregating flows per r_asn

In [None]:
# Total flow count per remote ASN over every record in `data`.
daily_aggregate_r_asn = data.groupby('r_asn')[['f']].sum().reset_index()

In [None]:
# Render floats with two decimals (notebook-wide pandas display option), then
# show the summary statistics of the per-ASN totals.
pd.set_option('display.float_format', '{:.2f}'.format)
daily_aggregate_r_asn['f'].describe()

In [None]:
# Total flows per remote ASN. The title is an explicit string rather than the
# loop variable `i` left over from an earlier cell, so this cell no longer
# depends on hidden state.
plt.figure(figsize=(10,5))
plt.title('Total connections per r_asn')
plt.xlabel('r_asn')
plt.ylabel('connections')
plt.xticks(rotation='vertical')
plt.plot(daily_aggregate_r_asn['r_asn'],daily_aggregate_r_asn['f'], color = 'salmon')
plt.show()

## 3. Anomaly detection
*Anomalies are marked as red dots in the graphs*

### a. Using the Prophet library

In [None]:
!pip install prophet

In [None]:
from pandas import to_datetime
from prophet import Prophet

In [None]:
def get_daily_aggregate_l_ipn(in_l_ipn):
    """Return the daily flow series of one local IP as a ``ds``/``y`` frame.

    Rows of the notebook-level ``daily_aggregate_l_ipn`` frame whose ``l_ipn``
    equals ``in_l_ipn`` are selected, the ``l_ipn`` column is dropped, the two
    remaining columns are renamed to ``ds`` and ``y``, ``ds`` is converted to
    datetime and the index is renumbered from 0.
    """
    is_requested_ip = daily_aggregate_l_ipn['l_ipn'] == in_l_ipn
    series = daily_aggregate_l_ipn.loc[is_requested_ip].drop(columns='l_ipn')
    # Positional rename: the remaining columns are taken to be (date, flows).
    series.columns = ['ds', 'y']
    series['ds'] = to_datetime(series['ds'])
    return series.reset_index(drop=True)

In [None]:
def get_forecast(ts,in_l_ipn):
    """Fit Prophet to one local IP's series, flag out-of-band points and plot them.

    A point is flagged when its observed ``y`` is below ``yhat_lower`` or above
    ``yhat_upper`` of the forecast made for the same row.

    Parameters
    ----------
    ts : pandas.DataFrame
        Frame with ``ds`` and ``y`` columns. It is modified in place: a 0/1
        ``anomaly`` column is added. Row ``k`` of ``ts`` is compared with row
        ``k`` of the forecast, so ``ts`` should already be in the row order
        the model returns (presumably sorted by ``ds`` -- TODO confirm).
    in_l_ipn : int
        Local IP identifier, used only in the plot title.

    Returns
    -------
    pandas.DataFrame
        The same ``ts`` object, so the flags are not lost when the caller
        passed a temporary frame.
    """
    model = Prophet(seasonality_mode='additive',daily_seasonality = False, yearly_seasonality = False, weekly_seasonality = True)
    model.fit(ts)
    forecast = model.predict(pd.DataFrame(ts['ds']))

    # Vectorised band check instead of a Python loop over rows.
    observed = ts['y'].to_numpy()
    below_band = forecast['yhat_lower'].to_numpy() > observed
    above_band = forecast['yhat_upper'].to_numpy() < observed
    is_anomaly = below_band | above_band

    ts['anomaly'] = is_anomaly.astype(int)
    point_colors = np.where(is_anomaly, 'red', 'green')

    model.plot(forecast)

    # Overlay the observations on the figure just created by model.plot.
    plt.scatter(ts['ds'],ts['y'],c=point_colors)
    plt.title('Forcast plot for l_ipn %d' %in_l_ipn)
    plt.show()

    return ts


In [None]:
# Run the Prophet-based detection once per local IP that actually occurs in the
# data, rather than assuming the identifiers are exactly 0..n-1.
for i in sorted(data['l_ipn'].unique()):
    get_forecast(get_daily_aggregate_l_ipn(i),i)

### b. Using luminol:

In [None]:
!pip install luminol

In [None]:
import luminol
from luminol.anomaly_detector import AnomalyDetector

In [None]:
def get_luminol_anomalies(in_df):
    """Flag the rows of ``in_df`` that fall inside a luminol anomaly window.

    The ``y`` column is handed to ``AnomalyDetector`` as an ``{index: value}``
    dict, so the window bounds it reports are index labels of ``in_df``.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Frame with a ``y`` column. It is modified in place: an ``isAnomaly``
        column is added, 1 inside an anomaly window and 0 elsewhere. The index
        must be sorted for the label slice below to select a contiguous range.

    Returns
    -------
    pandas.DataFrame
        The same ``in_df`` object.
    """
    in_df['isAnomaly'] = 0
    detector = AnomalyDetector(in_df['y'].to_dict())
    for anomaly in detector.get_anomalies():
        # The window is a (start, end) pair; flag every row from start to end
        # inclusive, not just the two endpoints.
        window_start, window_end = anomaly.get_time_window()
        in_df.loc[window_start:window_end, 'isAnomaly'] = 1
    return in_df

In [None]:
# Colour lookup for the isAnomaly flag: 0 -> normal, 1 -> anomalous.
colors = {0:'green', 1:'red'}

# One figure per local IP that actually occurs in the data, rather than
# assuming the identifiers are exactly 0..n-1.
for i in sorted(data['l_ipn'].unique()):
    t_df = get_luminol_anomalies(get_daily_aggregate_l_ipn(i))

    plt.figure(figsize=(10,5))
    plt.plot(t_df['ds'],t_df['y'])
    plt.scatter(t_df['ds'],t_df['y'],c=t_df['isAnomaly'].map(colors))
    # Titled as luminol output so these figures are distinguishable from the
    # Prophet forecast plots, which previously carried the same title.
    plt.title('Luminol anomalies for l_ipn %d' %i)
    plt.show()