In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os
import numpy as np
import pandas as pd
import csv
from tqdm import tqdm
import re
import datetime

In [0]:
def preprocess_df_and_save_to_csv(df, ip, dest_path):
  dest_file_path = f'{dest_path}/{ip}.csv'
  if os.path.isfile(dest_file_path):
    with open(dest_file_path, 'a') as outfile:
      c = csv.writer(outfile)
      for index, row in df.iterrows():
        c.writerow([sum(bytearray(cell, encoding='utf8')) if isinstance(cell, str) else cell for cell in row.values]) #convert to numeric for string data

  else:
    count = 0
    with open(dest_file_path, 'a') as outfile:
      c = csv.writer(outfile)
      for index, row in df.iterrows():
        if count==0:
          count = 1
          c.writerow(df.columns)
        c.writerow([sum(bytearray(cell, encoding='utf8')) if isinstance(cell, str) else cell for cell in row.values])

In [0]:
def create_ip_profile(src_file_path, dest_path, features_list):
  df = pd.read_csv(src_file_path)
  df = df[features_list]  #feature selection
  df['Receive Time'] = df['Receive Time'].apply(lambda x : x[-8:])  #remove date information from dataframe
  ips = df['Source address'].unique() #get a list of unique ips
  print(f'{len(ips)} ips found')
  for ip in ips:  #create csv file for individual ips
    temp = df[df['Source address']==ip]
    #call method to write to csv file
    preprocess_df_and_save_to_csv(temp, ip, dest_path)

In [0]:
def parse_all_csv(src_path, dest_path, features_list):
  if not os.path.exists(dest_path):
    os.mkdir(dest_path)
    print(f'**********Directory {dest_path} created **********')
  if os.path.exists(src_path):
    files = os.listdir(src_path)
    total = len(files)
    count = 0
    for file in os.listdir(src_path):
      src_file_path = src_path+'/'+file
      print(f'[{count}/{total}]**********Processing {src_file_path} file **********')
      create_ip_profile(src_file_path, dest_path, features_list)
      count = count+1
  else:
    print(f'{src_path} doesnt exist')

In [0]:
features_list = ['Receive Time', 'Source address',
                  'Destination address',
                  'Application',
                  'Source Port',
                  'Destination Port',
                  'IP Protocol',
                  'Action',
                  'Bytes',
                  'Bytes Sent',
                  'Bytes Received',
                  'Packets',
                  'Destination Country',
                  'Packets Sent',
                  'Packets Received']

In [0]:
parse_all_csv("drive/My Drive/Colab Notebooks/Pattern Recognition/1-Month-4-27-to-5-27-Traffic",
              "drive/My Drive/Colab Notebooks/Pattern Recognition/individual_ip_csv",
              features_list)

**********Directory drive/My Drive/Colab Notebooks/Pattern Recognition/individual_ip_csv created **********
[0/31]**********Processing drive/My Drive/Colab Notebooks/Pattern Recognition/1-Month-4-27-to-5-27-Traffic/Silverlining-PAVM-Primary_traffic_2019_05_02_last_calendar_day.csv file **********
90 ips found
[1/31]**********Processing drive/My Drive/Colab Notebooks/Pattern Recognition/1-Month-4-27-to-5-27-Traffic/Silverlining-PAVM-Primary_traffic_2019_05_07_last_calendar_day.csv file **********
180 ips found
[2/31]**********Processing drive/My Drive/Colab Notebooks/Pattern Recognition/1-Month-4-27-to-5-27-Traffic/Silverlining-PAVM-Primary_traffic_2019_05_18_last_calendar_day.csv file **********
183 ips found
[3/31]**********Processing drive/My Drive/Colab Notebooks/Pattern Recognition/1-Month-4-27-to-5-27-Traffic/Silverlining-PAVM-Primary_traffic_2019_05_20_last_calendar_day.csv file **********
180 ips found
[4/31]**********Processing drive/My Drive/Colab Notebooks/Pattern Recognition