# multiclass detection


In [1]:
import pandas as pd
import numpy as np
import sys
import sklearn
import io
import random
import time


In [2]:
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]


traindata = pd.read_csv('kddcup.data_10_percent',header=None, names = col_names)
testdata = pd.read_csv('corrected', header=None, names = col_names)
print('Dimensions of the Traindata:',traindata.shape)
print('Dimensions of the Testdata:',testdata.shape)


Dimensions of the Traindata: (494021, 42)
Dimensions of the Testdata: (311029, 42)


In [3]:
print('Label distribution Training set:')
print(traindata['label'].value_counts())
print()
print('Label distribution Test set:')
print(testdata['label'].value_counts())


Label distribution Training set:
smurf.              280790
neptune.            107201
normal.              97278
back.                 2203
satan.                1589
ipsweep.              1247
portsweep.            1040
warezclient.          1020
teardrop.              979
pod.                   264
nmap.                  231
guess_passwd.           53
buffer_overflow.        30
land.                   21
warezmaster.            20
imap.                   12
rootkit.                10
loadmodule.              9
ftp_write.               8
multihop.                7
phf.                     4
perl.                    3
spy.                     2
Name: label, dtype: int64

Label distribution Test set:
smurf.              164091
normal.              60593
neptune.             58001
snmpgetattack.        7741
mailbomb.             5000
guess_passwd.         4367
snmpguess.            2406
satan.                1633
warezmaster.          1602
back.                 1098
mscan.              

## Data process

In [4]:
print('Training set:')
for feature in traindata.columns:
    if traindata[feature].dtypes == 'object' :
        unique_cat = len(traindata[feature].unique())
        print("Feature '{feature}' has {unique_cat} categories".format(feature=feature, unique_cat=unique_cat))

print()
print('Distribution of categories in service:')
print(traindata['service'].value_counts().sort_values(ascending=False).head())


Training set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 66 categories
Feature 'flag' has 11 categories
Feature 'label' has 23 categories

Distribution of categories in service:
ecr_i      281400
private    110893
http        64293
smtp         9723
other        7237
Name: service, dtype: int64


## label encoder

In [5]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
categorical_feature=['protocol_type', 'service', 'flag']

train_categorical_values = traindata[categorical_feature]
test_categorical_values = testdata[categorical_feature]

train_categorical_values.head()

Unnamed: 0,protocol_type,service,flag
0,tcp,http,SF
1,tcp,http,SF
2,tcp,http,SF
3,tcp,http,SF
4,tcp,http,SF


## rename the column

In [6]:
# protocol type
unique_protocol=sorted(traindata.protocol_type.unique())
string1 = 'Protocol_type_'
unique_protocol2=[string1 + x for x in unique_protocol]
print(unique_protocol2)

# service
unique_service=sorted(traindata.service.unique())
string2 = 'service_'
unique_service2=[string2 + x for x in unique_service]
print(unique_service2)


# flag
unique_flag=sorted(traindata.flag.unique())
string3 = 'flag_'
unique_flag2=[string3 + x for x in unique_flag]
print(unique_flag2)


# put together
dumcols=unique_protocol2 + unique_service2 + unique_flag2


#do it for test set
unique_service_test=sorted(testdata.service.unique())
unique_service2_test=[string2 + x for x in unique_service_test]
testdumcols=unique_protocol2 + unique_service2_test + unique_flag2

['Protocol_type_icmp', 'Protocol_type_tcp', 'Protocol_type_udp']
['service_IRC', 'service_X11', 'service_Z39_50', 'service_auth', 'service_bgp', 'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard', 'service_domain', 'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i', 'service_efs', 'service_exec', 'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher', 'service_hostnames', 'service_http', 'service_http_443', 'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell', 'service_ldap', 'service_link', 'service_login', 'service_mtp', 'service_name', 'service_netbios_dgm', 'service_netbios_ns', 'service_netbios_ssn', 'service_netstat', 'service_nnsp', 'service_nntp', 'service_ntp_u', 'service_other', 'service_pm_dump', 'service_pop_2', 'service_pop_3', 'service_printer', 'service_private', 'service_red_i', 'service_remote_job', 'service_rje', 'service_shell', 'service_smtp', 'service_sql_net', 'service_ssh'

In [7]:
train_categorical_values_enc=train_categorical_values.apply(LabelEncoder().fit_transform)

print(train_categorical_values.head())
print('--------------------')
print(train_categorical_values_enc.head())

# test set
test_categorical_values_enc=test_categorical_values.apply(LabelEncoder().fit_transform)


  protocol_type service flag
0           tcp    http   SF
1           tcp    http   SF
2           tcp    http   SF
3           tcp    http   SF
4           tcp    http   SF
--------------------
   protocol_type  service  flag
0              1       22     9
1              1       22     9
2              1       22     9
3              1       22     9
4              1       22     9


## one-hot-encoder

In [8]:
enc = OneHotEncoder(categories='auto')
train_categorical_values_encenc = enc.fit_transform(train_categorical_values_enc)
train_cat_data = pd.DataFrame(train_categorical_values_encenc.toarray(),columns=dumcols)


# test set
test_categorical_values_encenc = enc.fit_transform(test_categorical_values_enc)
test_cat_data = pd.DataFrame(test_categorical_values_encenc.toarray(),columns=testdumcols)

train_cat_data.head()


Unnamed: 0,Protocol_type_icmp,Protocol_type_tcp,Protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_auth,service_bgp,service_courier,service_csnet_ns,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## adjust to the same dimension

In [9]:
trainservice=traindata['service'].tolist()
testservice= testdata['service'].tolist()
difference=list(set(trainservice) - set(testservice))
difference1=list(set(testservice) - set(trainservice))
string = 'service_'
difference=[string + x for x in difference]
print(difference)
print(difference1)

['service_urh_i', 'service_red_i']
['icmp']


In [10]:
for col in difference:
    test_cat_data[col] = 0
for col in difference1:
    train_cat_data[col] = 0

print(train_cat_data.shape)    
print(test_cat_data.shape)


(494021, 81)
(311029, 81)


## combine all the column

In [11]:
#traindata
newtrain=traindata.join(train_cat_data)
newtrain.drop('flag', axis=1, inplace=True)
newtrain.drop('protocol_type', axis=1, inplace=True)
newtrain.drop('service', axis=1, inplace=True)

# test data
newtest=testdata.join(test_cat_data)
newtest.drop('flag', axis=1, inplace=True)
newtest.drop('protocol_type', axis=1, inplace=True)
newtest.drop('service', axis=1, inplace=True)

print(newtrain.shape)
print(newtest.shape)


(494021, 120)
(311029, 120)


## classify the label

In [13]:
#normal: 0;
#Probe = ("ipsweep.", "portsweep.", "nmap.", "satan." , "saint.", "mscan."):1
#UR2 = ("buffer_overflow.","loadmodule.", "perl.", "rootkit.", "xterm.", "ps.", "httptunnel.", "sqlattack.", "worm.", "snmpguess.", "spy." ):2
#DDoS = ('neptune.', 'smurf.', 'pod.', 'teardrop.', 'land.', "back.", "apache2.", "udpstorm.", "processtable.", "mailbomb."):3
#R2L = ("guess_passwd.", "ftp_write.", "imap.", "phf.", "multihop.", "warezmaster.", "warezclient.", "snmpgetattack.", "named.", "xlock." , "xsnoop.", "sendmail." ):4

labeltrain=newtrain['label']
labeltest=newtest['label']
newlabeltrain=labeltrain.replace({ 'normal.' : 0, 'ipsweep.':1, 'portsweep.':1,'nmap.':1,'satan.':1,'buffer_overflow.':2,'loadmodule.':2,'perl.':2,'rootkit.':2,'back.':3,'land.':3,'neptune.':3, 'pod.':3,'smurf.':3,
                            'teardrop.':3, 'ftp_write.':4,'guess_passwd.':4,'imap.':4,'multihop.':4, 'phf.':4,'spy.':4,'warezclient.':4,'warezmaster.':4 })
newlabeltest=labeltest.replace({ 'normal.' : 0, 'ipsweep.':1, 'portsweep.':1,'nmap.':1,'satan.':1,"saint.":1,"mscan.":1,'buffer_overflow.':2,'loadmodule.':2,'perl.':2,'rootkit.':2,"xterm.":2, "ps.":2, "httptunnel.":2, "sqlattack.":2, "worm.":2, "snmpguess.":2, "spy.":2, 
                            'back.':3,'land.':3,'neptune.':3, 'pod.':3,'smurf.':3,'teardrop.':3,"apache2.":3,  "udpstorm.":3, "processtable.":3, "mailbomb.":3,
                            'ftp_write.':4,'guess_passwd.':4,'imap.':4,'multihop.':4, 'phf.':4,'spy.':4,'warezclient.':4,'warezmaster.':4,'snmpgetattack.':4,'named.':4, "xlock.":4 , "xsnoop.":4, "sendmail.":4  })
# put the new label column back
newtrain['label'] = newlabeltrain
newtest['label'] = newlabeltest


In [14]:
#write csv for convenience
newtrain.to_csv("kdd_train_class.csv")
newtest.to_csv("kdd_test_class.csv")

In [15]:
newtrain.head(5)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,icmp
0,0,181,5450,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1,0,239,486,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,0,235,1337,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
3,0,219,1337,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,0,217,2032,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0


In [16]:
X_train = newtrain.drop('label',1)
Y_train = newtrain.label

# test set
X_test = newtest.drop('label',1)
Y_test = newtest.label

  X_train = newtrain.drop('label',1)
  X_test = newtest.drop('label',1)


In [17]:
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()
X_train = mm.fit_transform(X_train)
X_test = mm.fit_transform(X_test)

In [18]:
print(X_train.shape)
print(Y_train.shape)


(494021, 119)
(494021,)


## logistic regression

In [19]:
#split train and test data 40%
from sklearn.model_selection import train_test_split
train_data, valid_data, train_target, valid_target = train_test_split(X_train, Y_train, test_size=0.4, random_state=42)

In [20]:
#%%
import sklearn

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(multi_class='multinomial',solver='sag',class_weight='balanced',max_iter=500)
#For multiclass problems, only 'newton-cg', 'sag', 'saga' and 'lbfgs' handle multinomial loss

train0=time.time()
lr.fit(train_data,train_target) ##拟合模型
train1 = time.time() - train0
print("train_time:%.3fs\n" %train1)

score = lr.score(train_data,train_target)
print(score)

train_time:266.133s

0.9816640351942566




In [23]:
#%%
valid_score = lr.score(valid_data,valid_target)
print(valid_score)#0.8388
test_score = lr.score(X_test,Y_test)
print(test_score)#0.4505

0.9812761564503641
0.915612370550656


In [22]:
#%%
#prob for each class
y_pro = lr.predict_proba(X_test) 
print(y_pro)

[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 ...
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]]


In [21]:

test0=time.time()
y_predict = lr.predict(X_test)
test1=time.time()-test0
print("test_time:%.3fs\n" %test1)


print(y_predict)
from sklearn.metrics import classification_report
print('test index:\n',classification_report(Y_test,y_predict,digits=4))

test_time:0.099s

[0 0 0 ... 0 0 0]
test index:
               precision    recall  f1-score   support

           0     0.7364    0.9609    0.8338     60593
           1     0.7054    0.8970    0.7897      4166
           2     0.0267    0.0190    0.0222      2636
           3     0.9951    0.9614    0.9780    229853
           4     0.6565    0.1301    0.2172     13781

    accuracy                         0.9156    311029
   macro avg     0.6240    0.5937    0.5682    311029
weighted avg     0.9176    0.9156    0.9055    311029

