In [112]:
import pandas as pd

In [113]:
# Read the cleaned dataset, then separate the feature columns (X) from the
# "traffic nature" label column (y, kept as a one-column DataFrame).
df = pd.read_csv("cleanedNumDarknet.csv")
X = df.drop(columns="traffic nature")
y = df[["traffic nature"]]
y

Unnamed: 0,traffic nature
0,0
1,0
2,0
3,0
4,0
...,...
110381,2
110382,2
110383,2
110384,2


# Splitting the dataset into training and testing sets

In [114]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# repeatable between runs.
# NOTE(review): the split is not stratified on the label — confirm that is
# intended given the class imbalance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [115]:
# Put the held-out features and their labels side by side in one frame.
test_dataset = pd.concat(objs=[X_test, y_test], axis="columns")

In [116]:
# Display the (rows, columns) dimensions of the combined test frame.
test_dataset.shape

(22078, 29)

In [117]:
# Persist the held-out set to disk; the row index is not written.
test_dataset.to_csv(path_or_buf="testDataset.csv", index=False)

# Counting occurrences of each target value ("traffic nature")

In [118]:
# Build a DataFrame from the training labels; it is the frame whose values
# get counted.
df2 = pd.DataFrame(data=y_train)

In [119]:
# Per-class row counts of the training labels as a plain Python list.
# value_counts() orders its result by frequency (largest first), so a list
# position is a frequency rank, not a class label.
CLASSES_INSTANCES = df2.value_counts().tolist()
CLASSES_INSTANCES

[55167, 17184, 15005, 952]

# Defining the oversampling strategy: the desired number of instances for each class

In [120]:
# Per-class oversampling targets, keyed by the actual class label.
# The majority class keeps its size; every other class is raised one third of
# the way from its current count towards the majority count.
# Counts are looked up by label (not by position in a frequency-sorted list),
# so each label always receives the target derived from its own count, no
# matter which class happens to be the most frequent.
# sort_index() keeps the dict in ascending label order.
_class_counts = y_train["traffic nature"].value_counts().sort_index()
_majority_count = int(_class_counts.max())
sampling_strategy = {
    label: int(count) + (_majority_count - int(count)) // 3
    for label, count in _class_counts.items()
}

In [121]:
# Instantiate the three oversamplers with the same per-class targets.
# A fixed random_state makes the generated synthetic samples reproducible
# across kernel restarts; 42 matches the seed used for the train/test split.
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
adasyn = ADASYN(sampling_strategy=sampling_strategy, random_state=42)
borderline_smote = BorderlineSMOTE(sampling_strategy=sampling_strategy, random_state=42)

In [122]:
# Apply each oversampler to the same training split. Only the training data
# is resampled; the test split is not touched here.
X_train_resampled_smote, y_train_resampled_smote = smote.fit_resample(
    X=X_train, y=y_train
)
X_train_resampled_adasyn, y_train_resampled_adasyn = adasyn.fit_resample(
    X=X_train, y=y_train
)
X_train_resampled_borderline, y_train_resampled_borderline = borderline_smote.fit_resample(
    X=X_train, y=y_train
)


In [123]:
# Display the dimensions of the label frame returned by the SMOTE resampler.
y_train_resampled_smote.shape

(132427, 1)

In [124]:
# Stack the outputs of the three oversamplers row-wise, features and labels
# in the same order so that row i of X still corresponds to row i of y.
# ignore_index=True gives the stacked frames a fresh 0..n-1 index; without it
# each part keeps its own labels, the combined index contains repeated
# values, and any later label-based alignment becomes unreliable.
_resampled_feature_parts = [
    X_train_resampled_smote,
    X_train_resampled_adasyn,
    X_train_resampled_borderline,
]
_resampled_label_parts = [
    y_train_resampled_smote,
    y_train_resampled_adasyn,
    y_train_resampled_borderline,
]
X_train_resampled = pd.concat(_resampled_feature_parts, ignore_index=True)
y_train_resampled = pd.concat(_resampled_label_parts, ignore_index=True)

In [125]:
# Display the dimensions of the stacked, resampled feature frame.
X_train_resampled.shape

(397026, 28)

In [126]:
# Attach the label column to the stacked features to form one frame.
newDF = pd.concat(objs=[X_train_resampled, y_train_resampled], axis="columns")

In [127]:
# Display the dimensions of the combined features-plus-label frame.
newDF.shape

(397026, 29)

In [128]:
# Remove fully identical rows, keeping the first occurrence of each. The
# original training rows appear once per oversampler in the stacked frame,
# so they are among the rows collapsed here.
newDF = newDF.drop_duplicates(keep="first")

In [129]:
# Display the dimensions of newDF again, now that duplicate rows are dropped.
newDF.shape

(213317, 29)

In [130]:
# Display how many rows each "traffic nature" class has in newDF.
newDF['traffic nature'].value_counts()

0    55167
2    53969
1    52846
3    51335
Name: traffic nature, dtype: int64

# Min-max normalization (disabled: the cells below are commented out)

In [131]:
# X = newDF

In [132]:
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
# X

In [133]:
# X['traffic nature'] *=3

In [134]:
# X["traffic nature"] = X["traffic nature"].astype(int)

In [135]:
# X

In [136]:
# Save the balanced, de-duplicated training data; the row index is not written.
newDF.to_csv(path_or_buf="balancedDarknet.csv", index=False)