In [3]:
import pandas as pd
import numpy as np
import string
from sklearn.preprocessing import MultiLabelBinarizer

from sampling.robrose import RobRoseAlgorithm
from sampling.adasyn import Adasyn
from sampling.smote import SmoteAlgorithm
from sklearn.model_selection import train_test_split

In [4]:
spam_df = pd.read_csv("../data/spam_text_msg.csv")
spam_df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
text_data = []
for text in spam_df['Message'][:2000]:
    new_text = text.translate(str.maketrans('', '', string.punctuation))
    #new_text = new_text.strip()
    text_data.append(new_text)

In [6]:
df = pd.DataFrame()
df['Category'] = spam_df[:2000]['Category']
df['Label'] = text_data

df.head()

Unnamed: 0,Category,Label
0,ham,Go until jurong point crazy Available only in ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor U c already then say
4,ham,Nah I dont think he goes to usf he lives aroun...


In [7]:
dummies = df['Label'][:2000].str.get_dummies(' ')

In [8]:
df = pd.concat([df, dummies],1)

In [9]:
df

Unnamed: 0,Category,Label,0,0089my,0125698789,020603,0207,02072069400,020903,021,...,£75000,£79,£800,£900,Ü,ü,üll,–,“Harry,…
0,ham,Go until jurong point crazy Available only in ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,Ok lar Joking wif u oni,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,U dun say so early hor U c already then say,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,Nah I dont think he goes to usf he lives aroun...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,ham,Have you been practising your curtsey,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1996,ham,Shall i come to get pickle,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,ham,Lol boo I was hoping for a laugh,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1998,ham,YEH I AM DEF UP4 SOMETHING SATJUST GOT PAYED2D...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# PreProcessing
y = df["Category"]
x = df.drop(['Category', 'Label'], axis=1).iloc[:, : 2000]

x_train, x_test, y_train, y_test = train_test_split(x.to_numpy(), y.to_numpy(), train_size=0.80, random_state=4012)

In [18]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(1600, 2000)
(400, 2000)
(1600,)
(400,)


In [19]:
# SMOTE Sampling
smote_x_train, smote_y_train = SmoteAlgorithm.run(x_train, y_train)
print(smote_x_train.shape)
print(smote_y_train.shape)



(2768, 2000)
(2768,)


In [20]:
adasyn_x_train, adasyn_y_train = Adasyn.run(x_train, y_train)
print(adasyn_x_train.shape)
print(adasyn_y_train.shape)



(2793, 2000)
(2793,)


In [21]:
robrose_x_train, robrose_y_train = RobRoseAlgorithm.run(x_train, y_train, label="Category", columns=dummies.columns[:2000], r=0.2, alpha=0.99, const=1, seed=4012)
print(robrose_x_train.shape)
print(robrose_y_train)

Loading required package: usethis


/var/folders/58/08szz5hj0kqbp3tnvthkxh000000gn/T/tmplml18o2z.csv


Error in covMcd(X[ind.mino, id.num], alpha = alpha, nsamp = 500) : 
  n <= p -- you can't be serious!
Calls: robROSE -> covMcd
Execution halted


EmptyDataError: No columns to parse from file

## New Dataset

In [4]:
df = pd.read_csv("../data/Swarm_Behaviour.csv")
df.head()

Unnamed: 0,x1,y1,xVel1,yVel1,xA1,yA1,xS1,yS1,xC1,yC1,...,yVel200,xA200,yA200,xS200,yS200,xC200,yC200,nAC200,nS200,Swarm_Behaviour
0,562.05,-0.62,-10.7,-4.33,0.0,0.0,0.0,0.0,0.0,0.0,...,-15.15,0.0,0.0,0.0,0.0,0.0,0.0,28,0,0.0
1,175.66,-57.09,2.31,-2.67,0.0,0.0,0.0,0.0,0.0,0.0,...,-3.48,0.0,0.0,0.0,0.0,0.0,0.0,4,0,0.0
2,200.16,-320.07,4.01,-6.37,0.0,0.0,0.0,0.0,0.18,-0.26,...,-9.38,0.0,0.0,0.0,0.0,-0.11,-0.3,15,1,0.0
3,316.99,-906.84,0.85,9.17,-0.17,1.03,0.0,0.0,0.0,0.0,...,10.39,-0.26,1.01,0.0,0.0,0.0,0.0,16,0,0.0
4,1277.68,908.54,-2.02,8.23,-1.0,1.0,0.0,0.0,0.0,0.0,...,13.91,-1.0,0.0,3.21,15.67,0.0,0.0,12,0,0.0


In [5]:
# PreProcessing
new_df = df
y = new_df["Swarm_Behaviour"]
x = new_df.drop(['Swarm_Behaviour'], axis=1)
df.drop_duplicates(subset=df.columns, keep=False)
x_train, x_test, y_train, y_test = train_test_split(x.to_numpy(), y.to_numpy(), train_size=0.80, random_state=4012)

In [9]:
len(new_df[new_df["Swarm_Behaviour"]==1])

7954

In [6]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(18647, 2400)
(4662, 2400)
(18647,)
(4662,)


In [10]:
robrose_x_train, robrose_y_train = McdSmote.run(x_train, y_train)

Number of original fraud rows = 6404
Number of original normal rows = 12243




KeyboardInterrupt: 

In [None]:
len(x.columns)

In [None]:
robrose_x_train, robrose_y_train = RobRoseAlgorithm.run(x_train, y_train, label="Swarm_Behaviour", columns= x.columns, r=0.2, alpha=0.9, const=1, seed=4012)
print(robrose_x_train.shape)
print(robrose_y_train)

Loading required package: robROSE
Rows: 8000 Columns: 2401
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (2401): x1, y1, xVel1, yVel1, xA1, yA1, xS1, yS1, xC1, yC1, nAC1, nS1, x...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [None]:
robrose_x_train.to_csv("../data/dimx_robrose.csv")
robrose_y_train.to_csv("../data/dimy_robrose.csv")

## Use Credit Card Dataset

In [1]:
from sampling.sampling import SamplingAlgorithm
from model.model import ClassifierWrapper
from preprocess.preprocess import DatasetWrapper
from config.config import ALGORITHMS, DATASETS, MODELS
from sampling.mcd_smote import McdSmote

Using TensorFlow backend.


In [26]:
# PreProcessing
df = DATASETS[0][0](DATASETS[0][1])


In [28]:
robrose_x_train, robrose_y_train = RobRoseAlgorithm.run(df.x_train, df.y_train, label="Class", columns= df.columns, r=0.2, alpha=0.9, const=1, seed=4012)
print(robrose_x_train.shape)
print(robrose_y_train)

Loading required package: robROSE
Rows: 227845 Columns: 30
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (30): V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


/var/folders/58/08szz5hj0kqbp3tnvthkxh000000gn/T/tmpn4pzqsmm.csv
(284271, 29)
[0 0 0 ... 1 1 1]
