In [12]:
from scipy.io import arff
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

In [13]:
# Load the ARFF file.
# loadarff returns a (data, meta) pair. 'Dataset.arff' carries no directory part,
# so it is looked up relative to the kernel's working directory.
# Column values such as URL_Length arrive as byte strings (b'1', b'-1'); the
# notebook decodes them before converting the columns to numbers.
data, meta = arff.loadarff('Dataset.arff')

In [26]:
# Wrap the parsed ARFF records in a DataFrame, then inspect one sample column
df = pd.DataFrame(data)
print(df.dtypes['URL_Length'])
df.loc[:, 'URL_Length']  # dtype is `object` because the values are byte strings

object


0         b'1'
1         b'1'
2         b'0'
3         b'0'
4         b'0'
         ...  
11050    b'-1'
11051     b'1'
11052    b'-1'
11053    b'-1'
11054    b'-1'
Name: URL_Length, Length: 11055, dtype: object

In [5]:
# Replace byte strings with regular text in every object-typed column
def _bytes_to_text(value):
    """Return `value` decoded as UTF-8 when it is `bytes`; otherwise return it unchanged."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


object_columns = [name for name in df.columns if df[name].dtype == object]
for name in object_columns:
    df[name] = df[name].map(_bytes_to_text)

In [6]:
# Parse each column as numbers, column by column
# (pd.to_numeric raises by default when a value cannot be parsed)
df = df.apply(pd.to_numeric, axis='index')
df

  df = df.apply(pd.to_numeric, errors='ignore')


Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11050,1,-1,1,-1,1,1,1,1,-1,-1,...,-1,-1,1,1,-1,-1,1,1,1,1
11051,-1,1,1,-1,-1,-1,1,-1,-1,-1,...,-1,1,1,1,1,1,1,-1,1,-1
11052,1,-1,1,1,1,-1,1,-1,-1,1,...,1,1,1,1,1,-1,1,0,1,-1
11053,-1,-1,1,1,1,-1,-1,-1,1,-1,...,-1,1,1,1,1,-1,1,1,1,-1


In [7]:
# Features are every column except the last; the label is the last column
X, y = df.to_numpy()[:, :-1], df.to_numpy()[:, -1]

In [8]:
# Convert features and labels to float32 tensors (torch.float is an alias of torch.float32).
# torch.as_tensor accepts NumPy arrays as well as existing tensors, so this cell can be
# re-run safely; torch.from_numpy raises a TypeError once X and y are already tensors.
# The target dtype is requested explicitly rather than assuming the arrays are float64:
# the frame displayed after the numeric conversion shows integer values.
X = torch.as_tensor(X, dtype=torch.float32)
y = torch.as_tensor(y, dtype=torch.float32)

In [9]:
X.size(), y.size()  # Sanity check: X is (samples, features), y is (samples,) — sample counts must agree

(torch.Size([11055, 30]), torch.Size([11055]))

In [10]:
# Hold out 20% of the samples for testing; the fixed seed keeps the shuffle reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    shuffle=True,
)

In [11]:
X_train.size(), y_train.size()  # Training split is ready: sizes of features and labels

(torch.Size([8844, 30]), torch.Size([8844]))