# Loading data
- Initially, I had multiple datasets containing lists of only ad servers and non-ad servers. I combined them all to create a dataset 'all.csv'.
- Since all.csv contained many duplicate entries, I removed the duplicates and saved the result as a separate file, 'all-without-duplicates.csv'.
```
# Read the combined list; the 'domain' and 'class' columns are passed through
# convert_dtype so both are loaded as strings.
df = pd.read_csv("../lists/all.csv",converters={'domain': convert_dtype,'class': convert_dtype}) 
# Remove rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()
# NOTE: index=False is not passed, so pandas also writes the row index as an
# extra, unnamed first column of the output file.
df.to_csv('../lists/all-without-duplicates.csv')

```


In [3]:
import pandas as pd
import re
import traceback
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping


2023-02-21 13:20:48.941347: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-21 13:20:49.058630: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-21 13:20:49.058653: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-21 13:20:49.813248: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [None]:
#Convert dtypes for fixing Dtypewarning
# https://www.roelpeters.be/solved-dtypewarning-columns-have-mixed-types-specify-dtype-option-on-import-or-set-low-memory-in-pandas/
def convert_dtype(x):
    """Coerce a raw CSV cell to ``str`` so the column has a single dtype.

    Intended for the ``converters`` argument of ``pd.read_csv``.

    Args:
        x: The raw cell value handed over by the parser.

    Returns:
        ``str(x)``, or the empty string when ``x`` is falsy (empty cell,
        ``None``, ``0``) or cannot be converted.
    """
    if not x:
        return ''
    try:
        return str(x)
    except Exception:
        # Best-effort: an unconvertible cell becomes an empty string. Only
        # ordinary errors are caught, so Ctrl-C / SystemExit still propagate.
        return ''

# Load the de-duplicated list into a pandas DataFrame. Both 'domain' and
# 'class' are routed through convert_dtype so they are read as strings.
df = pd.read_csv(
    "../lists/all-without-duplicates.csv",
    converters=dict.fromkeys(('domain', 'class'), convert_dtype),
)

# One-off steps that produced the file above (kept for reference, not executed):
#df = pd.read_csv("../lists/all.csv",converters={'domain': convert_dtype,'class': convert_dtype})
#df = df.drop_duplicates()
#df['class'] = df['class'].map({'1': '0', '0': '1'}) # issue w original dataset where non ads were marked as 1 and ads as 0. This reverses it.
#df.to_csv('../lists/all-without-duplicates.csv')

# Last expression of the cell, so the notebook renders the DataFrame.
df

# Preprocessing and feature extraction
This block of code is used for preprocessing the dataset, removing unwanted patterns, and extracting meaningful features from each URL. The extracted features are: `has_ad` (does the URL contain the standalone word 'ad' or 'ads'), `is_subdomain` (does the URL start with the 'www.' subdomain), `num_dots` (number of dots in the URL, not counting the dot that follows 'www' if present), `num_hyphens` (number of hyphens in the URL), and `num_digits` (number of digits in the URL).


In [32]:
import csv

# Patterns used to derive features from each URL.
ad_pattern = r'\b(ad|ads)\b'      # standalone word "ad" or "ads"
subdomain_pattern = r'^www\.'     # URL starts with the "www." subdomain
digit_pattern = r'\d'
# These two are plain substrings passed to str.count(), not regular expressions.
dot_pattern = r'.'
hyphen_pattern = r'-'

# Compile the regular expressions once instead of looking them up per row.
_ad_re = re.compile(ad_pattern)
_subdomain_re = re.compile(subdomain_pattern)
_digit_re = re.compile(digit_pattern)

# Define the batch size and the input/output file paths
batch_size = 10000
input_file = '../lists/all-without-duplicates.csv'
output_file = '../lists/preprocessed.csv'


def _extract_features(url):
    """Return (has_ad, is_subdomain, num_dots, num_hyphens, num_digits) for *url*."""
    has_ad = int(_ad_re.search(url) is not None)
    is_subdomain = int(_subdomain_re.search(url) is not None)
    # The dot that belongs to a leading "www." is not counted.
    num_dots = url.count(dot_pattern) - is_subdomain
    num_hyphens = url.count(hyphen_pattern)
    num_digits = len(_digit_re.findall(url))
    return has_ad, is_subdomain, num_dots, num_hyphens, num_digits


# Read the input in chunks and stream one feature row per URL to the output.
# 'url' and 'class' are read as plain strings with NaN detection disabled so
# that (a) an empty cell arrives as '' instead of a float NaN that would crash
# the regex search, and (b) the label is written exactly as stored in the file
# rather than as whatever dtype pandas infers for an individual chunk.
with pd.read_csv(
    input_file,
    chunksize=batch_size,
    dtype={'url': str, 'class': str},
    na_filter=False,
) as reader, open(output_file, 'w', newline='') as f_out:
    # csv.writer quotes fields containing commas or quotes, keeping every
    # output row at exactly seven columns.
    writer = csv.writer(f_out, lineterminator='\n')
    for chunk in reader:
        for url, is_ad in zip(chunk['url'], chunk['class']):
            if not url:
                # A row without a URL carries no information; skip it.
                continue
            writer.writerow([url, *_extract_features(url), is_ad])


In [None]:


# Set the random seed for reproducibility
np.random.seed(42)

# Define the batch size, the number of epochs, and the input file path
batch_size = 32
epochs = 100
# Path written by the preprocessing step (its output_file).
input_file = '../lists/preprocessed.csv'

# Column layout of the preprocessed file (it is written without a header row).
feature_columns = ['has_ad', 'is_subdomain', 'num_dots', 'num_hyphens', 'num_digits']
label_column = 'class'

# Load the preprocessed features and the class label into a Pandas DataFrame
data = pd.read_csv(input_file, header=None, names=['url', *feature_columns, label_column])

# Split the data into training and validation sets, keeping the class ratio
train_data, val_data = train_test_split(data, test_size=0.2, stratify=data[label_column])

# Only the numeric feature columns are fed to the network; the raw 'url'
# string is an identifier, not a feature, and cannot be converted to a tensor.
x_train = train_data[feature_columns].to_numpy(dtype='float32')
y_train = train_data[label_column].to_numpy(dtype='float32')
x_val = val_data[feature_columns].to_numpy(dtype='float32')
y_val = val_data[label_column].to_numpy(dtype='float32')

# Define the input shape of the neural network (one input per feature)
input_shape = (len(feature_columns),)

# Build the neural network model
model = Sequential()
model.add(Dense(64, input_shape=input_shape, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model ('learning_rate' replaces the deprecated 'lr' argument)
model.compile(optimizer=Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])

# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, y_val),
    callbacks=[early_stop],
)

# Evaluate the model on the validation set
loss, acc = model.evaluate(x_val, y_val, batch_size=batch_size)
print('Validation accuracy:', acc)
