# Loading data
- Initially, I had multiple datasets containing lists of only ad servers and non-ad servers. I combined them all to create a dataset 'all.csv'.
- Since all.csv had multiple overlapping entries, I deleted the duplicates and saved it as another file 'all-without-duplicates'
```
df = pd.read_csv("../lists/all.csv",converters={'domain': convert_dtype,'class': convert_dtype}) 
df = df.drop_duplicates()
df.to_csv('../lists/all-without-duplicates.csv')

```


In [48]:
import pandas as pd
import re
import traceback
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import tensorflow

In [24]:
#Convert dtypes for fixing Dtypewarning
# https://www.roelpeters.be/solved-dtypewarning-columns-have-mixed-types-specify-dtype-option-on-import-or-set-low-memory-in-pandas/
def convert_dtype(x):
    if not x:
        return ''
    try:
        return str(x)   
    except:        
        return ''

df = pd.read_csv("../lists/all-without-duplicates.csv",converters={'domain': convert_dtype,'class': convert_dtype}) # Dataset is now stored in a Pandas Dataframe
#df = pd.read_csv("../lists/all.csv",converters={'domain': convert_dtype,'class': convert_dtype})
#df = df.drop_duplicates()
#df['class'] = df['class'].map({'1': '0', '0': '1'}) # issue w original dataset where non ads were marked as 1 and ads as 0. This reverses it.
#df.to_csv('../lists/all-without-duplicates.csv')
df

Unnamed: 0,url,class
0,google.com,0
1,youtube.com,0
2,facebook.com,0
3,amazonaws.com,0
4,netflix.com,0
...,...,...
1474708,slview.psne.jp,1
1474709,x.vipergirls.to,1
1474710,x0r.urlgalleries.net,1
1474711,yotta.scrolller.com,1


# Preprocessing and feature extraction
This block of code is used for preprocessing the dataset, removing unwanted patterns, and extracting meaningful features from the dataset. Here, the features extracted are has_ad(does it contain the word 'ad'), is_subdomain(does it contain the subdomain 'www'),num_dots(number of dots in the url, excluding subdomain if any),num_hyphens(number of hyphens), num_digits(number of digits in the URL)


In [32]:
# Define regular expressions for pattern matching
ad_pattern = r'\b(ad|ads)\b'
subdomain_pattern = r'^www\.'
dot_pattern = r'.'
hyphen_pattern = r'-'
digit_pattern = r'\d'

# Define the batch size and the input/output file paths
batch_size = 10000
input_file = '../lists/all-without-duplicates.csv'
output_file = '../lists/preprocessed.csv'

# Open the input and output files
with open(input_file, 'r') as f_in, open(output_file, 'w') as f_out:
    # Read the CSV file in chunks
    for chunk in pd.read_csv(f_in, chunksize=batch_size):
        # Preprocess the URLs in the current chunk
        #for url in chunk['url']:
        for index, row in chunk.iterrows():
            url = row['url']
            is_ad = row['class']
            has_ad = int(bool(re.search(ad_pattern, url)))
            is_subdomain = int(bool(re.search(subdomain_pattern, url)))
            num_dots = url.count(dot_pattern) #- is_subdomain
            if (is_subdomain == 1):
                num_dots = num_dots - 1;
            num_hyphens = url.count(hyphen_pattern)
            num_digits = len(re.findall(digit_pattern, url))

            # Write the preprocessed features to the output file
            f_out.write(f'{url},{has_ad},{is_subdomain},{num_dots},{num_hyphens},{num_digits},{is_ad}\n')


In [39]:

# Set the random seed for reproducibility
np.random.seed(42)

# Define the batch size, the number of epochs, and the input file path
batch_size = 32
epochs = 100
input_file = '../lists/preprocessed.csv'

# Load the preprocessed features and the class label into a Pandas DataFrame
data = pd.read_csv(input_file, header=None, names=['url', 'has_ad', 'is_subdomain', 'num_dots', 'num_hyphens', 'num_digits', 'class'])
data

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,url,has_ad,is_subdomain,num_dots,num_hyphens,num_digits,class
0,url,has_ad,is_subdomain,num_dots,num_hyphens,num_digits,is_ad
1,google.com,0,0,1,0,0,0
2,youtube.com,0,0,1,0,0,0
3,facebook.com,0,0,1,0,0,0
4,amazonaws.com,0,0,1,0,0,0
...,...,...,...,...,...,...,...
1474709,slview.psne.jp,0,0,2,0,0,1
1474710,x.vipergirls.to,0,0,2,0,0,1
1474711,x0r.urlgalleries.net,0,0,2,0,1,1
1474712,yotta.scrolller.com,0,0,2,0,0,1


In [58]:

# Set the random seed for reproducibility
np.random.seed(42)

# Define the batch size, the number of epochs, and the input file path
batch_size = 32
epochs = 100
input_file = '../lists/preprocessed.csv'

# Load the preprocessed features and the class label into a Pandas DataFrame
#data = pd.read_csv(input_file, header=None, dtype={'url':str, 'has_ad':int, 'is_subdomain':int, 'num_dots':int, 'num_hyphens':int, 'num_digits':int, 'is_ad':int}, names=['url', 'has_ad', 'is_subdomain', 'num_dots', 'num_hyphens', 'num_digits', 'is_ad'])
data = pd.read_csv(input_file, header=None, names=['url', 'has_ad', 'is_subdomain', 'num_dots', 'num_hyphens', 'num_digits', 'class'])
data = data.dropna()
data = data.replace([np.inf, -np.inf, np.nan], 0)
data = data.astype({'has_ad': 'float32', 'is_subdomain': 'float32', 'num_dots': 'float32', 'num_hyphens': 'float32', 'num_digits': 'float32', 'class': 'float32'})
# Split the data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, stratify=data['class'])



# Define the input shape of the neural network
input_shape = (train_data.shape[1] - 1,)

# Build the neural network model
model = Sequential()
model.add(Dense(64, input_shape=input_shape, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(lr=1e-3), loss='binary_crossentropy', metrics=['accuracy'])

# Define the early stopping criteria
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
model.fit(train_data.iloc[:, :-1], train_data.iloc[:, -1], batch_size=batch_size, epochs=epochs, validation_data=(val_data.iloc[:, :-1], val_data.iloc[:, -1]), callbacks=[early_stop])

# Evaluate the model on the validation set
loss, acc = model.evaluate(val_data.iloc[:, :-1], val_data.iloc[:, -1], batch_size=batch_size)
print('Validation accuracy:', acc)


  super().__init__(name, **kwargs)


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [66]:

# Set the random seed for reproducibility
np.random.seed(42)

# Define the batch size, the number of epochs, and the input file path
batch_size = 32
epochs = 100
input_file = '../lists/preprocessed.csv'

# Load the preprocessed features and the class label into a Pandas DataFrame
data = pd.read_csv(input_file, header=None, names=['url', 'has_ad', 'is_subdomain', 'num_dots', 'num_hyphens', 'num_digits', 'class'])
data = data.dropna()

# Split the data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, stratify=data['class'])

# Define the input shape of the neural network
input_shape = (train_data.shape[1] - 2,)

# Build the neural network model
model = Sequential()
model.add(Dense(64, input_shape=input_shape, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(lr=1e-3), loss='binary_crossentropy', metrics=['accuracy'])

# Define the early stopping criteria
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
model.fit(train_data.iloc[:, 1:-1], train_data.iloc[:, -1], batch_size=batch_size, epochs=epochs, validation_data=(val_data.iloc[:, 1:-1], val_data.iloc[:, -1]), callbacks=[early_stop])

# Evaluate the model on the validation set
loss, acc = model.evaluate(val_data.iloc[:, 1:-1], val_data.iloc[:, -1], batch_size=batch_size)
print('Validation accuracy:', acc)

model.save('my_model.h5')


Epoch 1/100


  super().__init__(name, **kwargs)


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Validation accuracy: 0.9945592880249023


In [68]:
    import numpy as np

    # Preprocess the input URL to extract the features
    url = 'www.deheredia.com'
    has_ad = 0
    is_subdomain = 0
    num_dots = url.count('.')
    num_hyphens = url.count('-')
    num_digits = sum(c.isdigit() for c in url)
    input_features = np.array([has_ad, is_subdomain, num_dots, num_hyphens, num_digits]).reshape(1, -1)

    # Use the trained model to make a prediction
    prediction = model.predict(input_features)[0][0]

    # Print the prediction
    if prediction > 0.5:
        print(f"The URL '{url}' is predicted to be an ad with a probability of {prediction:.2f}")
    else:
        print(f"The URL '{url}' is predicted to be not an ad with a probability of {1 - prediction:.2f}")


The URL 'www.deheredia.com' is predicted to be not an ad with a probability of 0.99


In [73]:
import numpy as np
import re
from tensorflow.keras.models import load_model

# Load the trained model from file
model = load_model('my_model.h5')

# Define a function to extract features from the URL
def extract_features(url):
    has_ad = 0
    is_subdomain = 0
    num_dots = url.count('.')
    num_hyphens = url.count('-')
    num_digits = sum(c.isdigit() for c in url)
    return np.array([has_ad, is_subdomain, num_dots, num_hyphens, num_digits]).reshape(1, -1)

# Define a function to check if a URL is valid
def is_valid_url(url):
    regex = re.compile(r'(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b')
    return regex.match(url) is not None

# Get the URL input from the user
url = input("Enter a URL (e.g. www.google.com): ")

# Validate the URL input
if not is_valid_url(url):
    print("Invalid URL.")
else:
    # Extract features from the URL
    input_features = extract_features(url)

    # Use the trained model to make a prediction
    prediction = model.predict(input_features)[0][0]

    # Print the prediction
    if prediction > 0.5:
        print(f"The URL '{url}' is predicted to be an ad with a probability of {prediction:.2f}")
    else:
        print(f"The URL '{url}' is predicted to be not an ad with a probability of {1 - prediction:.2f}")


Enter a URL (e.g. www.google.com): www.deheredia.com
The URL 'www.deheredia.com' is predicted to be not an ad with a probability of 0.99
