In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

df = pd.read_csv('../Datasets/data_without_normalization.csv')

Test 1: Comparing the Distributions of URL Lengths between Fake vs Real URLS. (Using Paired T-Test)

Null Hypothesis : There is no difference in means of distributions of fake vs real URLs.

Alternative Hypothesis : There is a difference in means of distributions of fake vs real URLs.

In [2]:
from scipy.stats import ttest_rel, t

# Separating the fake samples from legitimate samples.
legitimate_df = df[df['label'] == 1]
fake_df = df[df['label'] == 0]

# Randomly sampling 5000 of each class.
sampled_legitimate_df = legitimate_df.sample(n = 5000, replace = False, random_state = 42)
sampled_fake_df = fake_df.sample(n = 5000, random_state = 42, replace = False)

legitimate_data = sampled_legitimate_df['URLLength'].to_numpy()
fake_data = sampled_fake_df['URLLength'].to_numpy()

# Performing the test.
t_statistic, p_value = ttest_rel(legitimate_data, fake_data)
print("test statistic:", t_statistic)

# Using 5% as our significance level.
alpha = 0.05

# Finding critical value for double-sided t-test.
deg_free = 4999
critical_value = t.ppf(1 - alpha/2, deg_free)
print("Critical value:", critical_value)

if t_statistic <= critical_value and t_statistic >= (-1) * critical_value:
    print("Fail to reject the null hypothesis: There is no significant difference between the means.")
else:
    print("Reject the null hypothesis: There is a significant difference between the means.")

test statistic: -37.76386453569706
Critical value: 1.9604386466615242
Reject the null hypothesis: There is a significant difference between the means.


Since test statistic is -ve, the fake URLs have significantly larger URL length.

------------------------------------------------------------------------------------------

Test 2 : Checking if there is a linear relationship between number of digits and letters in a URL.

Null Hypothesis : There isn't a linear relationship.

Alternative Hypothesis : There is a linear relationship.

In [3]:
from scipy.stats import pearsonr

# Randomly sample 10000 data points.
sampled_df = df.sample(n = 10000, replace = False, random_state = 42)

letter_data = sampled_df['NoOfLettersInURL'].to_numpy()
digit_data = sampled_df['NoOfDegitsInURL'].to_numpy()

r, p_value = pearsonr(letter_data, digit_data)

# Calculating the test statistics
n = 10000
t_statistic = r * np.sqrt((n - 2) / (1 - r**2))

# Using 5% significance level.
alpha = 0.05
deg_free = n - 2
critical_value = t.ppf(1 - alpha / 2, deg_free)

print("Correlation coefficient (r):", r)
print("t-statistic:", t_statistic)
print("Critical value", critical_value)

if abs(t_statistic) > critical_value:
    print("Reject the null hypothesis: Correlation is statistically significant.")
else:
    print("Fail to reject the null hypothesis: Correlation is not statistically significant.")

Correlation coefficient (r): 0.46966500257238347
t-statistic: 53.193730152631524
Critical value 1.9602012873568364
Reject the null hypothesis: Correlation is statistically significant.


So, there is a linear relationship between number of digits and letter in a URL. They are positively correlated.

----------------------------------------------------------------------------------------------------------------------------

Test 3 : Checking if there is a linear relationship between URL length and Domain Length.

Null Hypothesis : There isn't a linear relationship.

Alternative Hypothesis : There is a linear relationship.

In [4]:
# Randomly sample 10000 data points.
sampled_df = df.sample(n = 10000, replace = False, random_state = 44)

url_data = sampled_df['URLLength'].to_numpy()
domain_data = sampled_df['DomainLength'].to_numpy()

r, p_value = pearsonr(url_data, domain_data)

# Calculating the test statistics
n = 10000
t_statistic = r * np.sqrt((n - 2) / (1 - r**2))

# Using 5% significance level.
alpha = 0.05
deg_free = n - 2
critical_value = t.ppf(1 - alpha / 2, deg_free)

print("Correlation coefficient (r):", r)
print("t-statistic:", t_statistic)
print("Critical value", critical_value)

if abs(t_statistic) > critical_value:
    print("Reject the null hypothesis: Correlation is statistically significant.")
else:
    print("Fail to reject the null hypothesis: Correlation is not statistically significant.")

Correlation coefficient (r): 0.4411837264473263
t-statistic: 49.156601933693764
Critical value 1.9602012873568364
Reject the null hypothesis: Correlation is statistically significant.


So, there is a linear relationship between the URL and Domain Length. They are positively correlated.

-------------------------------------------------------------------------------------------------------------------

Test 4 : Checking if there is a relationship between having a title and the URL being fake or real.

Null Hypothesis : They are independent.

Alternative Hypothesis : They are dependent.

In [5]:
import scipy.stats  as stats

# Randomly sample 10000 data points.
sampled_df = df.sample(n = 10000, replace = False, random_state = 43)

# Creating the table
table = pd.crosstab(sampled_df['HasTitle'], sampled_df['label'])
print("Table:\n", table)

# Performing the test.
chi2, p, deg_free, expected_freq = stats.chi2_contingency(table)

print("\nChi-squared Statistic:", chi2)
print("Degrees of Freedom:", deg_free)

# Using 5% level of significance.
alpha = 0.05

# Finding critical value
critical_value_right = stats.chi2.ppf(1 - alpha/2, deg_free)
critical_value_left = stats.chi2.ppf(alpha/2, deg_free)

print(f"\nCritical value for chi-squared distribution:\nLeft: {critical_value_left}    Right: {critical_value_right}")

if chi2 > critical_value_right or chi2 < critical_value_left:
    print("Reject the null hypothesis: There is a significant association between the variables.")
else:
    print("Fail to reject the null hypothesis: No significant association between the variables.")


Table:
 label        0     1
HasTitle            
0         1611     8
1         3372  5009

Chi-squared Statistic: 1904.4377424094232
Degrees of Freedom: 1

Critical value for chi-squared distribution:
Left: 0.0009820691171752557    Right: 5.023886187314888
Reject the null hypothesis: There is a significant association between the variables.


So, the properties of 'having a title' and 'being a fake or real URL' are dependent.

---------------------------------------------------------------------------------------------------------

Test 5 : Checking if there is a relationship between being a HTTPS link and the URL being fake or real.

Null Hypothesis : They are independent.

Alternative Hypothesis : They are dependent.

In [6]:
# Randomly sample 10000 data points.
sampled_df = df.sample(n = 10000, replace = False, random_state = 45)

# Creating the table
table = pd.crosstab(sampled_df['IsHTTPS'], sampled_df['label'])
print("Table:\n", table)

# Performing the test.
chi2, p, deg_free, expected_freq = stats.chi2_contingency(table)

print("\nChi-squared Statistic:", chi2)
print("Degrees of Freedom:", deg_free)

# Using 5% level of significance.
alpha = 0.05

# Finding critical value
critical_value_right = stats.chi2.ppf(1 - alpha/2, deg_free)
critical_value_left = stats.chi2.ppf(alpha/2, deg_free)

print(f"\nCritical value for chi-squared distribution:\nLeft: {critical_value_left}    Right: {critical_value_right}")

if chi2 > critical_value_right or chi2 < critical_value_left:
    print("Reject the null hypothesis: There is a significant association between the variables.")
else:
    print("Fail to reject the null hypothesis: No significant association between the variables.")

Table:
 label       0     1
IsHTTPS            
0        2577     0
1        2316  5107

Chi-squared Statistic: 3620.725035244844
Degrees of Freedom: 1

Critical value for chi-squared distribution:
Left: 0.0009820691171752557    Right: 5.023886187314888
Reject the null hypothesis: There is a significant association between the variables.


The properties of 'being a HTTPS URL' and 'being a fake/legitimate URL' are dependent.