In [35]:
# Importar las librerias necesarias
import pandas as pd

In [36]:
# Cargue el dataset en un dataframe de pandas, muestre un ejemplo de cinco observaciones.
data = pd.read_csv("dataset_pishing.csv")

# Hacemos el describe para ver la distribución de los datos
print(data.describe())

# Hacemos el head para ver las primeras 5 filas
print(data.head())

                                                      url      status
count                                               11430       11430
unique                                              11429           2
top     http://e710z0ear.du.r.appspot.com/c:/users/use...  legitimate
freq                                                    2        5715
                                                 url      status
0              http://www.crestonwood.com/router.php  legitimate
1  http://shadetreetechnology.com/V4/validation/a...    phishing
2  https://support-appleld.com.secureupdate.duila...    phishing
3                                 http://rgipt.ac.in  legitimate
4  http://www.iracing.com/tracks/gateway-motorspo...  legitimate


In [37]:
# Muestre la cantidad de observaciones etiquetadas en la columna status como “legit” y como “pishing”. ¿Está balanceado el dataset?

print(data["status"].value_counts())

status
legitimate    5715
phishing      5715
Name: count, dtype: int64


### Derivación de características 
En base a los artículos propuestos de clasificación de phishing, responda las siguientes preguntas: 
1. ¿Qué ventajas tiene el análisis de una URL contra el análisis de otros datos, cómo el tiempo 
de vida del dominio, o las características de la página Web? 
2. ¿Qué características de una URL son más prometedoras para la detección de phishing?

### 1. Ventajas del Análisis de URL contra Otros Datos
Eficiencia y Accesibilidad: El análisis de URL permite una evaluación rápida y en tiempo real sin cargar la página, facilitando la detección preliminar de amenazas de forma eficiente.
Consistencia de Datos: Las URLs proporcionan información estática y directa, menos susceptible a cambios y manipulaciones, en contraste con el contenido dinámico de las páginas web que puede ser más fácilmente alterado para evadir la detección.

### 2. Características de una URL más Prometedoras para la Detección de Phishing
Dominios Mal Escritos y Uso de Subdominios: Las variaciones ortográficas sutiles en dominios y el uso excesivo de subdominios intentan imitar sitios legítimos para engañar a los usuarios, siendo indicativos clave de phishing.
Longitud de la URL y Presencia de Palabras Clave Sensibles: Las URLs anormalmente largas y la inclusión de términos relacionados con la seguridad o transacciones financieras pueden señalar intentos de phishing, buscando simular urgencia o legitimidad.

In [38]:
# Las URLs se pueden ver de la siguiente manera:

# http://www.crestonwood.com/router.php
# http://shadetreetechnology.com/V4/validation/a111aedc8ae390eabcfa130e041a10a4
# https://support-appleld.com.secureupdate.duilawyeryork.com/ap/89e6a3b4b063b8d/?cmd=_update&dispatch=89e6a3b4b063b8d1b&locale=_
# http://rgipt.ac.in
# http://www.iracing.com/tracks/gateway-motorsports-park/
# http://appleid.apple.com-app.es/
# http://www.mutuo.it
# http://www.shadetreetechnology.com/V4/validation/ba4b8bddd7958ecb8772c836c2969531
# http://vamoaestudiarmedicina.blogspot.com/
# https://parade.com/425836/joshwigler/the-amazing-race-host-phil-keoghan-previews-the-season-27-premiere/
# https://www.astrologyonline.eu/Astro_MemoNew/Profilo.asp

# Lo que quisiera es en mi dataframe agregar una columna que me diga que protocolo es, http o https y agregar una columna que me diga el dominio de la URL.

# Para esto, vamos a usar la libreria urllib.parse

from urllib.parse import urlparse

# Vamos a hacer una función que nos permita obtener el protocolo y el dominio de la URL

def get_protocol_and_domain(url):
    parsed_url = urlparse(url)
    return parsed_url.scheme, parsed_url.netloc

# Ahora vamos a aplicar la función a la columna de URLs

data["protocol"], data["domain"] = zip(*data["url"].map(get_protocol_and_domain))

# Ahora vamos a ver como quedo el dataframe

# print(data.head())

# Ahora de acuerdo a los dominios que pueden ser:

# www.crestonwood.com  
# shadetreetechnology.com  
# support-appleld.com.secureupdate.duilawyeryork.com
# rgipt.ac.in  
# www.iracing.com 

# Creamos una columna con sus subdomains si es que los tienen

# Creamos una columna con second level domain si es que los tienen

# Creamos una columna con el top level domain


def get_subdomain_and_sld_and_tld(domain):

    domain_parts = domain.split(".")
    if len(domain_parts) == 2:
        return None, domain_parts[0], domain_parts[1]
    elif len(domain_parts) == 3:
        return domain_parts[0], domain_parts[1], domain_parts[2]
    else:
        return domain_parts[0], domain_parts[1], domain_parts[2]
    
data["subdomain"], data["secondleveldomain"], data["topleveldomain"] = zip(*data["domain"].map(get_subdomain_and_sld_and_tld))

# Ahora vamos a ver como quedo el dataframe

print(data.head())

                                                 url      status protocol  \
0              http://www.crestonwood.com/router.php  legitimate     http   
1  http://shadetreetechnology.com/V4/validation/a...    phishing     http   
2  https://support-appleld.com.secureupdate.duila...    phishing    https   
3                                 http://rgipt.ac.in  legitimate     http   
4  http://www.iracing.com/tracks/gateway-motorspo...  legitimate     http   

                                              domain        subdomain  \
0                                www.crestonwood.com              www   
1                            shadetreetechnology.com             None   
2  support-appleld.com.secureupdate.duilawyeryork...  support-appleld   
3                                        rgipt.ac.in            rgipt   
4                                    www.iracing.com              www   

     secondleveldomain topleveldomain  
0          crestonwood            com  
1  shadetreetechno

In [39]:
def url_length(url):
    """Check if the URL length is suspiciously long."""
    return len(url) > 75

def has_https(url):
    """Check if the URL uses HTTPS."""
    return url.startswith('https://')

def suspicious_tld(url):
    """Check for suspicious Top-Level Domains."""
    suspicious_domains = ['.biz', '.info', '.top', '.xyz']
    return any(url.endswith(domain) for domain in suspicious_domains)

def subdomain_count(url):
    """Count the number of subdomains."""
    domain = url.split("//")[-1].split("/")[0]
    return domain.count('.') - (1 if 'www.' not in domain else 0)

def contains_ip_address(url):
    """Check if the URL contains an IP address instead of a domain."""
    import re
    ip_pattern = r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b'
    return bool(re.search(ip_pattern, url))

def path_length(url):
    """Check if the URL path is suspiciously long."""
    path = url.split('//')[-1].split('/')[1:]
    return len("/".join(path)) > 75

def contains_at_symbol(url):
    """Check if '@' symbol is present in the URL."""
    return "@" in url

def special_characters_count(url):
    """Count the number of special characters."""
    import re
    return len(re.findall(r'\W', url)) - url.count('/') - url.count('.')

def https_in_domain(url):
    """Check for 'https' in the domain name."""
    domain = url.split("//")[-1].split("/")[0]
    return 'https' in domain and not domain.startswith('https')

def sensitive_words(url):
    """Check for sensitive words in the URL."""
    words = ['login', 'verify', 'bank']
    return any(word in url for word in words)

def domain_age_check(url):
    """Stub for checking domain age, requires external API."""
    return False

def dot_count(url):
    """Count the number of dots in the URL."""
    return url.count('.')

def port_in_url(url):
    """Check if a port is specified in the URL."""
    import re
    port_pattern = r':\d+'
    return bool(re.search(port_pattern, url.split('/')[2] if '://' in url else url))

def file_extension_check(url):
    """Check for suspicious file extensions."""
    suspicious_extensions = ['.exe', '.zip', '.rar']
    return any(url.endswith(ext) for ext in suspicious_extensions)

def brand_name_in_domain(url):
    """Stub for checking brand names in domain, requires list of brand names."""
    return False



# Por cada una de las urls en el dataframe vamos a aplicar las funciones anteriores y vamos a guardar los resultados en un diccionario y posteriormente en columnas del mismo dataframe.

results = []

for i in range(len(data)):

    fila = data.iloc[i]
    test_url = fila["url"]

    # Test functions
    result = {
        "url_length_gt_75": url_length(test_url),
        "has_https": has_https(test_url),
        "suspicious_tld": suspicious_tld(test_url),
        "subdomain_count": subdomain_count(test_url),
        "contains_ip_address": contains_ip_address(test_url),
        "path_length": path_length(test_url),
        "contains_at_symbol": contains_at_symbol(test_url),
        "special_characters_count": special_characters_count(test_url),
        "https_in_domain": https_in_domain(test_url),
        "sensitive_words": sensitive_words(test_url),
        "domain_age_check": domain_age_check(test_url),
        "dot_count": dot_count(test_url),
        "port_in_url": port_in_url(test_url),
        "file_extension_check": file_extension_check(test_url),
        "brand_name_in_domain": brand_name_in_domain(test_url),
    }

    # Convertir valores booleanos a int para evitar el FutureWarning
    result = {k: int(v) if isinstance(v, bool) else v for k, v in result.items()}

    # Actualizar la fila correspondiente en el DataFrame con los resultados
    for key, value in result.items():
        data.at[i, key] = value  # Cambié 'loc' a 'at' para asignación directa, aunque 'loc' también es correcto.


# Ahora vamos a ver como quedo el dataframe
        
data.head()


Unnamed: 0,url,status,protocol,domain,subdomain,secondleveldomain,topleveldomain,url_length_gt_75,has_https,suspicious_tld,...,path_length,contains_at_symbol,special_characters_count,https_in_domain,sensitive_words,domain_age_check,dot_count,port_in_url,file_extension_check,brand_name_in_domain
0,http://www.crestonwood.com/router.php,legitimate,http,www.crestonwood.com,www,crestonwood,com,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
1,http://shadetreetechnology.com/V4/validation/a...,phishing,http,shadetreetechnology.com,,shadetreetechnology,com,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,https://support-appleld.com.secureupdate.duila...,phishing,https,support-appleld.com.secureupdate.duilawyeryork...,support-appleld,com,secureupdate,1.0,1.0,0.0,...,0.0,0.0,8.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
3,http://rgipt.ac.in,legitimate,http,rgipt.ac.in,rgipt,ac,in,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
4,http://www.iracing.com/tracks/gateway-motorspo...,legitimate,http,www.iracing.com,www,iracing,com,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0


In [40]:
def converted_status(status):
    if status.lower() == "phishing":
        return 1
    else:
        return 0

data["binary_status"] = data["status"].apply(converted_status)

# Ahora vamos a ver como quedo el dataframe

data.head()

Unnamed: 0,url,status,protocol,domain,subdomain,secondleveldomain,topleveldomain,url_length_gt_75,has_https,suspicious_tld,...,contains_at_symbol,special_characters_count,https_in_domain,sensitive_words,domain_age_check,dot_count,port_in_url,file_extension_check,brand_name_in_domain,binary_status
0,http://www.crestonwood.com/router.php,legitimate,http,www.crestonwood.com,www,crestonwood,com,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0
1,http://shadetreetechnology.com/V4/validation/a...,phishing,http,shadetreetechnology.com,,shadetreetechnology,com,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
2,https://support-appleld.com.secureupdate.duila...,phishing,https,support-appleld.com.secureupdate.duilawyeryork...,support-appleld,com,secureupdate,1.0,1.0,0.0,...,0.0,8.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,1
3,http://rgipt.ac.in,legitimate,http,rgipt.ac.in,rgipt,ac,in,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0
4,http://www.iracing.com/tracks/gateway-motorspo...,legitimate,http,www.iracing.com,www,iracing,com,0.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0


### Visualización de resultados 

Genere un reporte de perfil con la librería pandas_profiling. Analice el reporte y determine las 
columnas que son constantes, o que no tienen una varianza alta con la columna status. Almacene su 
reporte como una página html.

In [43]:
import numpy as np

df = data

# Identificar columnas constantes
constant_columns = [col for col in df.columns if df[col].nunique() == 1]

# Preparar la columna 'status' para análisis cuantitativo
df['status_numeric'] = df['status'].apply(lambda x: 1 if x == 'phishing' else 0)

# Seleccionar solo columnas numéricas para el cálculo de correlación
numeric_cols = df.select_dtypes(include=[np.number])

# Calcular la correlación de las columnas numéricas con 'status_numeric'
correlation_with_status = numeric_cols.corr()['status_numeric'].abs().sort_values()

# Filtrar columnas con correlación baja (ejemplo, menor a 0.1) con 'status_numeric'
low_variance_with_status = correlation_with_status[correlation_with_status < 0.1].index.tolist()

# Remover 'status_numeric' si está en la lista
if 'status_numeric' in low_variance_with_status:
    low_variance_with_status.remove('status_numeric')

# Generar el reporte
report_data = pd.DataFrame({
    "Constant Columns": constant_columns,
    "Columns with Low Variance to Status": [col for col in low_variance_with_status if col != 'status_numeric']
}, index=[0])

# Guardar el reporte como HTML
report_file_path = "url_analysis_report.html"
report_data.to_html(report_file_path, index=False)

print("Reporte guardado en:", report_file_path)

NameError: name 'np' is not defined