In [1]:
import csv
import fileinput
import re
from os.path import abspath

In [36]:
def append_to_txt(output_name, list_name):
    """Convert list to .txt"""
    with open(output_name, 'a') as file_handle:
        for list_item in list_name:
            file_handle.write('%s\n' % list_item)

def cleaner(*filepath, outputname):
    """Inplace removal of protocol (http://), prefix (www)
    and paths (/something.html) from list of urls.
    Ensure that is a .com
    """
    # Inplace removal of protocol, prefix, paths
    # also, backup file as name_bak
    for path in filepath:
        path = abspath(path)
        with fileinput.FileInput(
                path, inplace=True, backup='_bak') as file_handle:
            for line in file_handle:
                print(re.sub(r'(.*\/\/|www\.)|(\/.*)|(.*[@_!#$%^&*()<>?\|}{~:].*)', '', line), end='')
        # Read file into list
        domain_list = [line.rstrip('\n').lower() for line in open(path) if line]
        # Remove duplicates
        domain_list = set(domain_list)
        
        # Grab those that ends with .com and strip
        dotcom = [domain.strip('.com') for domain in domain_list if domain.endswith(".com")]
        # Grab those that does not have .
        dotcom = [domain for domain in dotcom if '.' not in domain]
        # Remove duplicates
        dotcom = list(set(dotcom))
        
        # Size checking
        print(
            'initial length: ' + str(len(domain_list)) + ' | ' +
            'after clean: ' + str(len(dotcom)) + ' | ' +
            'lost of: ' + str(len(domain_list) - len(dotcom)) +
            ' or {:.5}%'.format(str((len(domain_list) - len(dotcom)) / len(domain_list) * 100)))
        
        # Write captured domains name on file
        append_to_txt('./data/lists/cleaned/' + outputname, dotcom)

In [41]:
alexa = 'data/lists/legitimate/public_alexa_500k_found'
malwaredomain = 'data/lists/malicious/public_malwaredomains'
openphish = 'data/lists/malicious/public_openphish'
phishtank = 'data/lists/malicious/public_phishtank'

cleaner(alexa, outputname='legitimate_cleaned')
cleaner(malwaredomain, openphish, phishtank, outputname='malicious_cleaned')

initial length: 434737 | after clean: 202818 | lost of: 231919 or 53.34%
initial length: 26819 | after clean: 10362 | lost of: 16457 or 61.36%
initial length: 1400 | after clean: 418 | lost of: 982 or 70.14%
initial length: 5689 | after clean: 1896 | lost of: 3793 or 66.67%
