### Author : Sanjoy Biswas
### Topic : Data Science For Security Analysis
### Email : sanjoy.eee32@gmail.com

## Identify files via file extensions

A regular expression to check for file extensions.  

Note: This approach is not recommended for strictly enforcing file types (parse the file header instead). However, this regex is still a useful alternative to, e.g., Python's `str.endswith` approach for quick pre-filtering of files of interest.

In [9]:
# Case-insensitive check for "<name>.<image extension>" with the extension
# anchored at the end of the string.
# Remove `(?i)` to make the regular expression case-sensitive.
pattern = r'(?i)(\w+)\.(jpeg|jpg|png|gif|tif|svg)$'

# File names that have to be accepted.
str_true = ('test.gif', 'image.jpeg', 'image.jpg', 'image.TIF')

# File names that have to be rejected.
str_false = ('test.pdf', 'test.gif.pdf')

for t in str_true:
    assert re.match(pattern, t) is not None, '%s is not True' % t
for f in str_false:
    assert re.match(pattern, f) is None, '%s is not False' % f

<br>
<br>

## Username validation

Checking for a valid user name that has a certain minimum and maximum length.

Allowed characters:
- letters (upper- and lower-case)
- numbers
- dashes
- underscores

In [10]:
min_len = 5  # minimum length for a valid username
max_len = 15  # maximum length for a valid username

# The inline flag `(?i)` has to be the very first token of the pattern:
# Python 3.11+ raises re.error for global flags placed anywhere else
# (earlier versions only emitted a DeprecationWarning).
pattern = r"(?i)^[a-z0-9_-]{%s,%s}$" % (min_len, max_len)

# remove `(?i)` to only allow lower-case letters

# User names that have to be accepted.
str_true = ('user123', '123_user', 'Username')

# Too short, too long, and containing a forbidden character, respectively.
str_false = ('user', 'username1234_is-way-too-long', 'user$34354')

for t in str_true:
    assert re.match(pattern, t) is not None, '%s is not True' % t
for f in str_false:
    assert re.match(pattern, f) is None, '%s is not False' % f

  from ipykernel import kernelapp as app


<br>
<br>

## Checking for valid email addresses

A regular expression that captures most email addresses.

In [11]:
# <local part>@<domain label>.<rest of domain>, anchored at both ends.
pattern = r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)"

# Addresses that have to be accepted.
str_true = ('test@mail.com',)

# Missing "@", missing local part, and missing dot in the domain.
str_false = ('testmail.com', '@testmail.com', 'test@mailcom')

for t in str_true:
    assert re.match(pattern, t) is not None, '%s is not True' % t
for f in str_false:
    assert re.match(pattern, f) is None, '%s is not False' % f

## Check for a valid URL

Checks whether a string looks like a URL, i.e. whether it ...

- starts with `https://`, or `http://`, or `www.`
- or ends with a dot extension

In [12]:
# Optional scheme, host, 2-6 character dot extension, optional path.
# Two fixes compared with the naive version of this pattern:
#   * it is a raw string, so `\/`, `\d`, `\.` and `\w` reach the regex engine
#     without relying on invalid string escapes (SyntaxWarning on 3.12+);
#   * the path group is `([...]*)` instead of `([...]*)*`. The nested
#     quantifier accepts the same strings but backtracks exponentially on
#     long inputs that do not match (ReDoS).
# Only lower-case input is accepted; the pattern is case-sensitive.
pattern = r'^(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)\/?$'

str_true = ('https://github.com',
            'http://github.com',
            'www.github.com',
            'github.com',
            'test.de',
            'https://github.com/rasbt',
            'test.jpeg',  # !!! plain file names with a short extension pass too
            )

str_false = ('testmailcom', 'http:testmailcom', )

for t in str_true:
    assert re.match(pattern, t) is not None, '%s is not True' % t

for f in str_false:
    assert re.match(pattern, f) is None, '%s is not False' % f

## Checking for numbers

### Positive integers

In [13]:
# Raw string: `\d` is a regex escape, not a string escape. In a plain string
# literal it only works by accident and triggers a SyntaxWarning on 3.12+.
pattern = r'^\d+$'

# Digit-only strings are accepted.
str_true = ('123', '1', )

# Letters and decimal points are rejected.
str_false = ('abc', '1.1', )

for t in str_true:
    assert re.match(pattern, t) is not None, '%s is not True' % t

for f in str_false:
    assert re.match(pattern, f) is None, '%s is not False' % f

### Negative integers

In [14]:
# Raw string: `\d` is a regex escape, not a string escape. In a plain string
# literal it only works by accident and triggers a SyntaxWarning on 3.12+.
pattern = r'^-\d+$'

# A leading minus sign followed by digits only.
str_true = ('-123', '-1', )

# Unsigned values, letters and decimals are rejected.
str_false = ('123', '-abc', '-1.1', )

for t in str_true:
    assert re.match(pattern, t) is not None, '%s is not True' % t

for f in str_false:
    assert re.match(pattern, f) is None, '%s is not False' % f

### All integers

In [15]:
# Raw string: `\d` is a regex escape, not a string escape. In a plain string
# literal it only works by accident and triggers a SyntaxWarning on 3.12+.
pattern = r'^-{0,1}\d+$'

# Optional minus sign followed by digits only.
str_true = ('-123', '-1', '1', '123',)

# Decimals and letters are rejected.
str_false = ('123.0', '-abc', '-1.1', )

for t in str_true:
    assert re.match(pattern, t) is not None, '%s is not True' % t

for f in str_false:
    assert re.match(pattern, f) is None, '%s is not False' % f

### Positive numbers

In [16]:
# Raw string: `\d` and `\.` are regex escapes, not string escapes. In a plain
# string literal they only work by accident (SyntaxWarning on 3.12+).
pattern = r'^\d*\.{0,1}\d+$'

# Unsigned integers and decimals.
str_true = ('1', '123', '1.234', )

# Anything with a leading minus sign is rejected.
str_false = ('-abc', '-123', '-123.0')

for t in str_true:
    assert re.match(pattern, t) is not None, '%s is not True' % t

for f in str_false:
    assert re.match(pattern, f) is None, '%s is not False' % f

### Negative numbers

In [17]:
# Raw string: `\d` and `\.` are regex escapes, not string escapes. In a plain
# string literal they only work by accident (SyntaxWarning on 3.12+).
pattern = r'^-\d*\.{0,1}\d+$'

# A leading minus sign followed by an integer or a decimal.
str_true = ('-1', '-123', '-123.0', )

# Unsigned values and letters are rejected.
str_false = ('-abc', '1', '123', '1.234', )

for t in str_true:
    assert re.match(pattern, t) is not None, '%s is not True' % t

for f in str_false:
    assert re.match(pattern, f) is None, '%s is not False' % f

### All numbers

In [19]:
# Raw string: `\d` and `\.` are regex escapes, not string escapes. In a plain
# string literal they only work by accident (SyntaxWarning on 3.12+).
pattern = r'^-{0,1}\d*\.{0,1}\d+$'

# Signed and unsigned integers and decimals.
str_true = ('1', '123', '1.234', '-123', '-123.0')

# The trailing comma makes this a one-element tuple. Without it the value is
# the plain string '-abc' and the loop below would test the single characters
# '-', 'a', 'b' and 'c' instead of the whole string.
str_false = ('-abc',)

for t in str_true:
    assert re.match(pattern, t) is not None, '%s is not True' % t

for f in str_false:
    assert re.match(pattern, f) is None, '%s is not False' % f

## Validating dates

Validates dates in `mm/dd/yyyy` format.

In [15]:
# Raw string: `\/` and `\d` are regex escapes, not string escapes. In a plain
# string literal they only work by accident (SyntaxWarning on 3.12+).
# Month 01-12, day 01-31, year 1900-2099; every field is zero-padded. The
# day range is not checked against the month (e.g. 02/31 passes).
pattern = r'^(0[1-9]|1[0-2])\/(0[1-9]|1\d|2\d|3[01])\/(19|20)\d{2}$'

# Well-formed mm/dd/yyyy dates.
str_true = ('01/08/2014', '12/30/2014', )

# Month out of range, not a date, and missing zero padding.
str_false = ('22/08/2014', '-123', '1/8/2014', '1/08/2014', '01/8/2014')

for t in str_true:
    assert re.match(pattern, t) is not None, '%s is not True' % t

for f in str_false:
    assert re.match(pattern, f) is None, '%s is not False' % f

### 12-Hour format

In [20]:
# The inline flag `(?i)` has to be the very first token of the pattern:
# Python 3.11+ raises re.error for global flags placed anywhere else
# (earlier versions only emitted a DeprecationWarning). The flag was global
# before as well, so the set of accepted strings is unchanged.
# Hour 1-12 without zero padding, minutes 00-59, optional whitespace, am/pm.
pattern = r'(?i)^(1[012]|[1-9]):[0-5][0-9](\s)?(am|pm)$'

# Valid 12-hour times, with and without a space before the am/pm marker.
str_true = ('2:00pm', '7:30 AM', '12:05 am', )

# Hour out of range, missing am/pm marker, and zero-padded hour.
str_false = ('22:00pm', '14:00', '3:12', '03:12pm', )

for t in str_true:
    assert re.match(pattern, t) is not None, '%s is not True' % t

for f in str_false:
    assert re.match(pattern, f) is None, '%s is not False' % f

  


### 24-Hour format

In [21]:
# Zero-padded hour 00-23, a colon, and minutes 00-59.
pattern = r'^([0-1]{1}[0-9]{1}|20|21|22|23):[0-5]{1}[0-9]{1}$'

# Valid 24-hour times.
str_true = ('14:00', '00:30', )

# Trailing am/pm marker and missing zero padding are rejected.
str_false = ('22:00pm', '4:00', )

for t in str_true:
    assert re.match(pattern, t) is not None, '%s is not True' % t
for f in str_false:
    assert re.match(pattern, f) is None, '%s is not False' % f

## Checking for HTML tags

In [16]:
# Opening, closing or self-closing tag with optional attributes; attribute
# values may be double-quoted, single-quoted or unquoted.
pattern = r"""</?\w+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>"""

# Strings that start with a well-formed tag.
str_true = ('<a>', '<a href="something">', '</a>', '<img src>')

# Missing "<", missing ">", and whitespace right after "<".
str_false = ('a>', '<a ', '< a >')

for t in str_true:
    assert re.match(pattern, t) is not None, '%s is not True' % t

for f in str_false:
    assert re.match(pattern, f) is None, '%s is not False' % f

## Checking for IP addresses

### IPv4

<font size="1px">Image source: http://en.wikipedia.org/wiki/File:Ipv4_address.svg</font>

In [8]:
# Four dot-separated octets, each in the range 0-255 (leading zeros allowed).
pattern = r'^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$'

# Well-formed dotted-quad addresses.
str_true = ('172.16.254.1', '1.2.3.4', '01.102.103.104', )

# Missing dot, a fifth octet, and embedded whitespace.
str_false = ('17216.254.1', '1.2.3.4.5', '01 .102.103.104', )

for t in str_true:
    assert re.match(pattern, t) is not None, '%s is not True' % t

for f in str_false:
    assert re.match(pattern, f) is None, '%s is not False' % f

### Ipv6

![](../Images/Ipv6_address.png)

In [21]:
# One alternative per possible position of the `::` abbreviation, each with
# an optional embedded dotted-quad tail; an optional `%...` suffix and
# surrounding whitespace are tolerated.
pattern = r'^\s*((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?\s*$'

# Full and abbreviated notations that have to be accepted.
str_true = (
    '2001:470:9b36:1::2',
    '2001:cdba:0000:0000:0000:0000:3257:9652',
    '2001:cdba:0:0:0:0:3257:9652',
    '2001:cdba::3257:9652',
)

str_false = (
    '1200::AB00:1234::2552:7777:1313',  # uses `::` twice
    '1200:0000:AB00:1234:O000:2552:7777:1313',  # contains an O instead of 0
)

for t in str_true:
    assert re.match(pattern, t) is not None, '%s is not True' % t

for f in str_false:
    assert re.match(pattern, f) is None, '%s is not False' % f

## Checking for MAC addresses

![](../Images/MACaddressV3.png)

In [22]:
# The inline flag `(?i)` has to be the very first token of the pattern:
# Python 3.11+ raises re.error for global flags placed anywhere else
# (earlier versions only emitted a DeprecationWarning).
# Six two-digit hex groups separated by ":" or "-".
pattern = r'(?i)^([0-9A-F]{2}[:-]){5}([0-9A-F]{2})$'

# Upper- and lower-case addresses with either separator.
str_true = ('94-AE-70-A0-66-83',
            '58-f8-1a-00-44-c8',
            '00:A0:C9:14:C8:29',
            )

# A one-digit group and embedded whitespace are rejected.
str_false = ('0:00:00:00:00:00',
             '94-AE-70-A0 -66-83', )

for t in str_true:
    assert re.match(pattern, t) is not None, '%s is not True' % t

for f in str_false:
    assert re.match(pattern, f) is None, '%s is not False' % f

  if sys.path[0] == '':


In [23]:
def check_ssl(url):
    """Tell whether `url` can be fetched with certificate verification on.

    Issues `requests.get` with `verify=True` and a 3 second timeout.
    Returns True when the request completes without raising, and False when
    anything at all goes wrong.
    """
    try:
        requests.get(url, verify=True, timeout=3)
    except Exception:
        # Any failure counts as "not valid".
        return False
    return True


check_ssl('http://atecobois.fr/es/app.html')

False

In [2]:
def get_ptr(url):
    """Return the PTR record associated with the host's IP address.

    Args:
        url: Mapping with a ``'host'`` entry holding either an IP address or
            a host name. A plain URL string is NOT accepted.

    Returns:
        The text of the first PTR answer; the string ``'True'`` when the
        resolved IP is empty; the string ``'False'`` when any step raises.
    """
    try:
        host = url['host']
        if valid_ip(host):
            ip = host
        else:
            # Not an IP literal: use the first A record of the host name.
            ip = resolver.query(host, 'A')[0].to_text()

        if not ip:
            return 'True'
        # Reverse name for the IP, then the first PTR answer for that name.
        r = reversename.from_address(ip)
        return resolver.query(r, 'PTR')[0].to_text()
    except Exception:
        # Best effort: every failure (bad input, DNS error, ...) maps to
        # the string 'False'.
        return 'False'


# The function indexes ``url['host']``, so it has to be given a mapping.
# Passing the raw URL string raises TypeError inside the ``try`` and the
# call then always yields 'False'.
get_ptr({'host': 'atecobois.fr'})

'False'

In [3]:
def get_country(url):
    """Return the ISO country code associated with the host's IP address.

    Args:
        url: Mapping with a ``'host'`` entry holding either an IP address or
            a host name. A plain URL string is NOT accepted.

    Returns:
        ``response.country.iso_code`` from the GeoLite2 country database;
        the string ``'True'`` when the resolved IP is empty; the string
        ``'False'`` when any step raises.
    """
    try:
        host = url['host']
        if valid_ip(host):
            ip = host
        else:
            # Not an IP literal: use the first A record of the host name.
            ip = resolver.query(host, 'A')[0].to_text()

        if not ip:
            return 'True'
        # Context manager so the database reader is closed again, matching
        # how get_asn_number handles its reader.
        with geoip2.database.Reader(PATH + 'GeoLite2-Country.mmdb') as reader:
            return reader.country(ip).country.iso_code
    except Exception:
        # Best effort: every failure maps to the string 'False'.
        return 'False'


# The function indexes ``url['host']``, so it has to be given a mapping.
# Passing the raw URL string raises TypeError inside the ``try`` and the
# call then always yields 'False'.
get_country({'host': 'atecobois.fr'})

'False'

In [4]:
def get_asn_number(url):
    """Return the ASN number associated with the host's IP address.

    Args:
        url: Mapping with a ``'host'`` entry holding either an IP address or
            a host name. A plain URL string is NOT accepted.

    Returns:
        ``response.autonomous_system_number`` from the GeoLite2 ASN
        database; the string ``'True'`` when the resolved IP is empty; the
        string ``'False'`` when any step raises.
    """
    try:
        with geoip2.database.Reader(PATH + 'GeoLite2-ASN.mmdb') as reader:
            host = url['host']
            if valid_ip(host):
                ip = host
            else:
                # Not an IP literal: use the first A record of the host name.
                ip = resolver.query(host, 'A')[0].to_text()

            if not ip:
                return 'True'
            return reader.asn(ip).autonomous_system_number
    except Exception:
        # Best effort: every failure maps to the string 'False'.
        return 'False'


# The function indexes ``url['host']``, so it has to be given a mapping.
# Passing the raw URL string raises TypeError inside the ``try`` and the
# call then always yields 'False'.
get_asn_number({'host': 'atecobois.fr'})

'False'

In [5]:
def check_blacklists_ip(url):
    """Check if the IP is malicious through Google Safebrowsing, Phishtank and WOT.

    Args:
        url: Mapping with a ``'host'`` entry holding either an IP address or
            a host name. A plain URL string is NOT accepted.

    Returns:
        ``True`` when at least one of the three checks returns a truthy
        value for the IP and ``False`` when none does; the string ``'True'``
        when the resolved IP is empty; the string ``'False'`` when any step
        raises.
    """
    try:
        host = url['host']
        if valid_ip(host):
            ip = host
        else:
            # Not an IP literal: use the first A record of the host name.
            ip = resolver.query(host, 'A')[0].to_text()

        if not ip:
            return 'True'
        # `or` short-circuits, so later services are only asked when the
        # earlier ones did not flag the IP.
        return bool(google_safebrowsing(ip) or phishtank(ip) or wot(ip))
    except Exception:
        # Best effort: every failure maps to the string 'False'.
        return 'False'


# The function indexes ``url['host']``, so it has to be given a mapping.
# Passing the raw URL string raises TypeError inside the ``try`` and the
# call then always yields 'False'.
check_blacklists_ip({'host': 'atecobois.fr'})

'False'

## Parsing Phone Number

In [None]:
import re

# Three dash-separated digit groups (3-3-4), each captured separately.
pattern = re.compile(r'^(\d{3})-(\d{3})-(\d{4})$')

# The echoed results are kept as comments so that the cell stays valid Python.
pattern.search('415-867-5309')
# <_sre.SRE_Match object at 0x02FCDD40>
pattern.search('415-867-5309').groups()
# ('415', '867', '5309')

# Extract Subdomain From a URL

In [None]:
from tldextract import extract

urls_file = "urlhaus_url.csv"
# URLs should be in column A without a heading, in the CSV file named above.

# `with` closes the file handle once all lines have been read.
with open(urls_file, errors='ignore') as handle:
    urls = [line.rstrip('\n') for line in handle]

# One "<subdomain>.<domain>.<suffix>" entry per input line.
lst = []
for url in urls:
    # Attribute access instead of tuple unpacking: it works whether the
    # result is a named tuple or (in newer tldextract releases) an object
    # that can no longer be unpacked into three values.
    parts = extract(url)
    # Join only the non-empty parts so that a missing subdomain or suffix
    # (e.g. a bare IP address) does not leave a stray leading/trailing dot.
    lst.append('.'.join(
        part for part in (parts.subdomain, parts.domain, parts.suffix) if part
    ))

# Extract Domain From a URL

In [None]:
from tldextract import extract

urls_file = "urlhaus_url.csv"
# URLs should be in column A without a heading, in the CSV file named above.

# `with` closes the file handle once all lines have been read.
with open(urls_file, errors='ignore') as handle:
    urls = [line.rstrip('\n') for line in handle]

# One "<domain>.<suffix>" entry per input line.
lst = []
for url in urls:
    # Attribute access instead of tuple unpacking: it works whether the
    # result is a named tuple or (in newer tldextract releases) an object
    # that can no longer be unpacked into three values.
    parts = extract(url)
    # Join only the non-empty parts so that a host without a public suffix
    # (e.g. a bare IP address) does not end up with a trailing dot.
    lst.append('.'.join(
        part for part in (parts.domain, parts.suffix) if part
    ))

# Count Name Servers for a URL

In [None]:
def count_name_servers(url):
    """Return number of NameServers (NS) resolved.

    Args:
        url: Mapping with a ``'host'`` entry. A plain URL string is NOT
            accepted. The same object is passed to ``count_ips``.

    Returns:
        The number of NS answers for the host, or for the first parent
        domain that answers when the host itself yields NoAnswer/NXDOMAIN.
        Returns 0 when ``count_ips(url)`` is falsy or every lookup fails.
    """
    count = 0
    if count_ips(url):
        try:
            return len(resolver.query(url['host'], 'NS'))
        except (resolver.NoAnswer, resolver.NXDOMAIN):
            # No NS record on the host itself: strip the left-most label and
            # retry on each parent domain until one of them answers.
            split_host = url['host'].split('.')
            while split_host:
                split_host.pop(0)
                supposed_domain = '.'.join(split_host)
                # NOTE(review): on the last iteration `supposed_domain` is
                # the empty string, and before that the bare top-level
                # domain is queried - confirm this is intended.
                try:
                    count = len(resolver.query(supposed_domain, 'NS'))
                    break
                except Exception:
                    count = 0
        except Exception:
            count = 0
    return count


# The function indexes ``url['host']``, so it has to be given a mapping
# rather than the raw URL string.
# NOTE(review): `count_ips` is defined elsewhere; it is assumed to accept the
# same mapping - TODO confirm.
count_name_servers({'host': 'atecobois.fr'})

# Time Activation of Domain

In [None]:
def time_activation_domain(url):
    """Return time (in days) of domain activation.

    Args:
        url: Mapping with a ``'host'`` entry. The mapping is not modified.

    Returns:
        The absolute number of days between the first WHOIS creation date
        of the host and now, or ``'?'`` when the WHOIS lookup returns
        nothing or any step of the lookup/parsing raises.
    """
    # Strip a leading "www." on a local copy; the caller's mapping keeps its
    # original host value.
    host = url['host']
    if host.startswith("www."):
        host = host[4:]

    # Set a 3 second default socket timeout before the WHOIS lookup.
    pythonwhois.net.socket.setdefaulttimeout(3.0)
    try:
        result_whois = pythonwhois.get_whois(host.lower())
        if not result_whois:
            return '?'
        # Keep only the first whitespace-separated token of the creation
        # date, which is then parsed as "YYYY-MM-DD".
        creation_date = str(result_whois['creation_date'][0])
        formated_date = " ".join(creation_date.split()[:1])
        d1 = datetime.strptime(formated_date, "%Y-%m-%d")
        d2 = datetime.now()
        return abs((d2 - d1).days)
    except Exception:
        # Missing creation date, unparsable date, network error, ...
        return '?'

Special Thanks to :
    1.https://sebastianraschka.com/
    2.towardsdatascience.com
    3.https://docs.python.org/3/library/re.html