## BULK WHOIS AND IP ADDRESS SCRAPER

Investigative journalists often want to find information about masses of websites, for example to find out if gambling sites are clustered in certain locations. 

This script lets you load a list of addresses of any length (into the 'checklist' area) and returns WhoIs and IP address info in a dataframe. 


In [205]:
# libraries
import re
from urllib.parse import urlparse

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup, Tag

In [206]:
# URLs to look up -- add or replace entries here

checklist = [
    'https://pandas.pydata.org/',
    'https://www.1bet2u.cc/',
    'https://www.cashpoint.com/',
]


In [207]:
### WHOIS finder

def whois(site):
    """Fetch the WHOIS record for ``site`` from whois.com and store it in ``d``.

    The raw record is read from the page's ``df-block-raw`` block, everything
    from ``>>> Last update`` onward is discarded, and each remaining line that
    contains a colon is split into a ``key: value`` pair.  The resulting dict
    is saved as ``d[site]['WhoIs']`` in the notebook-level results dict ``d``.

    Parameters
    ----------
    site : str
        Domain to look up (e.g. ``'pandas.pydata.org'``).  It is appended to
        the whois.com lookup URL and used as the key into ``d``.

    Raises
    ------
    AttributeError
        If the page contains no ``df-block-raw`` block (``soup.find`` returns
        ``None``).
    """
    whod = {}

    # Use the argument, not the notebook-global ``s``, so the function looks
    # up the domain it was actually called with.
    url = "https://www.whois.com/whois/" + site
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    print('WHOIS: ', url)

    raw = soup.find("div", {"class": "df-block-raw"}).contents[1].get_text()

    # Keep only the text before the ">>> Last update" marker.
    raw = raw.split('>>> Last update')[0]

    for line in raw.split('\n'):
        try:
            k, v = line.split(':', 1)
        except ValueError:
            # No colon on this line, so it is not a "key: value" pair.
            continue
        whod[k] = v.replace('\r', '').strip()

    # setdefault keeps this working even if the caller has not created the
    # per-domain entry yet.
    d.setdefault(site, {})['WhoIs'] = whod

In [208]:
def ipaddress(site,s):
    """Look up IP/location info for ``site`` on check-host.net and store it in ``d``.

    Each ``<table class="inside_info">`` on the result page is treated as one
    numbered entry (``IP_1``, ``IP_2``, ...).  From every table the
    ``IP address``, ``Country``, ``Region`` and ``City`` rows are kept under
    keys such as ``'IP_1: Country'``.  The resulting dict is saved as
    ``d[s]['IPs']`` in the notebook-level results dict ``d``.

    Parameters
    ----------
    site : str
        Address passed to check-host.net as the ``host`` query value.
    s : str
        Key of the already-existing entry in ``d`` to store the result under.
    """
    ipd = {}

    url = "https://check-host.net/ip-info?host=" + site
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    tables = soup.find_all('table', attrs={'class':'inside_info'})

    wanted = ('IP address', 'Country', 'Region', 'City')

    for n, table in enumerate(tables, start=1):

        ip_x = 'IP_' + str(n)

        for row in table.find_all('tr'):

            cols = [ele.text.strip() for ele in row.find_all('td')]

            # Rows whose list repr is 100+ characters are skipped
            # (presumably long free-text rows -- TODO confirm intent).
            if len(str(cols)) >= 100:
                continue

            try:
                k = cols[0]
                if k not in wanted:
                    continue

                if k == 'Country':
                    # The value is taken from the second line of the cell,
                    # with the surrounding parentheses removed.
                    v = cols[1].split('\n')[1]
                    v = v.replace('(','').replace(')','')
                else:
                    v = cols[1]
            except IndexError:
                # Row has too few cells (or the Country cell has no second
                # line); nothing usable to record.
                continue

            ipd[ip_x +': ' + k] = v

    d[s]['IPs'] = ipd

In [168]:
# mini version for test purposes

#checklist = checklist[30:]

In [209]:
# Results keyed by bare domain; ipaddress() and whois() fill in each entry.
d = {}

for site in checklist:

    # Take the host part of the URL rather than chaining str.replace():
    # replace() removes "http", "www." and "/" anywhere in the string, which
    # mangles hosts such as "httpbin.org" and glues any path onto the domain.
    # A scheme-less entry gets a "//" prefix so urlparse still sees a netloc.
    parsed = urlparse(site if '://' in site else '//' + site)
    s = parsed.netloc
    if s.startswith('www.'):
        s = s[len('www.'):]
    d[s] = {}

    ipaddress(site,s)
    whois(s)

WHOIS:  https://www.whois.com/whois/pandas.pydata.org
WHOIS:  https://www.whois.com/whois/1bet2u.cc
WHOIS:  https://www.whois.com/whois/cashpoint.com


In [None]:
### NOTE: IF THIS STEP GIVES AN ERROR MESSAGE, IT MAY BE A WHOIS ISSUE. 

# Solution: click the link of the last URL and show you are not a robot

In [211]:
# Flatten d[domain][section][field] into refined[domain][field] so that each
# domain becomes one DataFrame column and each field one row.
refined = {}
for domain, sections in d.items():
    merged = {}
    for fields in sections.values():
        # Later sections overwrite earlier ones on a key clash.
        merged.update(fields)
    refined[domain] = merged

df = pd.DataFrame(refined)

df

Unnamed: 0,pandas.pydata.org,1bet2u.cc,cashpoint.com
IP_1: IP address,104.26.1.204,99.86.90.54,99.86.90.113
IP_1: Country,CA,FR,FR
IP_1: Region,Quebec,Île-de-France,Île-de-France
IP_1: City,Montreal,Paris,Paris
IP_2: IP address,104.26.1.204,99.86.90.54,99.86.90.113
IP_2: Country,US,FR,FR
IP_2: Region,California,Ile-de-France,Ile-de-France
IP_2: City,San Francisco,Paris,Paris
IP_3: IP address,104.26.1.204,99.86.90.54,99.86.90.113
IP_3: Country,US,US,US


In [182]:
# pickle

#df.to_pickle("./pickledinfo.pkl")

## SEPARATED VERSIONS

In [212]:
# Standalone IP lookup: d[domain][IP_n] = {'IP address': ..., 'Country': ..., ...}
d = {}

wanted = ('IP address', 'Country', 'Region', 'City')

for site in checklist:

    print('.', end='')  # progress marker, one dot per site

    # Take the host part of the URL rather than chaining str.replace():
    # replace() removes "http", "www." and "/" anywhere in the string, which
    # mangles hosts such as "httpbin.org" and glues any path onto the domain.
    # A scheme-less entry gets a "//" prefix so urlparse still sees a netloc.
    parsed = urlparse(site if '://' in site else '//' + site)
    s = parsed.netloc
    if s.startswith('www.'):
        s = s[len('www.'):]
    d[s] = {}

    url = "https://check-host.net/ip-info?host=" + site
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    tables = soup.find_all('table', attrs={'class':'inside_info'})

    # Each "inside_info" table becomes one numbered entry: IP_1, IP_2, ...
    for n, table in enumerate(tables, start=1):

        ip_x = 'IP_' + str(n)
        d[s][ip_x] = {}

        for row in table.find_all('tr'):

            cols = [ele.text.strip() for ele in row.find_all('td')]

            # Rows whose list repr is 100+ characters are skipped
            # (presumably long free-text rows -- TODO confirm intent).
            if len(str(cols)) >= 100:
                continue

            try:
                k = cols[0]
                if k not in wanted:
                    continue

                if k == 'Country':
                    # The value is taken from the second line of the cell,
                    # with the surrounding parentheses removed.
                    v = cols[1].split('\n')[1]
                    v = v.replace('(','').replace(')','')
                else:
                    v = cols[1]
            except IndexError:
                # Row has too few cells (or the Country cell has no second
                # line); nothing usable to record.
                continue

            d[s][ip_x][k] = v

# One column per (domain, IP_n) pair, then transposed so each pair is a row.
df = pd.DataFrame.from_dict(
    {(domain, label): fields
     for domain, entries in d.items()
     for label, fields in entries.items()},
).T

pd.set_option('display.max_rows', None)

df

...

Unnamed: 0,Unnamed: 1,IP address,Country,Region,City
pandas.pydata.org,IP_1,104.26.1.204,CA,Quebec,Montreal
pandas.pydata.org,IP_2,104.26.1.204,US,California,San Francisco
pandas.pydata.org,IP_3,104.26.1.204,US,,
1bet2u.cc,IP_1,99.86.90.54,FR,Île-de-France,Paris
1bet2u.cc,IP_2,99.86.90.54,FR,Ile-de-France,Paris
1bet2u.cc,IP_3,99.86.90.54,US,,
cashpoint.com,IP_1,99.86.90.113,FR,Île-de-France,Paris
cashpoint.com,IP_2,99.86.90.113,FR,Ile-de-France,Paris
cashpoint.com,IP_3,99.86.90.113,US,,


In [213]:
# Standalone WHOIS lookup: d[domain] = {whois field: value, ...}
d = {}

for site in checklist:

    whod = {}

    print('.', end='')  # progress marker, one dot per site

    # Take the host part of the URL rather than chaining str.replace():
    # replace() removes "http", "www." and "/" anywhere in the string, which
    # mangles hosts such as "httpbin.org" and glues any path onto the domain.
    # A scheme-less entry gets a "//" prefix so urlparse still sees a netloc.
    parsed = urlparse(site if '://' in site else '//' + site)
    s = parsed.netloc
    if s.startswith('www.'):
        s = s[len('www.'):]

    url = "https://www.whois.com/whois/" + s
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    print(url)

    raw = soup.find("div", {"class": "df-block-raw"}).contents[1].get_text()

    # Keep only the text before the ">>> Last update" marker.
    raw = raw.split('>>> Last update')[0]

    for line in raw.split('\n'):
        try:
            k, v = line.split(':', 1)
        except ValueError:
            # No colon on this line, so it is not a "key: value" pair.
            continue
        whod[k] = v.replace('\r', '').strip()

    d[s] = whod

    #for k,v in whod.items():
       # print(k,'\t\t',v)


# One column per domain, one row per WHOIS field.
dfWho = pd.DataFrame(d)

dfWho
     

.https://www.whois.com/whois/pandas.pydata.org
.https://www.whois.com/whois/1bet2u.cc
.https://www.whois.com/whois/cashpoint.com


Unnamed: 0,pandas.pydata.org,1bet2u.cc,cashpoint.com
Domain Name,PYDATA.ORG,1bet2u.cc,cashpoint.com
Registry Domain ID,D164141878-LROR,131316734_DOMAIN_CC-VRSN,1812607_DOMAIN_COM-VRSN
Registrar WHOIS Server,whois.namecheap.com,whois.godaddy.com,whois.udag.net
Registrar URL,http://www.namecheap.com,http://www.godaddy.com,https://www.united-domains.de/
Updated Date,2019-11-26T04:18:35Z,2019-09-06T06:27:59Z,2020-10-01T14:45:14Z
Creation Date,2011-12-16T18:38:18Z,2017-09-05T04:10:30Z,1996-04-13T04:00:00Z
Registry Expiry Date,2028-12-16T18:38:18Z,,
Registrar Registration Expiration Date,,2021-09-05T04:10:30Z,2027-04-14T04:00:00Z
Registrar,"NameCheap, Inc.","GoDaddy.com, LLC",united domains AG
Registrar IANA ID,1068,146,1408


-----