# Website Prediction - Revised Approach

## Method 1

In [1]:
import requests
import numpy as np
from bs4 import BeautifulSoup

#### Adding `http://` to links that don't already have a scheme

In [2]:
def full_url(link):
    """Return `link` with an 'http://' prefix unless it already contains '://'."""
    if "://" in link:
        # A scheme separator is already present, so the link is left untouched
        return link
    return "http://" + link

#### Getting links from text file

In [3]:
# Load the whole websites file into a single string; it is split into
# individual links later, when the lookups are performed
with open('../PredictWebsite/websites.txt') as links_file:
    links = links_file.read()

#### Variables used

In [4]:
# Request headers: a browser-like User-Agent is sent with every request
headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"

# Lookup site queried to find the CMS used by a website
base_url = "http://whatcms.org/"

# Outcome collected per link: either the 'CMS Name' or 'np.nan'
results = list()

#### Get the Content Management System (CMS) used by each website, using `http://whatcms.org` to look up the CMS information

In [None]:
# Iterate through the links and record the CMS reported for each one.
# splitlines() avoids the empty trailing entry that split('\n') produces when
# the file ends with a newline.
for link in links.splitlines():
    # Drop surrounding whitespace (e.g. a stray '\r') and skip blank lines so
    # that no lookup is made for a bare "http://"
    link = link.strip()
    if not link:
        continue

    # Convert the normal link to full link e.g google.com --> http://google.com
    act_link = full_url(link)

    # Query-string parameters sent along with the GET request
    payload = (
        ('s', act_link),
        ('na', ''),
        ('nb', 'e93fa94402'),
        ('nc', '582ecab4a6358ecb0ea7fa8999e37ee1'),
    )

    # Use the proxies, and auth parameters if behind proxy.
    # NOTE(review): proxyDict and auth are not defined in the visible part of
    # this notebook -- they must be defined before this cell is run.
    r = requests.get(base_url, proxies=proxyDict, auth=auth, headers=headers, params=payload)
    soup = BeautifulSoup(r.content, 'lxml')

    # The CMS name is the text of the first <a> inside <div id="wcresult">.
    # find() returns None when an element is missing, so check explicitly
    # instead of hiding every possible error behind a bare `except`.
    result_box = soup.find('div', {'id': 'wcresult'})
    cms_anchor = result_box.find('a') if result_box is not None else None
    if cms_anchor is not None:
        results.append(cms_anchor.text)
    else:
        # No CMS information found on the result page for this link
        results.append(np.nan)

#### The above method returns a list with the CMS used, but most of the values were nan. No promising results were obtained.

## Method 2

#### This method uses the `BuiltWith` site to get information about the type of website. If it is an e-commerce website, the site also reports the platform used to build it.

In [5]:
import requests
import urlparse
from bs4 import BeautifulSoup

#### Variables used

In [6]:
# Request headers: a browser-like User-Agent is sent with every request
headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"

# Site whose per-website pages are fetched; each link is joined onto this URL
base_url = "https://builtwith.com/"

#### Get the list of websites to find the results

In [7]:
# Load the whole websites file into a single string; it is split into
# individual links when the lookups are performed
with open('../PredictWebsite/websites.txt') as links_file:
    links = links_file.read()

In [8]:
# A single Session object is shared by all the requests made further down
session = requests.Session()
# NOTE(review): trust_env = False presumably stops requests from picking up
# proxy/auth settings from the environment -- confirm against the requests docs
session.trust_env = False

#### Finding out the type of website and saving it in results_revised.txt

In [None]:
# For every website, fetch its page under base_url and write one
# "<link>\t<True|False>" line telling whether an 'Ecommerce' entry was found.
with open('../results/results_revised.txt', 'w') as outfile:

    # Open each link separately from the variable. splitlines() works whether
    # or not the file ends with a newline, whereas split('\n')[:-1] silently
    # drops the last website when the trailing newline is missing.
    for link in links.splitlines():
        # Drop surrounding whitespace such as a stray '\r'
        link = link.strip()

        # Remove '.' from trailing IP address
        if link.endswith('.'):
            link = link[:-1]

        # Nothing left to look up (blank line): skip it rather than failing
        # on link[-1] or requesting the bare base_url
        if not link:
            continue

        result = 'False'

        # Adding base_url to the link
        response = session.get(urlparse.urljoin(base_url, link), headers=headers)
        # Creating a soup object
        soup = BeautifulSoup(response.content, 'html.parser')

        # Look at the active <li> of every 'titleBox' div; the first one
        # mentioning 'Ecommerce' settles the answer for this website
        for item in soup.find_all('div', 'titleBox'):
            res = item.find('li', 'active')

            # find() returns None when the div has no active <li>
            if res is not None and 'Ecommerce' in res.text:
                result = 'True'
                break

        # Writing it to the file
        outfile.write('{link}\t{result}\n'.format(link=link, result=result))

In [9]:
import pandas as pd

#### Open the 'results_revised' to find out the count

In [10]:
# Load the tab-separated results file; it has no header row, so the two
# column names are supplied explicitly
df = pd.read_csv(
    '../results/results_revised.txt',
    sep='\t',
    names=['url', 'result'],
)

In [11]:
# Preview the first rows to check the 'url' and 'result' columns
df.head()

Unnamed: 0,url,result
0,radiounumanele.com,False
1,garage-autodep.com,False
2,adbagency.com,False
3,timelesspiece.eu,False
4,watchesbestdeals.com,False


#### Total TRUE count

In [12]:
# Number of rows whose 'result' value equals True
df['result'].eq(True).sum()

104

#### Total False count

In [13]:
# Number of rows whose 'result' value equals False
df['result'].eq(False).sum()

396