In [1]:
#Importing Dependencies
import requests
from urllib.parse import urljoin, urlunsplit, urlparse
import bs4
from bs4 import BeautifulSoup
from bs4.element import Comment

from collections import Counter
from string import punctuation
import csv
import pandas as pd
import numpy as np


import newspaper
from newspaper import Article
from newspaper import fulltext
import requests


#Tokenization Of Sentences
import nltk
# nltk.download('punkt')

#Readability Scores
import textstat




In [2]:
def get_metadata(soup):
    #Get Title Tag Of Page
    title_text = soup.title.getText()
    #Title Tag Length In Characters
    title_tag_length = len(title_text)
    # First get the meta description tag
    description = soup.find('meta', attrs={'name':'og:description'}) or soup.find('meta', attrs={'property':'description'}) or soup.find('meta', attrs={'name':'description'})
    # If description meta tag was found, then get the content attribute and save it to db entry
    if description:
        meta_description = description.get('content') 
    return title_text, title_tag_length, meta_description


def technical_page_metrics(soup, req, text):
    #Page_Size_In_Bytes
    page_size_in_bytes = len(req.content)

    #Plain_text_size
    plain_text_size = len(text)

    #plain_text_rate --> plaintext rate value (plain_text_size / page_size)
    plain_text_rate = (plain_text_size / page_size_in_bytes) * 100
    
    #Encoding 
    encoding = req.encoding
    
    #Detecting SSL Encryption
    if 's' in req.url:
        SSL = True
    else:
        SSL = False
        
    return page_size_in_bytes, plain_text_size, plain_text_rate, encoding, SSL

In [3]:
from tqdm import tqdm, tqdm_notebook

In [4]:
urls = ['https://www.theguardian.com/world/2019/aug/12/ebola-now-curable-after-trials-of-drugs-in-drc-say-scientists',
'https://edition.cnn.com/2019/08/12/politics/legal-immigration-public-charge/index.html']


def Article_scraper(urls):
        
    master_dict = {

            'HTML_Content': [],
            'Full_Text': [],
        
            #### Article Information 
            'Authors': [],
            'Publish_Date': [],
            'Article_Text': [],
            'Article_Text_Length': [],
            'Has_Top_Image': [],
            'Number_of_Movies': [],
            'Article_Is_Media_News': [],
            'Website_Has_Favicon': [],
            'Number_Of_Images':[],
            'Is_Valid_Body': [],

#           #### NLP Features
            'Setences_Text': [],
            'Number_Of_Sentences': [],
            'Lexicon_Count': [],
    
#           #### Readability Scores
            'Flesch_Reading_Ease_formula': [],
            'Flesch_Kincaid_Grade_Level': [],
            'FOG_Scale': [],
            'SMOG_Index': [],
            'ARI_Index': [],

        
#           #### Meta_Data
            'Title_Text': [],
            'Title_Tag_Length' : [],
            'Meta_Description' : [],
            'Meta_Description_Length': [],
        
        
#           #### Extract_page_features
            'Body_Content_Links': [],
            'Number_Of_Links': [],
            'Links_To_Text_Ratio': [],
        
        
            ### Technical Page Metrics
             'Page_Size_In_Bytes': [],
             'Plain_Text_Size': [],
             'Plain_Text_Rate': [],
             'Encoding': [],
             'Status_Code': []

    }
    
    
    for url in tqdm_notebook(urls):
        article = Article(url)
        article.download()
        article.parse()
        
        #Loading Requests
        req = requests.get(url)
        status_code = req.status_code
        
        #Scraping Article Metrics
        author = article.authors
        publish_date = article.publish_date
        html = req.text
        text = fulltext(html)
        article_text = article.text
        article_top_image = len(article.top_image)
        article_movies = len(article.movies)
        article_is_media_news = article.is_media_news()
        favicon = article.meta_favicon
        has_top_image = article.has_top_image()
        number_of_images = len(article.images)
        is_valid_body = article.is_valid_body()
        
        #Article NLP
        article.nlp()
        main_keywords = article.keywords
        lexicon_count = textstat.lexicon_count(article_text, removepunct=True)
        
        #Extracting Sentences
        sentences = nltk.sent_tokenize(article_text)
        number_of_sentences = len(sentences)
        
        
        #Extract Article Readability Scores
        Flesch_Reading_Ease_formula = textstat.flesch_reading_ease(article_text)
        Flesch_Kincaid_Grade_Level = textstat.flesch_kincaid_grade(article_text)
        FOG_Scale = textstat.gunning_fog(article_text)
        SMOG_Index = textstat.smog_index(article_text)
        ARI_Index = textstat.automated_readability_index(article_text)
        
        
        #BeautifulSoup Extraction
        soup = BeautifulSoup(html)
        #1. Get Meta_Data
        meta_data = get_metadata(soup)
        
        #2. Extract Text Metrics From Article Text
        body_links , number_of_links,  Links_To_Text_Ratio = get_article_links(article_text, soup)

        #3. Extract Technical Page Metrics
        page_size_in_bytes, plain_text_size, plain_text_rate, encoding, SSL = technical_page_metrics(soup, req, html)
        

        #### Dictionary Inserts #####
        master_dict['HTML_Content'].append(html)
        master_dict['Full_Text'].append(text)

        master_dict['Authors'].append(author)
        master_dict['Publish_Date'].append(publish_date)
        master_dict['Article_Text'].append(article_text) 
        master_dict['Article_Text_Length'].append(len(article_text))
        
        if has_top_image == True:
            master_dict['Has_Top_Image'] .append(1)
        else:
            master_dict['Has_Top_Image'].append(np.nan)

        if article_movies != 0:
            master_dict['Number_of_Movies'].append(article_movies)
        else:
            master_dict['Number_of_Movies'].append(0)

        if article_is_media_news == True:
            master_dict['Article_Is_Media_News'].append(1)
        else:
            master_dict['Article_Is_Media_News'].append(0)

        if favicon == True:
            master_dict['Website_Has_Favicon'] .append(1)
        else:
            master_dict['Website_Has_Favicon'] .append(0)
            
        if number_of_images != 0:
            master_dict['Number_Of_Images'].append(number_of_images)
        else:
            master_dict['Number_Of_Images'].append(0)

        if is_valid_body == True:
            master_dict['Is_Valid_Body'].append(1)
        else: 
            master_dict['Is_Valid_Body'].append(0)
        
        ### NLP Features
        
        master_dict['Setences_Text'].append(sentences)
        master_dict['Number_Of_Sentences'].append(number_of_sentences)
        master_dict['Lexicon_Count'].append(lexicon_count)
        
        ### Readability Scores   
        master_dict['Flesch_Reading_Ease_formula'].append(Flesch_Reading_Ease_formula)
        master_dict['Flesch_Kincaid_Grade_Level'].append(Flesch_Kincaid_Grade_Level)
        master_dict['FOG_Scale'].append(FOG_Scale)
        master_dict['SMOG_Index'].append(SMOG_Index)
        master_dict['ARI_Index'].append(ARI_Index)

        master_dict['Title_Text'].append(meta_data[0])
        master_dict['Title_Tag_Length'].append(meta_data[1])
        master_dict['Meta_Description'].append(meta_data[2])
        master_dict['Meta_Description_Length'].append(len(meta_data[2]))
        
        ### Additional Page Features
        if len(body_links) != 0:
            master_dict['Body_Content_Links'].append(body_links)
        else:
            master_dict['Body_Content_Links'].append(np.nan)
            
        if number_of_links != 0:
            master_dict['Number_Of_Links'].append(number_of_links)
        else:
            master_dict['Number_Of_Links'].append(0)
            
        if Links_To_Text_Ratio != 0:
            master_dict['Links_To_Text_Ratio'].append(Links_To_Text_Ratio)
        else:
            master_dict['Links_To_Text_Ratio'].append(np.nan)
            
        master_dict['Page_Size_In_Bytes'].append(page_size_in_bytes)
        master_dict['Plain_Text_Size'].append(plain_text_size)
        master_dict['Plain_Text_Rate'].append(plain_text_rate)
        master_dict['Encoding'].append(encoding)
        master_dict['Status_Code'].append(status_code)


    df = pd.DataFrame(master_dict)
        
        #Piping Everything Into A DataFrame
    return df

In [5]:
def get_article_links(article_text, soup):
    soup = BeautifulSoup(article_text)
    body_links = soup.find_all('a')
    number_of_links = len(body_links)
    
    
    #Links To Text Ratio
    Links_To_Text_Ratio = len(body_links) / len(article_text)
    return body_links, number_of_links, Links_To_Text_Ratio

In [23]:
urls = ['https://steemit.com/intellectualproperty/@businesses/protect-your-intellectual-property-by-copywriting-it',
 'https://www.copywritingsecretsforfitpros.com/optin-1570150217064783',
 'https://guruniagaakademi.com/ebook-ebook/',
 'https://www.searchenginejournal.com/seo-writer-conversion-copy-tips/265005/',
 'https://blog.hubspot.com/marketing/scientifically-proven-more-clicks-on-content',
]

In [6]:
Article_scraper(urls)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




Unnamed: 0,HTML_Content,Full_Text,Authors,Publish_Date,Article_Text,Article_Text_Length,Has_Top_Image,Number_of_Movies,Article_Is_Media_News,Website_Has_Favicon,...,Meta_Description,Meta_Description_Length,Body_Content_Links,Number_Of_Links,Links_To_Text_Ratio,Page_Size_In_Bytes,Plain_Text_Size,Plain_Text_Rate,Encoding,Status_Code
0,"\n<!DOCTYPE html>\n<html id=""js-context"" class...",Ebola can no longer be called an incurable dis...,"[Sarah Boseley, Susannah Mayhew, Samuel Boland...",2019-08-12,Ebola can no longer be called an incurable dis...,4241,1,0,0,0,...,Congo results show good survival rates for pat...,83,,0,,804660,804590,99.991301,utf-8,200
1,"<!DOCTYPE html><html class=""no-js""><head><meta...",Washington (CNN) The Trump administration rele...,"[Priscilla Alvarez, Geneva Sands, Tami Luhby]",2019-08-12,Washington (CNN) The Trump administration rele...,1470,1,0,0,0,...,The Trump administration released a regulation...,212,,0,,1477164,1476908,99.982669,utf-8,200
