In [41]:
from bs4 import BeautifulSoup as bs
from bs4 import Comment
import pandas as pd
import numpy as np
import re
import hashlib
from math import nan
import os
# Let wide frames show up to 50 columns before pandas elides the middle ones.
pd.set_option('display.max_columns', 50)

In [42]:
# Schema of the per-file feature table: one row per HTML file, columns grouped
# by the element family they describe. Order here is the column order of `df`.
column_list = [
    # file-level facts
    'filename', 'hash', 'size', 'title',
    # <div>
    'div_count', 'div_classes', 'div_ids',
    # <input>
    'input_count', 'input_names',
    # <p>
    'p_count', 'p_classes', 'p_ids',
    # <script>
    'script_count', 'script_src', 'script_len',
    # <img>
    'img_count', 'img_src',
    # <link>
    'link_count', 'link_hrefs',
    # <a>
    'a_count', 'a_hrefs',
    # <h1>..<h6>
    'headings_contents',
    # <meta>
    'meta_count', 'meta_names',
    # <iframe>
    'iframe_count', 'iframe_src', 'iframe_id', 'iframe_names',
    # <span>
    'span_count', 'span_classes',
]

# Empty frame carrying only the schema.
df = pd.DataFrame(columns=column_list)

In [43]:
# assign directory
directory = '/path/to/folder'


def _attr_values(tags, attr):
    """Return the `attr` attribute of every tag in `tags` (None where it is absent)."""
    return [tag.get(attr) for tag in tags]


def _extract_features(path):
    """Parse one HTML file and return its feature row as a dict keyed like `column_list`.

    Parameters
    ----------
    path : str or os.PathLike (an os.DirEntry works)
        File to read. It is decoded strictly as UTF-8.

    Returns
    -------
    dict
        One value per name in `column_list`. `title` is None when the document
        has no <title> element.

    Raises
    ------
    OSError, UnicodeDecodeError
        Propagated from reading/decoding the file; the caller decides how to
        report them.
    """
    # Text read: feeds the parser and the character count stored in `size`.
    with open(path, 'r', encoding='utf-8') as f:
        html = f.read()
    # Binary read: the hash must be computed over the raw bytes on disk.
    with open(path, 'rb') as f:
        fbytes = f.read()

    # Name the parser explicitly. Without it BeautifulSoup picks whichever
    # parser happens to be installed, so the same file can yield a different
    # tree (and different counts) on another machine.
    soup = bs(html, 'html.parser')

    # A page without <title> used to raise AttributeError here and the whole
    # file was dropped from the table; record a missing title as None instead.
    title_tag = soup.title
    title = title_tag.get_text() if title_tag is not None else None

    divs = soup.find_all('div')
    inputs = soup.find_all('input')
    p_tags = soup.find_all('p')
    script_tags = soup.find_all('script')
    img_tags = soup.find_all('img')
    link_tags = soup.find_all('link')
    a_tags = soup.find_all('a')
    headings = soup.find_all(re.compile("^h[1-6]$"))
    meta_tags = soup.find_all('meta')
    iframes = soup.find_all('iframe')
    span_tags = soup.find_all('span')

    return {
        'filename': os.path.basename(path),
        'hash': hashlib.sha256(fbytes).hexdigest(),
        # Kept as the string form of the decoded character count, as before.
        'size': str(len(html)),
        'title': title,
        'div_count': len(divs),
        'div_classes': _attr_values(divs, 'class'),
        'div_ids': _attr_values(divs, 'id'),
        'input_count': len(inputs),
        'input_names': _attr_values(inputs, 'name'),
        'p_count': len(p_tags),
        'p_classes': _attr_values(p_tags, 'class'),
        'p_ids': _attr_values(p_tags, 'id'),
        'script_count': len(script_tags),
        'script_src': _attr_values(script_tags, 'src'),
        # Length of each script's inline text; 0 for scripts loaded via src.
        'script_len': [len(tag.get_text()) for tag in script_tags],
        'img_count': len(img_tags),
        'img_src': _attr_values(img_tags, 'src'),
        'link_count': len(link_tags),
        'link_hrefs': _attr_values(link_tags, 'href'),
        'a_count': len(a_tags),
        'a_hrefs': _attr_values(a_tags, 'href'),
        'headings_contents': [tag.get_text() for tag in headings],
        'meta_count': len(meta_tags),
        'meta_names': _attr_values(meta_tags, 'name'),
        'iframe_count': len(iframes),
        'iframe_src': _attr_values(iframes, 'src'),
        'iframe_id': _attr_values(iframes, 'id'),
        'iframe_names': _attr_values(iframes, 'name'),
        'span_count': len(span_tags),
        'span_classes': _attr_values(span_tags, 'class'),
    }


# iterate over files in that directory
# Rows are collected in a list and turned into a frame once at the end:
# concatenating inside the loop copies the whole table on every file, and
# appending to an existing `df` duplicated every row when the cell was re-run.
records = []
with os.scandir(directory) as entries:  # closes the directory handle when done
    for filename in entries:
        if not filename.is_file():
            continue  # sub-directories cannot be opened as HTML
        try:
            records.append(_extract_features(filename))
        except Exception as exc:
            # Best effort: skip the file, but say which one and why.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print("Error when attempting to parse " + filename.path + ": " + repr(exc))

df = pd.DataFrame(records, columns=column_list)

In [44]:
# Bare last expression of the cell, so the notebook renders the feature table.
df

Unnamed: 0,filename,hash,size,title,div_count,div_classes,div_ids,input_count,input_names,p_count,p_classes,p_ids,script_count,script_src,script_len,img_count,img_src,link_count,link_hrefs,a_count,a_hrefs,headings_contents,meta_count,meta_names,iframe_count,iframe_src,iframe_id,iframe_names,span_count,span_classes
0,6a0f7b6f57e2032a6a8c90bc493818bd280727e22c91fc...,6a0f7b6f57e2032a6a8c90bc493818bd280727e22c91fc...,855920,New website 1 » Home,62,"[None, None, [layerContainer], [ww_footer], [c...","[body, container, None, footerGroup_en, footer...",9,"[None, None, None, None, None, None, None, Non...",4,"[None, [ww_advert_x], None, None]","[None, None, None, ww_advertisement_button]",29,"[None, None, None, None, None, None, None, Non...","[2, 3082, 166, 204, 2941, 3191, 19203, 166, 0,...",1,"[data:image/gif;base64,R0lGODlhAQABAAAAACH5BAE...",6,"[https://www.googletagmanager.com, https://you...",4,"[https://mail.yahoo.com/d/folders/1, https://w...","[Sign in, to access AT&T Mail and, Currently.com]",13,"[None, None, None, None, None, None, None, Non...",0,[],[],[],0,[]
1,678716d779a0890dd66b773495fb8d1a6a73b3a0b749fd...,678716d779a0890dd66b773495fb8d1a6a73b3a0b749fd...,30611,Login Screen | Upgrading,1,[None],[app],0,[],0,[],[],11,[https://cdn3.editmysite.com/app/checkout/asse...,"[0, 0, 0, 0, 0, 0, 0, 140, 19979, 1287, 2026]",0,[],22,"[/square.ico, https://cdn3.editmysite.com/app/...",0,[],[],13,"[None, viewport, generator, apple-mobile-web-a...",0,[],[],[],0,[]
2,ac2a08f4ac6fccaaaf94c7ca5b1a9345701bf1b02ddf7b...,ac2a08f4ac6fccaaaf94c7ca5b1a9345701bf1b02ddf7b...,224,https://cutt.ly/ANl2Nzu,0,[],[],0,[],0,[],[],1,[None],[68],0,[],0,[],0,[],[],1,[None],0,[],[],[],0,[]
3,b5ca56b3a3b54d881bca4d5b6177f05c8a5c67a29a0438...,b5ca56b3a3b54d881bca4d5b6177f05c8a5c67a29a0438...,11504,Bu ayın en popüler tanışma sitesi,23,"[[adult], [container], [stepbox, clearfix], [s...","[None, None, None, None, None, None, None, Non...",1,[None],1,[[t-center]],[None],5,"[https://code.jquery.com/jquery-3.3.1.min.js, ...","[0, 1136, 244, 0, 0]",0,[],0,[],3,[/?a=1871809&cr=55836&h=ZrULsNbNjhLVXtbLrSvROx...,"[UYARI!, Aşağıda verilen KURALLARI uygulamak z...",6,"[None, None, viewport, robots, theme-color, ms...",0,[],[],[],5,"[None, None, [shadow], None, [shadow]]"
4,6e2af092be7d9e32e639670c08e935aa933db7280bd3af...,6e2af092be7d9e32e639670c08e935aa933db7280bd3af...,48083,AT&T,137,"[None, None, None, [p9b27], None, [BuY5Fd], [T...","[None, None, None, None, None, None, None, Non...",1,[None],1,"[[CDt4Ke, zfr3Q]]",[None],10,"[None, None, None, None, None, https://apis.go...","[59, 37, 2286, 0, 4218, 0, 455, 463, 40, 0]",1,[https://lh3.googleusercontent.com/JdioLeYlPaM...,4,[https://ssl.gstatic.com/atari/images/public/f...,2,"[/view/attjdjkfk/home, http://www.google.com/u...",[],15,"[None, viewport, None, referrer, None, None, N...",0,[],[],[],22,"[[xjKiLb], [Ce1Y1c], [A37UZe, qgcB3c, iHd5yb],..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,8a52a42eb9a6048ed6d0c3416dc4b87031bfa0f56299e6...,8a52a42eb9a6048ed6d0c3416dc4b87031bfa0f56299e6...,30611,Login Screen | Upgrading,1,[None],[app],0,[],0,[],[],11,[https://cdn3.editmysite.com/app/checkout/asse...,"[0, 0, 0, 0, 0, 0, 0, 140, 19979, 1287, 2026]",0,[],22,"[/square.ico, https://cdn3.editmysite.com/app/...",0,[],[],13,"[None, viewport, generator, apple-mobile-web-a...",0,[],[],[],0,[]
96,0c4411520a549dfb31c0030b1dc3741e2885183aed39b7...,0c4411520a549dfb31c0030b1dc3741e2885183aed39b7...,28963,Login Screen | at&t,1,[None],[app],0,[],0,[],[],11,[https://cdn3.editmysite.com/app/checkout/asse...,"[0, 0, 0, 0, 0, 0, 0, 140, 18191, 1290, 2026]",0,[],22,"[https://www.weebly.com/favicon.ico, https://c...",0,[],[],13,"[None, viewport, generator, apple-mobile-web-a...",0,[],[],[],0,[]
97,25649c5cebc6fbb566ef3ceb92d6bc1e8f93f84bccce67...,25649c5cebc6fbb566ef3ceb92d6bc1e8f93f84bccce67...,22635,AT&T - Home,54,"[[wrapper], [edison-header], [container], [hea...","[None, None, None, None, None, None, None, Non...",8,"[_u828699259873465707, _u576646981696144953, w...",0,[],[],26,"[/files/theme/MutationObserver.js, None, https...","[0, 117, 0, 0, 0, 1756, 56, 266, 35, 32, 0, 22...",3,"[/uploads/1/4/3/6/143662109/attdownload.png, /...",7,"[//fonts.googleapis.com/css?family=Karla:400,7...",5,"[/, #, None, None, https://www.weebly.com/sign...",[Sign in],8,"[None, None, None, None, None, None, None, vie...",0,[],[],[],8,"[[wsite-logo], [form-required], [form-required..."
98,ed49a00517c648490f5d071b92aa6cdae9654410c7ea70...,ed49a00517c648490f5d071b92aa6cdae9654410c7ea70...,1060,HTML Document,0,[],[],0,[],0,[],[],1,[None],[576],0,[],0,[],0,[],[],5,"[None, viewport, X-UA-Compatible, robots, None]",0,[],[],[],0,[]


In [52]:
# Rows for pages that contain at least one <div> and at least one <img>.
df.loc[df['div_count'].gt(0) & df['img_count'].gt(0)]

Unnamed: 0,filename,hash,size,title,div_count,div_classes,div_ids,input_count,input_names,p_count,p_classes,p_ids,script_count,script_src,script_len,img_count,img_src,link_count,link_hrefs,a_count,a_hrefs,headings_contents,meta_count,meta_names,iframe_count,iframe_src,iframe_id,iframe_names,span_count,span_classes
0,6a0f7b6f57e2032a6a8c90bc493818bd280727e22c91fc...,6a0f7b6f57e2032a6a8c90bc493818bd280727e22c91fc...,855920,New website 1 » Home,62,"[None, None, [layerContainer], [ww_footer], [c...","[body, container, None, footerGroup_en, footer...",9,"[None, None, None, None, None, None, None, Non...",4,"[None, [ww_advert_x], None, None]","[None, None, None, ww_advertisement_button]",29,"[None, None, None, None, None, None, None, Non...","[2, 3082, 166, 204, 2941, 3191, 19203, 166, 0,...",1,"[data:image/gif;base64,R0lGODlhAQABAAAAACH5BAE...",6,"[https://www.googletagmanager.com, https://you...",4,"[https://mail.yahoo.com/d/folders/1, https://w...","[Sign in, to access AT&T Mail and, Currently.com]",13,"[None, None, None, None, None, None, None, Non...",0,[],[],[],0,[]
4,6e2af092be7d9e32e639670c08e935aa933db7280bd3af...,6e2af092be7d9e32e639670c08e935aa933db7280bd3af...,48083,AT&T,137,"[None, None, None, [p9b27], None, [BuY5Fd], [T...","[None, None, None, None, None, None, None, Non...",1,[None],1,"[[CDt4Ke, zfr3Q]]",[None],10,"[None, None, None, None, None, https://apis.go...","[59, 37, 2286, 0, 4218, 0, 455, 463, 40, 0]",1,[https://lh3.googleusercontent.com/JdioLeYlPaM...,4,[https://ssl.gstatic.com/atari/images/public/f...,2,"[/view/attjdjkfk/home, http://www.google.com/u...",[],15,"[None, viewport, None, referrer, None, None, N...",0,[],[],[],22,"[[xjKiLb], [Ce1Y1c], [A37UZe, qgcB3c, iHd5yb],..."
7,ef426030f692a6f6bfcaa93e456ccb7baa63a8122dbe42...,ef426030f692a6f6bfcaa93e456ccb7baa63a8122dbe42...,28688,uegih | Linkr.Bio,60,"[None, [landing_wrapper], [land], [land-backgr...","[app, None, None, None, None, None, None, None...",0,[],1,[[link-title]],[None],14,[https://www.googletagmanager.com/gtag/destina...,"[0, 0, 0, 0, 0, 222, 0, 0, 0, 0, 4486, 0, 0, 0]",6,"[data:image/png;base64,iVBORw0KGgoAAAANSUhEUgA...",12,"[https://static.linkr.bio/favicon.ico?v=1, /im...",1,[None],[],20,"[None, None, None, viewport, description, face...",3,[https://googleads.g.doubleclick.net/pagead/ad...,"[aswift_0, None, google_esf]","[aswift_0, None, google_esf]",14,"[None, None, [portrait-container, avatar-pictu..."
8,dfc7d152792e643688d4f2a93a28d6e55bffe206b660ee...,dfc7d152792e643688d4f2a93a28d6e55bffe206b660ee...,376464,PCLIFE-SHOP · 送料無料,353,"[[mobile-container, mobile-main-menu-container...","[None, None, None, None, None, None, None, Non...",21,"[search, quantity, product_id, quantity, produ...",1,"[[text-center, cart-empty]]",[None],19,"[None, None, catalog/view/theme/journal3/lib/m...","[3013, 8213, 0, 0, 0, 0, 332, 179, 0, 0, 0, 0,...",30,[https://dysonup.com/image/cache/catalog/2560p...,11,[catalog/view/theme/journal3/icons/fonts/icomo...,109,"[None, None, None, https://dysonup.com/index.p...","[James Dyson Award 2021 国内最優秀賞が決定, コードレス掃除機, ヘ...",19,"[None, viewport, None, description, None, None...",0,[],[],[],185,"[None, None, [links-text], [links-text], [link..."
17,f227879f175a575a6b38ba589e992c6ca5b817a62f7f67...,f227879f175a575a6b38ba589e992c6ca5b817a62f7f67...,18037,hfu4 | Linkr.Bio,58,"[None, None, [landing_wrapper], [land], [land-...","[__nuxt, __layout, None, None, None, None, Non...",0,[],1,[[link-title]],[None],8,[https://pagead2.googlesyndication.com/pagead/...,"[0, 257, 0, 3503, 0, 0, 0, 0]",6,"[data:image/png;base64,iVBORw0KGgoAAAANSUhEUgA...",9,"[/favicon.ico, /_nuxt/3d1a81b.js, /_nuxt/555d5...",0,[],[],16,"[None, viewport, facebook-domain-verification,...",0,[],[],[],12,"[None, None, [portrait-container, avatar-pictu..."
18,edb1aabeadf067cbbc07d8596228c8e5dcc1017c71fd4f...,edb1aabeadf067cbbc07d8596228c8e5dcc1017c71fd4f...,22635,AT&T - Home,54,"[[wrapper], [edison-header], [container], [hea...","[None, None, None, None, None, None, None, Non...",8,"[_u239776137349356626, _u613351968785561938, w...",0,[],[],26,"[/files/theme/MutationObserver.js, None, https...","[0, 117, 0, 0, 0, 1756, 56, 266, 35, 32, 0, 22...",3,"[/uploads/1/4/3/6/143662151/attdownload.png, /...",7,"[//fonts.googleapis.com/css?family=Karla:400,7...",5,"[/, #, None, None, https://www.weebly.com/sign...",[Sign in],8,"[None, None, None, None, None, None, None, vie...",0,[],[],[],8,"[[wsite-logo], [form-required], [form-required..."
20,b0cf531a27ccd583c0a085b5174f0a8d4550d70ff1a28b...,b0cf531a27ccd583c0a085b5174f0a8d4550d70ff1a28b...,4196,Login Screen,14,"[[navigation, w-nav], [navigation-wrap], [menu...","[None, None, None, None, None, None, None, Non...",3,"[email, field, None]",0,[],[],5,[https://ajax.googleapis.com/ajax/libs/webfont...,"[0, 185, 181, 0, 0]",3,[https://uploads-ssl.webflow.com/635667e9cb91f...,3,[https://uploads-ssl.webflow.com/635667e9cb91f...,1,[/],[],7,"[None, None, None, None, None, viewport, gener...",0,[],[],[],0,[]
21,0d09c071eb51ca856189b72351d499a97adc6afd90e36f...,0d09c071eb51ca856189b72351d499a97adc6afd90e36f...,7464,【楽天】ログイン,16,"[None, None, None, None, None, None, None, Non...","[None, None, None, container, language, conten...",21,"[geGeneralTags, geGeneralTags, geGeneralTags, ...",8,"[[comment], [fomName], [fomName], None, [cente...","[None, None, None, None, None, None, None, None]",11,"[static/js/jquery-1.12.4.min.js, static/js/hin...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 254, 146]",7,"[static/picture/rakuten_pc_32px@2x_wm.png, sta...",5,"[static/css/ichiba_chat_appender_v1_0.css, sta...",15,"[#, #, #, #, #, #, #, #, #, #, #, #, #, #, #]","[, 楽天会員ログイン, まだ楽天会員に登録されていない方]",3,"[None, None, None]",0,[],[],[],4,"[None, None, None, None]"
22,9b86940e0c1bc130220d50c020216d8f6b7e888ad9057f...,9b86940e0c1bc130220d50c020216d8f6b7e888ad9057f...,13905,Najpopularniejsza strona randkowa w tym miesiącu,49,"[[adult], [logo], [location], [heading, t-cent...","[None, None, None, None, None, None, None, Non...",1,[None],5,"[[t-center], None, None, None, None]","[None, None, None, None, None]",6,"[https://code.jquery.com/jquery-3.3.1.min.js, ...","[0, 0, 1881, 242, 0, 0]",1,[/lstatic/2a5dbe2661c9bda678132269afe633c9/ima...,0,[],9,[/?a=1871809&cr=54238&h=OPUksmRmZSNNpaIdQBuxlm...,"[Uwaga!, Pytanie 1, Pytanie 2, Pytanie 3, Dzię...",6,"[None, None, viewport, robots, theme-color, ms...",0,[],[],[],9,"[None, None, [city], None, [c-accent], None, [..."
24,d33b26093b8ade3dc2b50575ac563ce11ae930a2502318...,d33b26093b8ade3dc2b50575ac563ce11ae930a2502318...,22636,AT&T - Home,54,"[[wrapper], [edison-header], [container], [hea...","[None, None, None, None, None, None, None, Non...",8,"[_u828699259873465707, _u576646981696144953, w...",0,[],[],26,"[/files/theme/MutationObserver.js, None, https...","[0, 117, 0, 0, 0, 1756, 56, 266, 35, 32, 0, 22...",3,"[/uploads/1/4/3/6/143662109/attdownload.png, /...",7,"[//fonts.googleapis.com/css?family=Karla:400,7...",5,"[/, #, None, None, https://www.weebly.com/sign...",[Sign in],8,"[None, None, None, None, None, None, None, vie...",0,[],[],[],8,"[[wsite-logo], [form-required], [form-required..."
