In [1]:
#required packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pickle
import time
import urllib.parse 
import re

# Silence the warning urllib3 emits for every request made without certificate
# verification (extract_html calls requests.get with verify=False).
requests.packages.urllib3.disable_warnings()

# Extra OpenSSL cipher rules appended to the library defaults. Cipher lists are
# colon-separated, so the leading ':' is required; without it 'HIGH' is fused
# onto the last existing token instead of being added as its own rule.
_EXTRA_CIPHERS = ':HIGH:!DH:!aNULL'

try:
    requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += _EXTRA_CIPHERS
except AttributeError:
    # This urllib3 build does not expose DEFAULT_CIPHERS; there is nothing to
    # patch, and failing here would stop the whole notebook at the first cell.
    pass

try:
    requests.packages.urllib3.contrib.pyopenssl.DEFAULT_SSL_CIPHER_LIST += _EXTRA_CIPHERS
except AttributeError:
    # pyopenssl support is not loaded / not available; nothing to patch.
    pass



In [2]:
# Turn the "Link" column of the concept-keyword CSV into a list of plain,
# lower-case keyword strings.

df_orig = pd.read_csv("dataframe.csv")

# Per entry: percent-decode, drop the 'wiki/' path prefix, turn underscores
# into spaces, lower-case. Only the first 300 cleaned keywords are kept.
df_clean = [
    urllib.parse.unquote(str(raw_link)).replace('wiki/','').replace('_',' ').lower()
    for raw_link in df_orig["Link"]
][0:300]

df_clean

['undefined behavior',
 'sql injection',
 'iso 8601',
 'same origin policy',
 'singleton pattern',
 'single responsibility principle',
 'model–view–controller',
 'database normalization',
 'dependency injection',
 'byte order mark',
 'trie',
 "don't repeat yourself",
 'jsonp',
 'java version history',
 'utf-8',
 'cross-origin resource sharing',
 'observer pattern',
 'base64',
 'maintenance mode',
 'android (operating system)',
 'endianness',
 'levenshtein distance',
 'strategy pattern',
 'factory method pattern',
 'json',
 'regular expression',
 'liskov substitution principle',
 'legacy system',
 'representational state transfer',
 'list of http status codes',
 'w/api.php',
 'unix time',
 'visitor pattern',
 'hash table',
 'cross-site scripting',
 'decorator pattern',
 'post/redirect/get',
 'short-circuit evaluation',
 'cron',
 'ascii',
 'data uri scheme',
 'floating point',
 'coordinated universal time',
 'resource acquisition is initialization',
 "two's complement",
 'fisher–yates sh

In [3]:
# with open("kw_clean.txt", "wb") as fp:   #Pickling
#     pickle.dump(df_clean, fp)

In [4]:
#request URL function; can handle various connection errors
# One outcome code is appended to connect_fail_cause per extract_html() call:
#   0 = page fetched and parsed
#   1 = requests.ConnectionError
#   2 = requests.Timeout
#   3 = requests.TooManyRedirects
#   4 = any other requests.RequestException
connect_fail_cause = []
def extract_html(link):
    """Download ``link`` and parse it with BeautifulSoup's ``html.parser``.

    Parameters
    ----------
    link : str
        URL of the page to fetch.

    Returns
    -------
    BeautifulSoup or str
        The parsed document on success, or the sentinel string
        ``"CONNECT_FUBAR"`` when the request failed with a requests-level
        error. Callers distinguish the two cases by checking for ``str``.

    Side effects
    ------------
    Appends exactly one outcome code (see the table above the function) to the
    module-level ``connect_fail_cause`` list.

    Notes
    -----
    NOTE(review): ``verify=False`` disables TLS certificate verification, so
    the fetched content is not authenticated. It is kept as-is because the
    notebook deliberately requests pages without verification.
    """
    try:
        resp = requests.get(link, verify=False, timeout=15) #will fail after 15secs for efficiency
        soup = BeautifulSoup(resp.content, "html.parser")
    except requests.ConnectionError:
        # Listed first on purpose: requests' ConnectTimeout derives from both
        # ConnectionError and Timeout, and is recorded here as code 1.
        connect_fail_cause.append(1)
        return "CONNECT_FUBAR"
    except requests.Timeout:
        connect_fail_cause.append(2)
        return "CONNECT_FUBAR"
    except requests.TooManyRedirects:
        connect_fail_cause.append(3)
        return "CONNECT_FUBAR"
    except requests.RequestException:
        # Every other requests failure (invalid or missing URL schema, broken
        # chunked encoding, undecodable content, ...). Without this handler a
        # single bad result URL would abort the whole scraping run.
        connect_fail_cause.append(4)
        return "CONNECT_FUBAR"

    connect_fail_cause.append(0)
    return soup

In [5]:
#extract text from website html

def html_to_text(html):
    """Return the visible text of all ``<p>`` and ``<li>`` elements in ``html``.

    Parameters
    ----------
    html : object with ``find_all`` (e.g. a BeautifulSoup document), str or None
        A string (such as the ``"CONNECT_FUBAR"`` sentinel returned by
        ``extract_html``) or ``None`` means there is no markup to read.

    Returns
    -------
    str
        The element texts joined by single spaces, with every run of spaces,
        tabs, carriage returns and newlines collapsed to one space. Returns
        ``''`` for string / ``None`` input or when no matching element exists.
    """
    # Nothing to extract from a failure sentinel or a missing document.
    if html is None or isinstance(html, str):
        return ''

    pieces = [tag.text for tag in html.find_all(['p','li'])] #extracted tag types

    # Join with a space so the last word of one element is not glued to the
    # first word of the next, then normalise all whitespace (tabs included) to
    # single spaces so that no two words are merged.
    return re.sub('[ \t\r\n]+', ' ', ' '.join(pieces))





In [6]:
#exclude site objects empty / insufficient length

def not_empty(site_obj):
    """Tell whether a site object holds a usable page.

    ``site_obj`` is a dict with at least a ``'soup'`` entry and, for pages that
    pass the first two checks, a ``'raw_text'`` entry.

    Returns ``False`` when ``'soup'`` equals the ``'CONNECT_FUBAR'`` failure
    marker, when ``'soup'`` is falsy, or when ``'raw_text'`` is shorter than
    120 characters; returns ``True`` otherwise.
    """
    page = site_obj['soup']

    # Failed download or nothing parsed: reject without looking at the text.
    if page == 'CONNECT_FUBAR' or not page:
        return False

    # Keep only pages with at least 120 characters of extracted text.
    return len(site_obj['raw_text']) >= 120

In [7]:
# Elapsed wall-clock seconds collected by fetch_pages():
#   single_page_time - one entry per search result processed
#   google_page_time - one entry per Google search request
single_page_time, google_page_time = [], []

In [8]:
#drives previous functions to gather HTML/text of x pages for a given keyword into an array
def fetch_pages(query, pages_per_kw):
    """Search Google for ``query`` and scrape up to ``pages_per_kw`` result pages.

    Parameters
    ----------
    query : str
        Keyword to search for. It is stored unchanged in each site object and
        URL-encoded for the search request.
    pages_per_kw : int
        Maximum number of site objects to return for this keyword.

    Returns
    -------
    list of dict
        Site objects with keys ``"kw"``, ``"title"``, ``"link"``, ``"soup"``
        and ``"raw_text"``, restricted to those accepted by ``not_empty``.
        The list is empty when the search request fails or does not answer
        with HTTP 200, so callers always get one list per keyword.

    Side effects
    ------------
    Prints progress markers, appends the search duration to
    ``google_page_time`` and one duration per processed result to
    ``single_page_time``.
    """
    print('\nnew_kw_query')
    final = []  # bound up front so every exit path returns a list
    start_time_google = time.time()
    keyword = query
    # quote_plus still maps spaces to '+', and additionally percent-encodes
    # characters such as '+', '&', '/', apostrophes and non-ASCII dashes that
    # would otherwise corrupt the query string.
    query = urllib.parse.quote_plus(query)
    URL = f"https://google.com/search?q={query}&num=24" #set number of results as num
    headers = {"user-agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"}
    try:
        # Browser-like headers are sent so the search request is accepted
        # consistently; the timeout matches the one used by extract_html.
        resp = requests.get(URL, headers=headers, timeout=15)
    except requests.RequestException as err:
        google_page_time.append(time.time()-start_time_google)
        print('new_kw_query_failed:', err)
        return final
    google_page_time.append(time.time()-start_time_google)

    if resp.status_code != 200:
        # e.g. rate limiting; previously this path hit an unbound `final`.
        print('new_kw_query_failed: HTTP', resp.status_code)
        return final

    soup = BeautifulSoup(resp.content, "html.parser")
    print('new_kw_query_success')
    for g in soup.find_all('div', class_='rc'): #div.rc element contains search result URLs
        if len(final) >= pages_per_kw: #quota reached: stop instead of idling through the rest
            break

        heading = g.find('h3')
        anchor = g.find('a')
        link = anchor.get('href') if anchor is not None else None
        if heading is None or not link:
            # Result block without a title or a target URL: nothing to fetch.
            continue

        start_time_single = time.time()
        title = heading.text
        print(link)
        soup2 = extract_html(link)
        print('finished_extracting')
        raw_text = html_to_text(soup2)
        item = { #site object
            "kw": keyword,
            "title": title,
            "link": link,
            "soup": soup2,
            "raw_text": raw_text
                }

        if not_empty(item):
            final.append(item)
        single_page_time.append(time.time()-start_time_single)
        print('site_query_complete')

    return final

In [9]:
#execute fetch_pages() here
# master is nested in two layers: keyword ---> site objects for that keyword
master = []
start, stop = 0, 30
x = df_clean[start:stop]
count = 0
for count, kw in enumerate(x, start=1):
    master.append(fetch_pages(kw, 16)) #second argument: maximum pages kept per keyword
    print(int((count/(stop-start))*100),'% complete') #prints % progress of scraping


new_kw_query
new_kw_query_success
https://en.wikipedia.org/wiki/Undefined_behavior
finished_extracting
site_query_complete
https://en.cppreference.com/w/cpp/language/ub
finished_extracting
site_query_complete
https://en.cppreference.com/w/c/language/behavior
finished_extracting
site_query_complete
https://www.geeksforgeeks.org/undefined-behavior-c-cpp/
finished_extracting
site_query_complete
https://raphlinus.github.io/programming/rust/2018/08/17/undefined-behavior.html
finished_extracting
site_query_complete
https://wiki.c2.com/?UndefinedBehavior
finished_extracting
site_query_complete
https://blog.llvm.org/posts/2011-05-13-what-every-c-programmer-should-know/
finished_extracting
site_query_complete
https://blog.regehr.org/archives/213
finished_extracting
site_query_complete
https://embeddedartistry.com/blog/2017/01/09/a-guide-to-undefined-behavior-in-c-and-c-part-1/
finished_extracting
site_query_complete
https://gist.github.com/Earnestly/7c903f481ff9d29a3dd1
finished_extracting
sit

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


finished_extracting
site_query_complete
https://docs.fedoraproject.org/en-US/Fedora_Security_Team/1/html/Secure_Ruby_Development_Guide/ch03s02s02.html
finished_extracting
site_query_complete
https://www.usenix.org/conference/usenixsecurity17/technical-sessions/presentation/schwenk
finished_extracting
site_query_complete
https://security.stackexchange.com/questions/8264/why-is-the-same-origin-policy-so-important
finished_extracting
site_query_complete
http://aosabook.org/en/500L/the-same-origin-policy.html
finished_extracting
site_query_complete
https://link.springer.com/chapter/10.1007/978-3-642-22137-8_11
finished_extracting
site_query_complete
13 % complete

new_kw_query
new_kw_query_success
https://en.wikipedia.org/wiki/Singleton_pattern
finished_extracting
site_query_complete
https://www.tutorialspoint.com/design_pattern/singleton_pattern.htm
finished_extracting
site_query_complete
https://refactoring.guru/design-patterns/singleton
finished_extracting
site_query_complete
https://so

finished_extracting
site_query_complete
https://docs.microsoft.com/en-us/aspnet/core/fundamentals/dependency-injection
finished_extracting
site_query_complete
https://angular.io/guide/dependency-injection
finished_extracting
site_query_complete
https://www.youtube.com/watch/IKD2-MAkXyQ
finished_extracting
site_query_complete
https://martinfowler.com/articles/injection.html
finished_extracting
site_query_complete
https://developer.android.com/training/dependency-injection
finished_extracting
site_query_complete
https://www.vogella.com/tutorials/DependencyInjection/article.html
finished_extracting
site_query_complete
https://auth0.com/blog/dependency-injection-in-dotnet-core/
finished_extracting
site_query_complete
https://developer.android.com/training/dependency-injection/manual
finished_extracting
site_query_complete
https://doc.sitecore.com/developers/93/sitecore-experience-manager/en/dependency-injection.html
finished_extracting
site_query_complete
https://docs.angularjs.org/guide/d

finished_extracting
site_query_complete
http://techxposer.com/2017/11/14/understanding-jsonp-security-issues/
finished_extracting
site_query_complete
https://gist.github.com/3380256
finished_extracting
site_query_complete
https://github.com/jaubourg/jquery-jsonp
finished_extracting
site_query_complete
https://www.getfilecloud.com/blog/using-jsonp-for-cross-domain-requests/
finished_extracting
site_query_complete
https://www.loginradius.com/engineering/blog/understanding-jsonp/
finished_extracting
site_query_complete
https://dev.socrata.com/docs/cors-and-jsonp.html
finished_extracting
site_query_complete
https://dojotoolkit.org/documentation/tutorials/1.6/jsonp/
finished_extracting
site_query_complete
https://www.ionos.com/digitalguide/websites/web-development/jsonp/
finished_extracting
site_query_complete
43 % complete

new_kw_query
new_kw_query_success
https://en.wikipedia.org/wiki/Java_version_history
finished_extracting
site_query_complete
https://en.wikipedia.org/wiki/Java_(softwar

finished_extracting
site_query_complete
https://developer.mozilla.org/en-US/docs/Glossary/Base64
finished_extracting
site_query_complete
https://docs.python.org/3/library/base64.html
finished_extracting
site_query_complete
https://tools.ietf.org/html/rfc4648
finished_extracting
site_query_complete
https://www.base64-image.de/
finished_extracting
site_query_complete
https://golang.org/pkg/encoding/base64/
finished_extracting
site_query_complete
https://cryptii.com/pipes/base64-to-text
finished_extracting
site_query_complete
https://www.freeformatter.com/base64-encoder.html
finished_extracting
site_query_complete
https://ruby-doc.org/stdlib-2.5.3/libdoc/base64/rdoc/Base64.html
finished_extracting
site_query_complete
https://docs.oracle.com/javase/8/docs/api/java/util/Base64.html
finished_extracting
site_query_complete
https://docs.julialang.org/en/v1/stdlib/Base64/
finished_extracting
site_query_complete
https://www.php.net/manual/en/function.base64-encode.php
finished_extracting
site_qu

finished_extracting
site_query_complete
https://towardsdatascience.com/how-to-improve-the-performance-of-a-machine-learning-model-with-post-processing-employing-b8559d2d670a
finished_extracting
site_query_complete
https://www.researchgate.net/publication/262088592_Improved_versions_of_the_Levenshtein_distance_method_for_comparing_sequence_information_in_animals%27_vocalisations_Tests_using_humpback_whale_song
finished_extracting
site_query_complete
https://nlp.stanford.edu/IR-book/html/htmledition/edit-distance-1.html
finished_extracting
site_query_complete
http://www.levenshtein.net/
finished_extracting
site_query_complete
73 % complete

new_kw_query
new_kw_query_success
https://en.wikipedia.org/wiki/Strategy_pattern
finished_extracting
site_query_complete
https://www.tutorialspoint.com/design_pattern/strategy_pattern.htm
finished_extracting
site_query_complete
https://refactoring.guru/design-patterns/strategy
finished_extracting
site_query_complete
https://sourcemaking.com/design_pat

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


finished_extracting
site_query_complete
https://www.baeldung.com/creational-design-patterns
finished_extracting
site_query_complete
https://www.oreilly.com/library/view/head-first-design/0596007124/ch04.html
finished_extracting
site_query_complete
https://dzone.com/articles/factory-method-design-pattern
finished_extracting
site_query_complete
https://stackabuse.com/the-factory-method-design-pattern-in-python/
finished_extracting
site_query_complete
https://stackabuse.com/factory-method-design-pattern-in-java/
finished_extracting
site_query_complete
https://dart.academy/creational-design-patterns-for-dart-and-flutter-factory-method/
finished_extracting
site_query_complete
https://narbase.com/2020/06/15/design-patterns-factory-method-with-kotlin-examples/
finished_extracting
site_query_complete
80 % complete

new_kw_query
new_kw_query_success
https://www.json.org/
finished_extracting
site_query_complete
https://en.wikipedia.org/wiki/JSON
finished_extracting
site_query_complete
https://ww

finished_extracting
site_query_complete
https://blog.dreamfactory.com/what-is-a-legacy-system/
finished_extracting
site_query_complete
93 % complete

new_kw_query
new_kw_query_success
https://en.wikipedia.org/wiki/Representational_state_transfer
finished_extracting
site_query_complete
https://www.codecademy.com/articles/what-is-rest
finished_extracting
site_query_complete
https://medium.com/@sagar.mane006/understanding-rest-representational-state-transfer-85256b9424aa
finished_extracting
site_query_complete
https://searchapparchitecture.techtarget.com/definition/REST-REpresentational-State-Transfer
finished_extracting
site_query_complete
https://www.plesk.com/blog/various/rest-representational-state-transfer/
finished_extracting
site_query_complete
https://www.ics.uci.edu/~fielding/pubs/dissertation/rest_arch_style.htm
finished_extracting
site_query_complete
https://restfulapi.net/
finished_extracting
site_query_complete
https://www.sciencedirect.com/topics/computer-science/representat

In [10]:
# Bundle the run statistics in a fixed order:
# [outcome codes per page request, seconds per result page, seconds per Google query]
result = [connect_fail_cause, single_page_time, google_page_time]

In [11]:
# with open("runtime_50_100.txt", "wb") as fp:
#     pickle.dump(result, fp)

In [12]:
#Example: display the first site object stored for the first keyword.
# NOTE(review): the "soup" entry holds the whole parsed page, so this output is very large.
master[0][0]

{'kw': 'undefined behavior',
 'title': 'Undefined behavior - Wikipedia',
 'link': 'https://en.wikipedia.org/wiki/Undefined_behavior',
 'soup': <!DOCTYPE html>
 
 <html class="client-nojs" dir="ltr" lang="en">
 <head>
 <meta charset="utf-8"/>
 <title>Undefined behavior - Wikipedia</title>
 <script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"613436fa-e493-4345-b7cf-a2f1049997ea","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Undefined_behavior","wgTitle":"Undefined behavior","wgCurRevisionId":959672567,"wgRevisionId":959672567,"wgArticleId":515992,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["All a

In [13]:
#flatten nested structure in order to pickle; hence kwn
# "kwn" records the position of the keyword's list inside master, so the
# keyword grouping survives the flattening.
FINAL = [
    {
        "kwn": kw_position,
        "kw": site["kw"],
        "link": site["link"],
        "soup": str(site["soup"]), #pickle cannot accept bs4 object, convert to string
        "text": site["raw_text"]
    }
    for kw_position, kw_sites in enumerate(master)
    for site in kw_sites
]

In [15]:
#export site objects to file for heuristics selection phase
# The file is opened in binary mode: despite the .txt name it holds pickle data.
with open("mined_sites_0_30.txt", "wb") as pickle_out:
    pickle.dump(FINAL, pickle_out)