In [28]:
#required libraries
import pickle
from urllib.parse import urlparse

import nltk
import numpy as np
import pandas as pd
import readability
import textstat
from bs4 import BeautifulSoup
from textblob import TextBlob

In [29]:
#load the pickled outputs of the previous pipeline stages
#NOTE: pickle.load can execute arbitrary code; only open files this pipeline produced itself
with open("mined_sites_0_30.txt", "rb") as sites_file:
    import_data = pickle.load(sites_file)

with open("kw_clean.txt", "rb") as keywords_file:
    import_kw = pickle.load(keywords_file)

In [30]:
#convert flat array into a nested one
def build_master(master, group_size=16):
    """Turn the flat collection of mined site records into a nested list.

    Parameters
    ----------
    master : anything accepted by ``pd.DataFrame``
        Must provide the columns 'kwn', 'kw', 'link', 'soup' and 'text'.
    group_size : int, optional
        Number of consecutive rows placed in each inner list. Defaults to 16,
        the value that used to be hard-coded.

    Returns
    -------
    list of list of dict
        One dict per row with the keys 'kwn', 'kw' ('+' replaced by a space),
        'link', 'soup' (the row's markup parsed with BeautifulSoup), 'text'
        and 'score' (initialised to 1). Rows left over after the last full
        group are returned as a final, shorter group instead of being
        silently discarded.

    Raises
    ------
    ValueError
        If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be a positive integer")

    frame = pd.DataFrame(master)

    sites = []
    for row in range(frame.shape[0]):
        # .iloc is positional, so this works regardless of the frame's index labels
        sites.append({
            'kwn': frame['kwn'].iloc[row],
            'kw': frame['kw'].iloc[row].replace('+', ' '),
            'link': frame['link'].iloc[row],
            # name the parser explicitly: without it BeautifulSoup picks whichever
            # parser happens to be installed and warns about the guess
            'soup': BeautifulSoup(frame['soup'].iloc[row], "html.parser"),
            'text': frame['text'].iloc[row],
            'score': 1,
        })

    # consecutive slices of group_size rows; the last slice may be shorter
    return [sites[start:start + group_size]
            for start in range(0, len(sites), group_size)]

In [31]:
#group the flat records loaded from the pickle into the nested structure built by build_master
master = build_master(import_data)
#release the flat input; it is not referenced again in the cells below
del import_data

In [32]:
#remove academic websites altogether; this function includes other unwanted sites
def academic(site):
    """Return 0 for academic or otherwise unwanted links, else the site's score.

    A link is rejected when its host name contains an ``edu`` label after the
    first one (``www.columbia.edu``, ``wiki.sei.cmu.edu``) or when the link
    contains one of the blocked fragments. Testing host labels instead of the
    raw substring '.edu' keeps commercial hosts such as ``www.edureka.co``
    from being discarded. All checks are case-insensitive.

    Parameters
    ----------
    site : dict
        Needs the keys 'link' (str) and 'score'.

    Returns
    -------
    0 when the link is rejected, otherwise ``site['score']`` unchanged.
    """
    blocked_fragments = ('.pdf', 'university',
                         'stackoverflow.com/questions',
                         'stackexchange.com/questions')

    link = site["link"].lower()

    # urlparse only fills in the host when a scheme or a leading '//' is present
    has_netloc_marker = '://' in link or link.startswith('//')
    try:
        host = urlparse(link if has_netloc_marker else '//' + link).hostname or ''
    except ValueError:
        # malformed URL (e.g. broken IPv6 brackets): no host to inspect
        host = ''

    # skip the first label so the match still requires a dot in front of 'edu'
    is_edu_host = 'edu' in host.split('.')[1:]

    if is_edu_host or any(fragment in link for fragment in blocked_fragments):
        return 0
    return site['score']

In [33]:
#determine if site commercial if it contains 'Solutions' or 'Products' strings in its header HTML
def commercial(site):
    """Flag a site as commercial based on the links in its first two lists.

    Looks at the anchors inside the first two ``<ul>`` elements of the parsed
    page and reports a hit when any anchor text contains 'Solutions' or
    'Products' (case-sensitive).

    Parameters
    ----------
    site : dict
        Needs the key 'soup', an object whose ``select`` method returns the
        matching elements (a BeautifulSoup document in this notebook).

    Returns
    -------
    int
        1 if such an anchor is found, otherwise 0.
    """
    # the first two <ul> elements stand in for the page's header navigation
    for menu in site["soup"].select("ul")[0:2]:
        for anchor in menu.select('a'):
            label = anchor.text
            if 'Solutions' in label or 'Products' in label:
                return 1
    return 0

In [34]:
#determines esotericism of text; calculates probability of a word being another CS concept from the SO dataset
def esoteric(site, keywords=None):
    """Score how densely the site's text mentions the known keywords.

    Parameters
    ----------
    site : dict
        Needs the key 'text' (str).
    keywords : sequence of str, optional
        Keywords to look for. Defaults to the notebook-level ``import_kw``.

    Returns
    -------
    float
        ``present * len(keywords) / (word_count + 1)`` where ``present`` is
        the number of keywords occurring as substrings of the lower-cased
        text and ``word_count`` is the number of whitespace-separated tokens.
        The +1 avoids dividing by zero for empty text.
    """
    if keywords is None:
        keywords = import_kw

    text = site["text"]
    lowered = text.lower()          # lower-case once instead of once per comparison

    present = sum(1 for keyword in keywords if keyword in lowered)

    # NOTE(review): the earlier nested loop built import_kw[:i] + import_kw[i:],
    # which is the complete list, so each present keyword was counted
    # len(keywords) times. That factor is kept so scores stay numerically
    # identical; if the intent was to leave one keyword out ([i+1:]) or to skip
    # the page's own keyword, that still needs to be decided - TODO confirm.
    hits = present * len(keywords)

    return hits / (len(text.split()) + 1)

In [35]:
def subjective(site):
    """Return the TextBlob subjectivity value computed from the site's text."""
    blob = TextBlob(site['text'])
    return blob.subjectivity

In [36]:
def readable(site):
    """Return textstat's Flesch reading-ease value for the site's text."""
    page_text = site["text"]
    return textstat.flesch_reading_ease(page_text)

In [37]:
#weights heuristics based on the number of stdev from mean of given keyword
def distribution(array, heuristic):
    """Grade each value by how far it lies from the mean of ``array``.

    For 'subjective' and 'esoteric' the high side is graded: values between
    one and two standard deviations above the mean get 1, values more than
    two above get 2. For 'readable' the low side is graded the same way.
    Everything else gets 0. Any other heuristic name yields an empty list.

    Parameters
    ----------
    array : sequence of numbers
    heuristic : str

    Returns
    -------
    list of int
        One grade (0, 1 or 2) per input value, in input order.
    """
    if heuristic in ('subjective', 'esoteric'):
        grade_high_side = True
    elif heuristic == 'readable':
        grade_high_side = False
    else:
        return []

    centre = np.mean(array, axis = None)
    spread = np.std(array, axis = None)

    if grade_high_side:
        band_low, band_high = centre + spread, centre + 2*spread
    else:
        band_low, band_high = centre - 2*spread, centre - spread

    grades = []
    for value in array:
        if band_low <= value <= band_high:
            grades.append(1)            # between one and two deviations out
        elif (value > band_high) if grade_high_side else (value < band_low):
            grades.append(2)            # more than two deviations out
        else:
            grades.append(0)
    return grades

In [38]:
#driver for all the above functions; directly subtracts from the scores in master array
def scoregen(heuristic, weight):
    """Apply one distribution-based heuristic to every group in ``master``.

    For each group the raw heuristic value of every site is collected, graded
    by ``distribution``, multiplied by ``weight`` and subtracted from the
    site's 'score' in place. Each site's link and each group's weighted
    grades are printed along the way.

    Parameters
    ----------
    heuristic : str
        'esoteric', 'subjective' or 'readable'; for any other value no raw
        values are collected.
    weight : number
        Multiplier applied to each grade before it is subtracted.

    Returns
    -------
    None
    """
    # pick the scoring function once instead of re-testing the name per site
    if heuristic == 'esoteric':
        scorer = esoteric
    elif heuristic == 'subjective':
        scorer = subjective
    elif heuristic == 'readable':
        scorer = readable
    else:
        scorer = None

    for group in master:
        raw_values = []
        for site in group:
            print(site['link'])
            if scorer is not None:
                raw_values.append(scorer(site))

        penalties = [grade * weight for grade in distribution(raw_values, heuristic)]

        print(penalties)

        for position, site in enumerate(group):
            site['score'] -= penalties[position]
    return

In [39]:
#run the three distribution-based heuristics in order, each with a weight of 0.1
for heuristic_name in ('esoteric', 'subjective', 'readable'):
    scoregen(heuristic_name, 0.1)

https://en.wikipedia.org/wiki/Undefined_behavior
https://en.cppreference.com/w/cpp/language/ub
https://en.cppreference.com/w/c/language/behavior
https://www.geeksforgeeks.org/undefined-behavior-c-cpp/
https://raphlinus.github.io/programming/rust/2018/08/17/undefined-behavior.html
https://blog.llvm.org/posts/2011-05-13-what-every-c-programmer-should-know/
https://blog.regehr.org/archives/213
https://embeddedartistry.com/blog/2017/01/09/a-guide-to-undefined-behavior-in-c-and-c-part-1/
https://gist.github.com/Earnestly/7c903f481ff9d29a3dd1
https://stackoverflow.com/questions/367633/what-are-all-the-common-undefined-behaviours-that-a-c-programmer-should-know-a
http://wg21.link/P1705
https://wiki.sei.cmu.edu/confluence/display/c/CC.+Undefined+Behavior
https://wiki.sei.cmu.edu/confluence/display/c/MSC15-C.+Do+not+depend+on+undefined+behavior
https://riptutorial.com/c/topic/364/undefined-behavior
https://softwareengineering.stackexchange.com/questions/398703/why-does-c-have-undefined-behaviou

https://www.essentialsql.com/get-ready-to-learn-sql-database-normalization-explained-in-simple-english/
https://docs.microsoft.com/en-us/office/troubleshoot/access/database-normalization-description
https://beginnersbook.com/2015/05/normalization-in-dbms/
http://agiledata.org/essays/dataNormalization.html
https://www.edureka.co/blog/normalization-in-sql/
https://towardsdatascience.com/database-normalization-explained-53e60a494495
https://www.tutorialspoint.com/dbms/database_normalization.htm
https://www.javatpoint.com/dbms-normalization
https://condor.depaul.edu/gandrus/240IT/accesspages/normalization3.htm
https://blog.saleslayer.com/why-is-database-normalization-so-important
https://www.geeksforgeeks.org/normal-forms-in-dbms/
https://mariadb.com/kb/en/database-normalization-5th-normal-form-and-beyond/
https://mariadb.com/kb/en/database-normalization/
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.2]
https://en.wikipedia.org/wiki/Dependency_injection
http

https://help.alchemer.com/help/encode-an-excel-file-to-utf-8-or-utf-16
http://www.columbia.edu/~fdc/utf8/
https://www.php.net/manual/en/function.utf8-encode.php
[0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0]
https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS
https://en.wikipedia.org/wiki/Cross-origin_resource_sharing
https://auth0.com/blog/cors-tutorial-a-guide-to-cross-origin-resource-sharing/
https://www.codecademy.com/articles/what-is-cors
https://web.dev/cross-origin-resource-sharing/
https://portswigger.net/web-security/cors
https://www.moesif.com/blog/technical/cors/Authoritative-Guide-to-CORS-Cross-Origin-Resource-Sharing-for-REST-APIs/
https://www.w3.org/TR/2020/SPSD-cors-20200602/
https://www.w3.org/wiki/CORS
https://docs.aws.amazon.com/AmazonS3/latest/dev/cors.html
https://enable-cors.org/
https://owasp.org/www-community/attacks/CORS_OriginHeaderScrutiny
https://spring.io/guides/gs/rest-service-cors/
https://cloud.google.com/storage/docs/

https://www.geeksforgeeks.org/strategy-pattern-set-1/
https://blog.bitsrc.io/keep-it-simple-with-the-strategy-design-pattern-c36a14c985e9
https://www.dofactory.com/net/strategy-design-pattern
https://www.oodesign.com/strategy-pattern.html
https://codewithshadman.com/strategy-pattern-csharp/
https://www.topcoder.com/thrive/articles/The%20Strategy%20Pattern%20in%20C
https://medium.com/dev-genius/design-patterns-strategy-pattern-d57a13c593b7
https://medium.com/fintechexplained/strategy-design-pattern-a1f643617d5c
https://www.ionos.com/digitalguide/websites/web-development/strategy-pattern/
https://www.freecodecamp.org/news/the-strategy-pattern-explained-using-java-bc30542204e0/
https://www.programmingwithwolfgang.com/strategy-pattern/
https://deviq.com/strategy-design-pattern/
[0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
https://en.wikipedia.org/wiki/Factory_method_pattern
https://refactoring.guru/design-patterns/factory-method
https://www.tutorialspoin

https://www.digitalocean.com/community/tutorials/how-to-troubleshoot-common-http-error-codes
https://www.whoishostingthis.com/resources/http-status-codes/
https://www.tutorialrepublic.com/html-reference/http-status-codes.php
[0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2]
https://en.wikipedia.org/wiki/Undefined_behavior
https://en.cppreference.com/w/cpp/language/ub
https://en.cppreference.com/w/c/language/behavior
https://www.geeksforgeeks.org/undefined-behavior-c-cpp/
https://raphlinus.github.io/programming/rust/2018/08/17/undefined-behavior.html
https://blog.llvm.org/posts/2011-05-13-what-every-c-programmer-should-know/
https://blog.regehr.org/archives/213
https://embeddedartistry.com/blog/2017/01/09/a-guide-to-undefined-behavior-in-c-and-c-part-1/
https://gist.github.com/Earnestly/7c903f481ff9d29a3dd1
https://stackoverflow.com/questions/367633/what-are-all-the-common-undefined-behaviours-that-a-c-programmer-should-know-a
http://wg21.link/P1705
https:

https://www.geeksforgeeks.org/mvc-design-pattern/
[0.0, 0.1, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
https://en.wikipedia.org/wiki/Database_normalization
https://www.guru99.com/database-normalization.html
https://www.studytonight.com/dbms/database-normalization.php
https://www.essentialsql.com/get-ready-to-learn-sql-database-normalization-explained-in-simple-english/
https://docs.microsoft.com/en-us/office/troubleshoot/access/database-normalization-description
https://beginnersbook.com/2015/05/normalization-in-dbms/
http://agiledata.org/essays/dataNormalization.html
https://www.edureka.co/blog/normalization-in-sql/
https://towardsdatascience.com/database-normalization-explained-53e60a494495
https://www.tutorialspoint.com/dbms/database_normalization.htm
https://www.javatpoint.com/dbms-normalization
https://condor.depaul.edu/gandrus/240IT/accesspages/normalization3.htm
https://blog.saleslayer.com/why-is-database-normalization-so-important
https://www.geeksfo

http://tutorials.jenkov.com/unicode/utf-8.html
https://help.alchemer.com/help/encode-an-excel-file-to-utf-8-or-utf-16
http://www.columbia.edu/~fdc/utf8/
https://www.php.net/manual/en/function.utf8-encode.php
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS
https://en.wikipedia.org/wiki/Cross-origin_resource_sharing
https://auth0.com/blog/cors-tutorial-a-guide-to-cross-origin-resource-sharing/
https://www.codecademy.com/articles/what-is-cors
https://web.dev/cross-origin-resource-sharing/
https://portswigger.net/web-security/cors
https://www.moesif.com/blog/technical/cors/Authoritative-Guide-to-CORS-Cross-Origin-Resource-Sharing-for-REST-APIs/
https://www.w3.org/TR/2020/SPSD-cors-20200602/
https://www.w3.org/wiki/CORS
https://docs.aws.amazon.com/AmazonS3/latest/dev/cors.html
https://enable-cors.org/
https://owasp.org/www-community/attacks/CORS_OriginHeaderScrutiny
https://spring.io/guides/gs/rest-servi

https://www.oodesign.com/strategy-pattern.html
https://codewithshadman.com/strategy-pattern-csharp/
https://www.topcoder.com/thrive/articles/The%20Strategy%20Pattern%20in%20C
https://medium.com/dev-genius/design-patterns-strategy-pattern-d57a13c593b7
https://medium.com/fintechexplained/strategy-design-pattern-a1f643617d5c
https://www.ionos.com/digitalguide/websites/web-development/strategy-pattern/
https://www.freecodecamp.org/news/the-strategy-pattern-explained-using-java-bc30542204e0/
https://www.programmingwithwolfgang.com/strategy-pattern/
https://deviq.com/strategy-design-pattern/
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.1, 0.1, 0.0, 0.0]
https://en.wikipedia.org/wiki/Factory_method_pattern
https://refactoring.guru/design-patterns/factory-method
https://www.tutorialspoint.com/design_pattern/factory_pattern.htm
https://sourcemaking.com/design_patterns/factory_method
https://www.javatpoint.com/factory-method-design-pattern
https://www.geeksforgeeks.org/design-p

https://en.cppreference.com/w/cpp/language/ub
https://en.cppreference.com/w/c/language/behavior
https://www.geeksforgeeks.org/undefined-behavior-c-cpp/
https://raphlinus.github.io/programming/rust/2018/08/17/undefined-behavior.html
https://blog.llvm.org/posts/2011-05-13-what-every-c-programmer-should-know/
https://blog.regehr.org/archives/213
https://embeddedartistry.com/blog/2017/01/09/a-guide-to-undefined-behavior-in-c-and-c-part-1/
https://gist.github.com/Earnestly/7c903f481ff9d29a3dd1
https://stackoverflow.com/questions/367633/what-are-all-the-common-undefined-behaviours-that-a-c-programmer-should-know-a
http://wg21.link/P1705
https://wiki.sei.cmu.edu/confluence/display/c/CC.+Undefined+Behavior
https://wiki.sei.cmu.edu/confluence/display/c/MSC15-C.+Do+not+depend+on+undefined+behavior
https://riptutorial.com/c/topic/364/undefined-behavior
https://softwareengineering.stackexchange.com/questions/398703/why-does-c-have-undefined-behaviour-ub-and-other-languages-like-c-or-java
https://s

https://docs.microsoft.com/en-us/office/troubleshoot/access/database-normalization-description
https://beginnersbook.com/2015/05/normalization-in-dbms/
http://agiledata.org/essays/dataNormalization.html
https://www.edureka.co/blog/normalization-in-sql/
https://towardsdatascience.com/database-normalization-explained-53e60a494495
https://www.tutorialspoint.com/dbms/database_normalization.htm
https://www.javatpoint.com/dbms-normalization
https://condor.depaul.edu/gandrus/240IT/accesspages/normalization3.htm
https://blog.saleslayer.com/why-is-database-normalization-so-important
https://www.geeksforgeeks.org/normal-forms-in-dbms/
https://mariadb.com/kb/en/database-normalization-5th-normal-form-and-beyond/
https://mariadb.com/kb/en/database-normalization/
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.1]
https://en.wikipedia.org/wiki/Dependency_injection
https://www.freecodecamp.org/news/a-quick-intro-to-dependency-injection-what-it-is-and-when-to-use-it-7578c8

https://www.php.net/manual/en/function.utf8-encode.php
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.2]
https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS
https://en.wikipedia.org/wiki/Cross-origin_resource_sharing
https://auth0.com/blog/cors-tutorial-a-guide-to-cross-origin-resource-sharing/
https://www.codecademy.com/articles/what-is-cors
https://web.dev/cross-origin-resource-sharing/
https://portswigger.net/web-security/cors
https://www.moesif.com/blog/technical/cors/Authoritative-Guide-to-CORS-Cross-Origin-Resource-Sharing-for-REST-APIs/
https://www.w3.org/TR/2020/SPSD-cors-20200602/
https://www.w3.org/wiki/CORS
https://docs.aws.amazon.com/AmazonS3/latest/dev/cors.html
https://enable-cors.org/
https://owasp.org/www-community/attacks/CORS_OriginHeaderScrutiny
https://spring.io/guides/gs/rest-service-cors/
https://cloud.google.com/storage/docs/cross-origin
https://learn.akamai.com/en-us/webhelp/api-gateway/api-gateway-user-guide/GUID-FC33AE38-EE5B-

https://www.tutorialspoint.com/design_pattern/factory_pattern.htm
https://sourcemaking.com/design_patterns/factory_method
https://www.javatpoint.com/factory-method-design-pattern
https://www.geeksforgeeks.org/design-patterns-set-2-factory-method/
https://realpython.com/factory-method-python/
https://www.dofactory.com/net/factory-method-design-pattern
https://medium.com/@info.anikdey003/factory-method-design-pattern-277dd4bd3a11
https://www.baeldung.com/creational-design-patterns
https://www.oreilly.com/library/view/head-first-design/0596007124/ch04.html
https://dzone.com/articles/factory-method-design-pattern
https://stackabuse.com/the-factory-method-design-pattern-in-python/
https://stackabuse.com/factory-method-design-pattern-in-java/
https://dart.academy/creational-design-patterns-for-dart-and-flutter-factory-method/
https://narbase.com/2020/06/15/design-patterns-factory-method-with-kotlin-examples/
[0.0, 0.0, 0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
htt

In [40]:
#commercial and academic checks are applied directly because they do not use distribution:
#  a commercial site loses 0.1 from its score
#  an academic (or otherwise unwanted) site has its score set to 0
for keyword_group in master:
    for site in keyword_group:
        site["score"] -= commercial(site)*0.1
        #academic returns the just-updated score unchanged unless the link is rejected
        site["score"] = academic(site)

In [41]:
#for each keyword group keep the 8 highest-scoring sites (ties stay in their original order)
#and flatten the survivors into one list of plain dicts ready to pickle
FINAL = []
for group_number, group in enumerate(master):
    ranked_positions = sorted(
        range(len(group)),
        key=lambda position: group[position]["score"],
        reverse=True,
    )
    for position in ranked_positions[0:8]:
        chosen = group[position]
        FINAL.append({
            "kwn": group_number,
            "kw": chosen["kw"],
            "link": chosen["link"],
            "text": chosen["text"]
        })

In [42]:
#write the selected site records to a local pickle file
with open("mined_sites_filtered_0_30.txt", "wb") as out_file:
    pickle.dump(FINAL, out_file)