### Keyword Search Matching Algorithm

In [17]:
import re
import pandas as pd

def compare_paragraphs(paragraph1, paragraph2, services):

    '''
    function: compare_paragraphs
        Report how much of each paragraph is made up of the given keywords.

        inputs:
            paragraph1: text reported in the "SOW" row
            paragraph2: text reported in the "SOSL" row
            services:   iterable of keyword strings; each keyword is matched
                        literally (regex metacharacters are escaped),
                        case-insensitively, and anywhere in the text
                        (substring match, not whole-word match)
        returns:
            pandas DataFrame with one row per paragraph and the columns
            "Document", "Keywords_%" (hits of all keywords as a percentage of
            the paragraph's word count) and one column per keyword (hits of
            that keyword as a percentage of the word count). A paragraph
            that contains no words yields 0.0 in every percentage column.
    '''

    ##-- Materialize once so a one-shot iterable can be walked several times
    keywords = list(services)

    def count_keywords(paragraph):

        '''
        function: count_keywords
            inputs: paragraph
            returns: dictionary mapping each keyword to its number of hits
        '''

        ##-- Count the matches directly: the matched text may differ in case
        ##-- from the keyword, so comparing matched strings against the
        ##-- keyword afterwards would miss hits such as "ec2" for "EC2"
        return {
            keyword: len(re.findall(re.escape(keyword), paragraph, re.IGNORECASE))
            for keyword in keywords
        }

    def keyword_percentages(paragraph):

        '''
        function: keyword_percentages
            inputs: paragraph
            returns: (total keyword %, dictionary of % per keyword)
        '''

        ##-- The word count is the shared denominator; compute it only once
        word_total = len(re.findall(r"\w+", paragraph))
        counts = count_keywords(paragraph)

        def percent(count):
            ##-- Guard against paragraphs without any words
            return count / word_total * 100 if word_total else 0.0

        per_keyword = {keyword: percent(count) for keyword, count in counts.items()}
        return percent(sum(counts.values())), per_keyword

    ##-- Calculate total and per-keyword percentages for each paragraph
    percentage1, keyword_percentages1 = keyword_percentages(paragraph1)
    percentage2, keyword_percentages2 = keyword_percentages(paragraph2)

    ##-- Format results as dictionary (insertion order defines column order)
    results = {"Document": ["SOW", "SOSL"], "Keywords_%": [percentage1, percentage2]}
    for keyword in keywords:
        results[keyword] = [keyword_percentages1[keyword], keyword_percentages2[keyword]]

    ##-- Return results DataFrame
    return pd.DataFrame.from_dict(results)

##-- Keywords to look for: five AWS service names
services = [
    "EC2",
    "S3",
    "RDS",
    "Lambda",
    "DynamoDB",
]

##-- Sample texts to compare (adjacent literals are joined into one string)
paragraph1 = (
    "Amazon EC2 is a web service that provides resizable compute capacity in the cloud. "
    "Amazon S3 provides highly-scalable object storage for developers and IT teams. "
    "Amazon RDS makes it easy to set up, operate, and scale a relational database in the cloud."
)
paragraph2 = (
    "AWS Lambda lets you run code without provisioning or managing servers. "
    "Amazon S3 provides highly-scalable object storage for developers and IT teams. "
    "Amazon DynamoDB is a fast and flexible NoSQL database service for all applications that need consistent, single-digit millisecond latency at any scale."
)

##-- Compute the keyword percentages for both paragraphs
results = compare_paragraphs(
    paragraph1,
    paragraph2,
    services,
)

##-- Render the resulting DataFrame in the notebook
display(results)

Unnamed: 0,Document,Keywords_%,EC2,S3,RDS,Lambda,DynamoDB
0,SOW,6.976744,2.325581,2.325581,2.325581,0.0,0.0
1,SOSL,6.521739,0.0,2.173913,0.0,2.173913,2.173913
