In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request

from charwords import veryWords, prettyWords, enoughWords, weakWords

import nltk 
from nltk.corpus import wordnet 

In [24]:
def readCaptionsToList(vidId):
    """
    The function creates a word list from captions in the video given as input.

    Input:
    YouTube Video ID

    Output:
    list (str): list of words from the captions
    """
    
    transcript_list = YouTubeTranscriptApi.list_transcripts(vidId)
    captions = ["NoText"]
    try:
        captions = YouTubeTranscriptApi.get_transcript(vidId, languages=['en'])
    except:
        print("!Translating from Ru to En")
        transcript = transcript_list.find_transcript(['ru'])
        captions = transcript.translate('en').fetch()
        #print(captions)
        #input()
    text = ""
    #print(captions)
    for element in captions:
        text+=element['text']+" "
    return text.split(" ")


def getPieceByKeyWords(wordList, keyWords, backward, forward):
    """
    The function returns a piece of caption text in a form of list with given span
    
    Input:
    wordList (list(str)): input list of the caption text
    keyWords (list(str)): keywords to search for
    backward (int): number of words to include before found keyword
    forward (int): number of words to include before found keyword
    
    Output:
    list(str): list of words close to a given keyword
    """
    
    lst = []
    i = 0
    for word in wordList:
        if word in keyWords:
            rear = max(0, i - backward)
            front = min(len(wordList)-1, i+forward)
            lst.append(wordList[rear:front])
        i+=1
    return lst


def antiScore(score):
    """
    A function that returns a penalyzing score for word "not" according to the function
    
    Input: 
    score (float): input score
    
    Output:
    float: a penalty to be added to the initial score if word "not" is present in the word list
    
    """
    
    return 0.58 - 1.15*score


def assessPieceOfText(pieceOfText, dictScores):
    """
    The function returns score for the piece of text given as input list of words according to dictScores 
    
    Input:
    pieceOfText (list(str)): keywords to search for
    dictScores (dict): a dictionary of words and corresponding scores in form: 
             {"A":[5,["very", "extremely", "surprisingly","great", "much", "incredibly"]], 
              "B":[4,["pretty","good", "nice"]],
              "C":[3, ["enough","inexpensive", "cheap", "affordable","low","decent", "quite"]],
              "D":[-2,["weak","minimum","little"]]}
    
    Output:
    float: score
    """
    
    score = 0
    lastScore = 0
    pieceOfText = list(set(pieceOfText))
    print(pieceOfText)
    for word in pieceOfText:
        for key in dictScores:
            if word in dictScores[key][1]:
                lastScore = dictScores[key][0]
                print(word,":",lastScore)
        if word == "not":
            lastScore = lastScore + antiScore(lastScore)
        score += lastScore
        lastScore=0
    return score


def assessVideoByKeyWords(keyWords, videoId, dictScores):
    """ 
    The function returns score for the captions of a youtube video (using videoId), key words and dictScores
    
    keyWords (list(str)): keywords to search for
    pieceOfText (list(str)): input text
    dictScores (dict): a dictionary of words and corresponding scores in form: 
             {"A":[5,["very", "extremely", "surprisingly","great", "much", "incredibly"]], 
              "B":[4,["pretty","good", "nice"]],
              "C":[3, ["enough","inexpensive", "cheap", "affordable","low","decent", "quite"]],
              "D":[-2,["weak","minimum","little"]]}
    videoId (str): link to youtube video
    
    Output:
    float: total Score
    """
    
    videoId = videoId.split("v=")[1]
    captionList = readCaptionsToList(videoId)

    pieces = getPieceByKeyWords(wordList = captionList, 
                       keyWords = keyWords,
                       backward = 10, 
                       forward = 10)

    totalScore = 0
    for piece in pieces:
        currentScore = assessPieceOfText(piece, dictScores)
        print("Score for the piece above:", currentScore)
        totalScore+=currentScore
        print("\n")
    print("Total Score =", totalScore, " based on ", len(pieces), " pieces")
    return totalScore


def assessUrlsByKeyWords(keyWords, urlList, dictScores):
    """ 
    The function returns mean score for the text provide by urlList based on key words and dictScores
    
    Input:
    keyWords (list(str)): keywords to search for
    urlList (list(str)): list of urls
    dictScores (dict): a dictionary of words and corresponding scores in form: 
             {"A":[5,["very", "extremely", "surprisingly","great", "much", "incredibly"]], 
              "B":[4,["pretty","good", "nice"]],
              "C":[3, ["enough","inexpensive", "cheap", "affordable","low","decent", "quite"]],
              "D":[-2,["weak","minimum","little"]]}
    videoId (str): link to youtube video
    
    Output:
    float: total mean Score
    """

    scores = [] 
    for url in urlList[1:]:
        print("Processing: ", url, "for item: ", urlList[0]) 
        if "youtube" in url.split("."):
            try:
                scores.append(assessVideoByKeyWords(keyWords, url, dictScores))
            except:
                print(f"---------------PASSED: {url}")
                pass
        else:
            try:
                scores.append(assessTextByKeyWords(keyWords, url, dictScores))
            except:
                print(f"---------------PASSED: {url}")
                pass
    return sum(scores)/len(scores)
 
    
def tag_visible(element):
    """
    The function defines if element is visible
    
    Input:
    html element
    
    Output:
    bool: True if visible, False otherwise
    """
    
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(body):
    """
    The function returns visible text from html
    
    Input:
    html 
    
    Output:
    str: text
    """
    
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)        
        
    
def assessTextByKeyWords(keyWords, url, dictScores):
    """ 
    The function returns score for the text provide by urlList based on key words and dictScores
    
    Input:
    keyWords (list(str)): keywords to search for
    url (str): url of textual web page
    dictScores (dict): a dictionary of words and corresponding scores in form: 
             {"A":[5,["very", "extremely", "surprisingly","great", "much", "incredibly"]], 
              "B":[4,["pretty","good", "nice"]],
              "C":[3, ["enough","inexpensive", "cheap", "affordable","low","decent", "quite"]],
              "D":[-2,["weak","minimum","little"]]}
    
    Output:
    float: total Score
    """
    
    html = urllib.request.urlopen(url).read()
    textList = text_from_html(html).lower().split(" ")
    pieces = getPieceByKeyWords(wordList = textList, 
                   keyWords = keyWords,
                   backward = 10, 
                   forward = 10)
    
    totalScore = 0
 
    for piece in pieces:
        currentScore = assessPieceOfText(piece, dictScores)
        print("Score for the piece above:", currentScore)
        totalScore+=currentScore
        print("\n")
    print("Score for text =", totalScore, " based on ", len(pieces), " pieces")
    return totalScore


def groupScore(testSet, keyWords, dictScores):
    
    """ 
    The function returns score for the text provide by urlList based on key words and dictScores
    
    Input:
    testSet(list(str)): list of lists of urls: [["name1", "url_1", "url_2", "url_n"],
                                                ["name2", "url_1", "url_2", "url_n"]]                                         
    keyWords (list(str)): keywords to search for
    dictScores (dict): a dictionary of words and corresponding scores in form: 
             {"A":[5,["very", "extremely", "surprisingly","great", "much", "incredibly"]], 
              "B":[4,["pretty","good", "nice"]],
              "C":[3, ["enough","inexpensive", "cheap", "affordable","low","decent", "quite"]],
              "D":[-2,["weak","minimum","little"]]}

    
    Output:
    float: total Score
    """
    
    scoresForItems = []
    for item in testSet:
        urlList = item
        scoresForItems.append(assessUrlsByKeyWords(keyWords, urlList, dictScores))

    print("\nTotal Results:\n")
    for i,item in enumerate(testSet):
        print(item[0],":",scoresForItems[i])

        
def getSynonimList(word):
    """ 
    The function returns list of synonims base on wordnet.synsets of nltk library
    for the text provide by urlList based on key words and dictScores
    
    Input:
    word (str): input word
    
    Output:
    list(str)
    """
    
    synonyms = [] 

    for syn in wordnet.synsets(word): 
        for l in syn.lemmas(): 
            synonyms.append(l.name()) 
            
    return (list(set(synonyms)))


def getGroupFromJSON(path):
    # return testSet, mascaraKeyWords, dictScores
    pass


def getDictScoresFromJSON(path):
    # return dictScores
    pass

In [86]:
hondaCB650r = ["Honda CB650R",
               "https://www.youtube.com/watch?v=lx8ygAcWo7o&t=1054s",
              "https://www.youtube.com/watch?v=eOW9jgCahnk&t=371s",
              "https://www.youtube.com/watch?v=b3j_r8VWx40",
              "https://www.motorcyclenews.com/bike-reviews/honda/cb650r/2019/"]

mgv7iii = ["Moto Guzzi V7 III",
           "https://www.youtube.com/watch?v=vLmw2Rkzjfw",
          "https://www.youtube.com/watch?v=iddepwKWdVM",
          "https://www.youtube.com/watch?v=CytQq_Ivq0g",
          "https://www.youtube.com/watch?v=h61CF2V2eG8",
           "https://www.motorcyclenews.com/bike-reviews/moto-guzzi/v7-stone/2014/",
           "http://www.motorcycledaily.com/2017/11/2017-moto-guzzi-v7-iii-stone-md-ride-review/"]

hdStreetRod = ["Harley Davison Street Rod", 
               "https://www.youtube.com/watch?v=RlkaAt9pdlU",
              "https://www.youtube.com/watch?v=lJW-zEGvqGc",
              "https://www.youtube.com/watch?v=u3iHnO6QYz8",
              "https://www.visordown.com/reviews/first-ride/first-ride-harley-davidson-street-rod-review"]

dctIcon = ["Ducati Icon",
          "https://www.youtube.com/watch?v=nWyiF1C1D70",
          "https://www.youtube.com/watch?v=RQOCjApCNQs",
          "https://www.youtube.com/watch?v=nHJg6GyyOWU",
          "https://www.bennetts.co.uk/bikesocial/reviews/bikes/ducati/2019-ducati-scrambler-icon-road-test-review"]

hsqVp701 = ["Husqvarna Vitpilen 701",
           "https://www.youtube.com/watch?v=PO2uFDS1P3A",
           "https://www.youtube.com/watch?v=GQK79vCohC4",
           "https://www.youtube.com/watch?v=5AR5PwffLzI",
           "https://www.youtube.com/watch?v=c0SL4pBJP4Y",
           "https://ridermagazine.com/2020/06/05/2020-husqvarna-vitpilen-701-road-test-review/"]

MotoTestSet = [hondaCB650r, mgv7iii, hdStreetRod, dctIcon, hsqVp701]
priceKeyWords = ["price", "cost"]


In [48]:
MascaraTestSet = [["Essence Mascara", 
            "https://www.youtube.com/watch?v=dvx3JmCKvZI",
            "https://www.youtube.com/watch?v=37-oMmP9JdQ"],
           
           ["L'Oreal Lash Paradise Mascara",
          "https://www.youtube.com/watch?v=blCSsS1z9hI",
          "https://www.youtube.com/watch?v=EK9pVDna36A",
          "https://www.youtube.com/watch?v=tj_cDRbgtpQ"],
           
           ["Vivienne Sabo Cabaret",
          "https://www.youtube.com/watch?v=p-zEnxpr_Yo",
           "https://www.youtube.com/watch?v=vVp10dWccZc",
           "https://www.youtube.com/watch?v=dnkSPEUpMCE",
           "https://www.youtube.com/watch?v=oh2lE-s4dns"],
           
           ["Maybelline Lash Sensational",
          "https://www.youtube.com/watch?v=5Xnxvs4_VkY",
          "https://www.youtube.com/watch?v=rWDNz44bM68",
          "https://www.youtube.com/watch?v=Y8i7FoNv6XQ"]]
mascaraKeyWords = ["price"]

In [49]:
dictScores = {"A":[5,["very", "extremely", "surprisingly","great", "much", "incredibly"]+veryWords], 
              "B":[4,["pretty","good", "nice"]+prettyWords],
              "C":[3, ["enough", "affordable","low","decent", "quite"]+keyWords+enoughWords],
              "D":[-2,["weak","minimum","little"]+weakWords]}

In [87]:
groupScore(MotoTestSet, priceKeyWords, dictScores)

Processing:  https://www.youtube.com/watch?v=lx8ygAcWo7o&t=1054s for item:  Honda CB650R
['see', 'want', 'HST', 'dollars', 'the', 'what', 'you', 'gonna', 'with', "it's", 'cost', 'another', 'basically', 'version', 'ABS', '$9,200', 'three', 'hundred']
cost : 3
Score for the piece above: 3


['asking', 'buy', 'almost', 'office', 'the', 'price', 'it', 'would', 'to', 'motorcyclists', 'for', 'this', '$8,900', 'now', 'I', 'gosh', 'here', 'motorcycle']
price : 3
Score for the piece above: 3


['very', 'bit', 'well', 'know', 'little', 'I', 'but', "don't", 'built', 'me', 'price', 'to', 'eighty', 'is', 'high', 'a', 'the', 'nine', 'motorcycle']
very : 5
little : -2
price : 3
Score for the piece above: 6


Total Score = 12  based on  3  pieces
Processing:  https://www.youtube.com/watch?v=eOW9jgCahnk&t=371s for item:  Honda CB650R
Total Score = 0  based on  0  pieces
Processing:  https://www.youtube.com/watch?v=b3j_r8VWx40 for item:  Honda CB650R
['much', 'than', 'expected', "they're", 'not', 'know'

['forget', 'so', 'everything', 'how', 'it', "w's", 'never', 'I', 'go', 'much', 'does', 'wanted', 'to', 'we', 'finally', 'nearly', 'the', 'two', 'cost', 'there']
much : 5
much : 4
cost : 3
Score for the piece above: 7


['thing', 'vid', 'in', 'pills', 'about', 'that', 'got', 'gonna', 'ring', 'talk', 'price', 'US', 'last', 'to', 'we', 'is', 'market', 'the', 'and']
price : 3
Score for the piece above: 3


Total Score = 10  based on  2  pieces
Processing:  https://www.youtube.com/watch?v=5AR5PwffLzI for item:  Husqvarna Vitpilen 701
['fun', 'up', 'rotor', 'lot', 'digital', "it's", 'of', 'handling', 'grams', 'asking', 'price', '-', '12', 'high', 'a', 'at', 'front', 'far', 'as']
price : 3
Score for the piece above: 3


['asking', 'a', 'ride', 'at', 'kind', 'grand', 'price', 'again', "that's", 'really', "it's", 'to', 'for', 'of', '12', 'motorcycle', 'but', 'high']
price : 3
Score for the piece above: 3


Total Score = 6  based on  2  pieces
Processing:  https://www.youtube.com/watch?v=c0SL4pB

In [81]:
#Setup in json
import json
path = ""
with open("moto.json", "r") as read_file:
    motoData = json.load(read_file)
    
md = motoData['items']
md[0]['urls']

['https://www.youtube.com/watch?v=vLmw2Rkzjfw',
 'https://www.youtube.com/watch?v=iddepwKWdVM',
 'https://www.youtube.com/watch?v=CytQq_Ivq0g',
 'https://www.youtube.com/watch?v=h61CF2V2eG8']

In [82]:
getSynonimList("price")

['Price',
 'damage',
 'price',
 'terms',
 'monetary_value',
 'Mary_Leontyne_Price',
 'cost',
 'toll',
 'Leontyne_Price']

In [88]:
prettyWords = prettyWords - veryWords - enoughWords - weakWords

TypeError: unsupported operand type(s) for -: 'list' and 'list'