In [37]:
from youtube_transcript_api import YouTubeTranscriptApi

In [47]:
def readCaptionsToList(vidId):
    """
    The function creates a word list from captions in the video given as input.

    Input:
    YouTube Video ID

    Output:
    list (str): list of words from the captions
    """
    transcript_list = YouTubeTranscriptApi.list_transcripts(vidId)
    
    try:
        captions = YouTubeTranscriptApi.get_transcript(vidId, languages=['en'])
    except:
        print("!Translating from Ru to En")
        transcript = transcript_list.find_transcript(['ru'])
        captions = transcript.translate('en').fetch()
        #print(captions)
        #input()
    text = ""
    #print(captions)
    for element in captions:
        text+=element['text']+" "
    return text.split(" ")

def getPieceByKeyWords(wordList, keyWords, backward, forward):
    """
    The function returns a piece of caption text in a form of list with given span
    
    Input:
    wordList (list(str)): input list of the caption text
    keyWords (list(str)): keywords to search for
    backward (int): number of words to include before found keyword
    forward (int): number of words to include before found keyword
    
    Output:
    list(str): list of words close to a given keyword
    
    """
    
    lst = []
    i = 0
    for word in wordList:
        if word in keyWords:
            rear = max(0, i - backward)
            front = min(len(wordList)-1, i+forward)
            lst.append(wordList[rear:front])
        i+=1
    return lst

def antiScore(score):
    """
    A function that returns a penalyzing score for word "not" according to the function
    
    Input: 
    score (float): input score
    
    Output:
    float: a penalty to be added to the initial score if word "not" is present in the word list
    
    """
    return 0.58 - 1.15*score

def assessPieceOfText(pieceOfText):
    """
    
    """
    score = 0
    lastScore = 0
    
    for word in pieceOfText:
        for key in dictScores:
            if word in dictScores[key][1]:
                lastScore = dictScores[key][0]
                print(word,":",lastScore)
        if word == "not":
            lastScore = lastScore + antiScore(lastScore)
        score += lastScore
        lastScore=0
    return score

def assessVideoByKeyWords(keyWords, videoId, dictScores):
    """ 
    """
    captionList = readCaptionsToList(videoId)

    pieces = getPieceByKeyWords(wordList = captionList, 
                       keyWords = keyWords,
                       backward = 10, 
                       forward = 10)
    
    totalScore = 0
    for piece in pieces:
        print(piece)
        currentScore = assessPieceOfText(piece)
        print("Score for the piece above:", currentScore)
        totalScore+=currentScore
        print("\n")
    print("Total Score =", totalScore, " based on ", len(pieces), " pieces")
    return totalScore

def assessUrlsByKeyWords(keyWords, urlList, dictScores):
    scores = [] 
    
    for url in urlList[1:]:
        videoID = url.split("v=")[1]
        print("Processing: ", url, "for item: ", urlList[0]) 
        scores.append(assessVideoByKeyWords(keyWords, videoID, dictScores))
    return sum(scores)/len(scores)
        
        
        

In [52]:
hondaCB650r = ["Honda CB650R",
               "https://www.youtube.com/watch?v=lx8ygAcWo7o&t=1054s",
              "https://www.youtube.com/watch?v=eOW9jgCahnk&t=371s",
              "https://www.youtube.com/watch?v=b3j_r8VWx40"]

mgv7iii = ["Moto Guzzi V7 III",
           "https://www.youtube.com/watch?v=vLmw2Rkzjfw",
          "https://www.youtube.com/watch?v=iddepwKWdVM",
          "https://www.youtube.com/watch?v=CytQq_Ivq0g",
          "https://www.youtube.com/watch?v=h61CF2V2eG8"]

hdStreetRod = ["Harley Davison Street Rod", 
               "https://www.youtube.com/watch?v=RlkaAt9pdlU",
              "https://www.youtube.com/watch?v=lJW-zEGvqGc",
              "https://www.youtube.com/watch?v=u3iHnO6QYz8"]

dctIcon = ["Ducati Icon",
          "https://www.youtube.com/watch?v=nWyiF1C1D70",
          "https://www.youtube.com/watch?v=RQOCjApCNQs",
          "https://www.youtube.com/watch?v=nHJg6GyyOWU"]

hsqVp701 = ["Husqvarna Vitpilen 701",
           "https://www.youtube.com/watch?v=PO2uFDS1P3A",
           "https://www.youtube.com/watch?v=GQK79vCohC4",
           "https://www.youtube.com/watch?v=5AR5PwffLzI",
           "https://www.youtube.com/watch?v=c0SL4pBJP4Y"]

#testSet = [hondaCB650r, mgv7iii, hdStreetRod, dctIcon,hsqVp701]
testSet = [["Essence Mascara", 
            "https://www.youtube.com/watch?v=dvx3JmCKvZI",
            "https://www.youtube.com/watch?v=37-oMmP9JdQ"],
           
           ["L'Oreal Lash Paradise Mascara",
          "https://www.youtube.com/watch?v=blCSsS1z9hI",
          "https://www.youtube.com/watch?v=EK9pVDna36A",
          "https://www.youtube.com/watch?v=tj_cDRbgtpQ"],
           
           ["Vivienne Sabo Cabaret",
          "https://www.youtube.com/watch?v=p-zEnxpr_Yo",
           "https://www.youtube.com/watch?v=vVp10dWccZc",
           "https://www.youtube.com/watch?v=dnkSPEUpMCE",
           "https://www.youtube.com/watch?v=oh2lE-s4dns"],
           
           ["Maybelline Lash Sensational",
          "https://www.youtube.com/watch?v=5Xnxvs4_VkY",
          "https://www.youtube.com/watch?v=rWDNz44bM68",
          "https://www.youtube.com/watch?v=Y8i7FoNv6XQ"]]
#SetUp
#keyWords = ["comfort","ergonomics","comfortable"]

#keyWords = ["flaking","clumping","flake","clump"]
keyWords =["inexpensive", "cheap", "affordable", "price","pay", "cost"]
dictScores = {"A":[5,["very", "extremely", "surprisingly","great", "much", "incredibly"]], 
              "B":[4,["pretty","good", "nice"]],
              "C":[3, ["enough","inexpensive", "cheap", "affordable","low","decent", "quite"]],
              "D":[-2,["weak","minimum","little"]]}

scoresForItems = []
for item in testSet:
    urlList = item
    scoresForItems.append(assessUrlsByKeyWords(keyWords, urlList, dictScores))


Processing:  https://www.youtube.com/watch?v=dvx3JmCKvZI for item:  Essence Mascara
['and', 'see', 'if', 'they', 'are', 'actually', 'any', 'decent', 'for', 'their', 'price', 'I', 'have', 'my', 'receipt', 'here', 'so', 'I', 'paid', '$6']
decent : 3
Score for the piece above: 3


['six', 'dollars', 'each', 'from', 'mascara', 'which', 'in', 'Australia', 'is', 'incredibly', 'cheap', 'because', 'in', 'a', 'way', "you're", 'looking', 'at', 'about', 'twenty']
incredibly : 5
cheap : 3
Score for the piece above: 8


['literally', 'maybe', 'like', 'a', '5', 'out', 'of', '10', 'for', 'that', 'price', "they're", 'decent', 'not', 'great', 'whatever', 'I', "don't", 'think', "they're"]
decent : 3
great : 5
Score for the piece above: 8.58


Total Score = 19.58  based on  3  pieces
Processing:  https://www.youtube.com/watch?v=37-oMmP9JdQ for item:  Essence Mascara
['it', 'is', 'three', 'or', 'four', 'ninety-nine', 'I', 'will', 'have', 'the', 'price', 'on', 'the', 'screen', 'right', 'now', 'I', 'bought'

In [51]:
print("\nTotal Results:\n")
for i,item in enumerate(testSet):
    print(item[0],":",scoresForItems[i])


Total Results:

Essence Mascara : 18.869999999999997
L'Oreal Lash Paradise Mascara : 7.333333333333333
Vivienne Sabo Cabaret : 1.0
Maybelline Lash Sensational : 4.0


In [56]:
prices = [4.99, 11, 6, 10]
r = []
for i,j in zip(prices, scoresForItems):
    r.append(i/j)
sum(r)/len(r)

1.7327768945416004

In [53]:
scoresForItems

[18.869999999999997, 7.333333333333333, 2.25, 4.0]

In [106]:
beautyWords = ["alluring",
"appealing",
"charming",
"cute",
"dazzling",
"delicate",
"delightful",
"elegant",
"exquisite",
"fascinating",
"fine",
"good-looking",
"gorgeous",
"graceful",
"grand",
"handsome",
"lovely",
"magnificent",
"marvelous",
"pleasing",
"splendid",
"stunning",
"superb",
"wonderful",
"admirable",
"angelic",
"beauteous",
"bewitching",
"classy",
"comely",
"divine",
"enticing",
"excellent",
"fair",
"foxy",
"ideal",
"nice",
"pulchritudinous",
"radiant",
"ravishing",
"refined",
"resplendent",
"shapely",
"sightly",
"statuesque",
"sublime",
"symmetrical",
"well-formed"]

In [47]:
C_words = ["not", "bad"]
"not" in C_words

True

In [69]:
assessPieceOfText(['we', 'can', 'relax', 'so', 'the', 'engine', 'has', 'large', 'amounts', 'of', 'torque', 'which', 'are', 'which', 'are', 'readily', 'accessible', 'no', 'problem', "let's"])


65