In [184]:
from youtube_transcript_api import YouTubeTranscriptApi
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request
import numpy as np
from charwords import veryWords, prettyWords, enoughWords, weakWords
from tqdm import tqdm
import nltk 
from nltk.corpus import wordnet 
from nltk.corpus import opinion_lexicon
#nltk.download('opinion_lexicon')
from nltk.sentiment import vader
#nltk.download('vader_lexicon')
from nltk.tokenize import treebank
from nltk.corpus import stopwords
from deepsegment import DeepSegment
from scipy.stats import hmean
import pandas as pd
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
from youtubesearchpython import VideosSearch
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [171]:
def getYouTubeLinksFromSearch(query, maxNumber = 5):
    """
    The function returns a list of youtube urls from the query results
    
    Input:
    query:      str
                a string for searching youtube videos
                
    maxNumber:  int, default: 5
                number of urls for output,

    Output:
    list(str): list of urls
    
    """
    videosSearch = VideosSearch(query, limit = 5, region='US')
    ids = []
    for i in videosSearch.result()['result']:
        ids.append('https://www.youtube.com/watch?v='+i['id'])
        #print(i['title'])
    return ids

#Test
# getYouTubeIDsFromSearch('iphone 11 pro')

## Fetching textual data

In [210]:
def getTextFromYoutubeCaptions(vidId):
    """
    The function gets text from captions in the YouTube video, ID of which is given as an input.

    Input:
    vidId: str
           YouTube Video ID

    Output:
    str: list of words from the captions
    """
    
    transcript_list = YouTubeTranscriptApi.list_transcripts(vidId)
    captions = ["NoText"]
    try:
        captions = YouTubeTranscriptApi.get_transcript(vidId, languages=['en'])
    except:
        #print("!Translating captions to English")
        transcript = transcript_list.find_transcript(['ru','it'])
        captions = transcript.translate('en').fetch()
        #print(captions)
        #input()
    text = ""
    #print(captions)
    for element in captions:
        text+=element['text']+" "
    text = text.replace("\n"," ")
    return text

#Testing function
# getTextFromYoutubeCaptions("eOW9jgCahnk")[:100]


In [174]:
def tag_visible(element):
    """
    The function defines if element is visible
    
    Input:
    html element
    
    Output:
    bool: True if visible, False otherwise
    """
    
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def getTextFromUrl(url):
    """
    The function returns visible text from html
    
    Input:
    html 
    
    Output:
    str: text
    """
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)
#Testing
# getTextFromUrl('https://www.ilpost.it/2021/03/19/cina-stati-uniti-alaska-blinken/')[:1000]

## Scoring functions
In this sections scoring functions are defined. They return score for a given piece of text. The higher the score the more positive text is. Further these functions will be used to assess pieces of text around keywords.

In [175]:
def assessPolarity(text):
    """
    Polarity assessment based on Liu and Hu opinion lexicon
    """
    stop_words = set(stopwords.words('english')) 
    tokenizer = treebank.TreebankWordTokenizer()
    wordsList = [word.lower() for word in tokenizer.tokenize(text)]
    wordsList = [word for word in wordsList if not word in stop_words]
    scores = []
    
    for word in wordsList:
        if word in set(opinion_lexicon.positive()):
            score = 1
        elif word in set(opinion_lexicon.negative()):
            score=-1
        else:
            score=0
        scores.append(score)
#     print(words)
#     print(scores)
    return np.sum(scores)

#Testing 
# text = getTextFromYoutubeCaptions("eOW9jgCahnk")
# assessPolarity(text)

In [176]:
def assessPolarityVader(text, split_sentences = False):
    """
    Polarity assessment based on Vader
    """
    sia = vader.SentimentIntensityAnalyzer()
    #it's better to feed vader with phrases or sentences
    #but as soon as caption text does not contain punctuation i used deepsegment library 
    #to split the caption text into sentences
    score = 0
    if split_sentences:
        segmenter = DeepSegment('en')
        sentenceList = segmenter.segment_long(text) 
        
        for sentence in sentenceList:
            s = sia.polarity_scores(sentence)['compound']
#             print(sentence)
#             print('Score:', s)
#             print('-'*20)
            score+=s
    else:
        score+=sia.polarity_scores(text)['compound']
    return score

#testing
# text = getTextFromYoutubeCaptions("eOW9jgCahnk")
# assessPolarityVader(text, split_sentences = True)

In [177]:
def antiScore(score):
    """
    A function that returns a penalyzing score for word "not" according to the function
    
    Input: 
    score (float): input score
    
    Output:
    float: a penalty to be added to the initial score if word "not" is present in the word list
    
    """
    
    return 0.58 - 1.15*score


def assessPolarityCustom(text, dictScores=None):
    """
    The function returns score for the piece of text given as input list of words according to dictScores 
    
    Input:
    pieceOfText (list(str)): keywords to search for
    dictScores (dict): a dictionary of words and corresponding scores in form: 
             {"A":[5,["very", "extremely", "surprisingly","great", "much", "incredibly"]], 
              "B":[4,["pretty","good", "nice"]],
              "C":[3, ["enough","inexpensive", "cheap", "affordable","low","decent", "quite"]],
              "D":[-2,["weak","minimum","little"]]}
    
    Output:
    float: score
    """
    if dictScores == None:
        dictScores = {"A":[5,["very", "extremely", "surprisingly","great", "much", "plenty","incredibly"]+veryWords], 
              "B":[4,["pretty","good", "nice"]+prettyWords],
              "C":[3, ["enough", "affordable","low","decent", "quite"]+enoughWords],
              "D":[-2,["weak","minimum","little"]+weakWords]}
        
    stop_words = set(stopwords.words('english')) 
    tokenizer = treebank.TreebankWordTokenizer()
    wordsList = [word.lower() for word in tokenizer.tokenize(text)]
    wordsList = [word for word in wordsList if not word in stop_words]
    
    score = 0
    lastScore = 0
    wordsList = list(set(wordsList))
    #print(wordsList)
    for word in wordsList:
        for key in dictScores:
            if word in dictScores[key][1]:
                lastScore = dictScores[key][0]
                #print(word,":",lastScore)
        if word == "not":
            lastScore = lastScore + antiScore(lastScore)
        score += lastScore
        lastScore=0
    return score

#Testing
# text = getTextFromYoutubeCaptions("eOW9jgCahnk")
# assessPolarityCustom(text)

## Working with keywords 
In this section the function is defined which allows to extract a set amount of words around a key word.

In [178]:
def getPieceByKeyWords(text, keyWords, backward=5, forward=5):
    """
    The function returns a piece of caption text in a form of list with a given range.
    
    Input:
    text (str): input list of the caption text
    keyWords (list(str)): keywords to search for
    backward (int): number of words to include before found keyword
    forward (int): number of words to include before found keyword
    
    Output:
    list(str): list of words close to a given keyword
    """
    stop_words = set(stopwords.words('english')) 
    tokenizer = treebank.TreebankWordTokenizer()
    wordsList = [word.lower() for word in tokenizer.tokenize(text)]
    
    lst = []
    i = 0
    
    for word in wordsList:
        if word in set(keyWords):
            rear = max(0, i - backward)
            front = min(len(wordsList)-1, i+forward)
            lst.append(" ".join(wordsList[rear:front]))
        i+=1
    return lst

#Testing
# text = getTextFromYoutubeCaptions("eOW9jgCahnk")
# getPieceByKeyWords(text, ['engine'])

In [207]:

def getScores(dct, keyWords):
    scoresDict = defaultdict()
    
    for url in dct['urls']:
        try:
            if 'youtube' in url:
                text = getTextFromYoutubeCaptions(url.split('=')[1])
            else:
                text = getTextFromUrl(url)
                
            for keyWord in keyWords:
                textList = getPieceByKeyWords(text,[keyWord])
                aP, aPV, aPC = 0, 0, 0
                for t in textList:
                    aP+=assessPolarity(t)
                    aPV+=assessPolarityVader(t)
                    aPC+=assessPolarityCustom(t)
                scoresDict[keyWord] = [aP,aPV,aPC]
        except:
            pass
        
    df = pd.DataFrame(scoresDict)
    df = df.T
    #mms = MinMaxScaler()
    #df = mms.fit_transform(df)
    df = pd.DataFrame(df.T, columns=keyWords)
    df2 = pd.DataFrame({'name':[dct['name']]})
    df2 = pd.concat([df2,pd.DataFrame(np.mean(df)).T],axis = 1)
    return df2

#Testing            
# dct = {'name':"Honda CB650R",'urls':["https://www.youtube.com/watch?v=PO2uFDS1P3A",
#            "https://www.youtube.com/watch?v=GQK79vCohC4",
#            "https://www.youtube.com/watch?v=5AR5PwffLzI",
#            "https://www.youtube.com/watch?v=c0SL4pBJP4Y",
#            "https://ridermagazine.com/2020/06/05/2020-husqvarna-vitpilen-701-road-test-review/"]}

# print(getScores(dct, ['comfort','engine', 'design', 'handling']))

In [179]:
def getSynonimList(word):
    """ 
    The function returns list of synonims base on wordnet.synsets of nltk library
    for the text provide by urlList based on key words and dictScores
    
    Input:
    word (str): input word
    
    Output:
    list(str)
    """
    
    synonyms = [] 

    for syn in wordnet.synsets(word): 
        for l in syn.lemmas(): 
            synonyms.append(l.name().lower()) 
    print(synonyms)    
    return (list(set(synonyms)))
#Testing
#getSynonimList('cost')

In [108]:
df = pd.DataFrame()
for m in tqdm(motoSet):
    df = pd.concat([df, getScores(m,kWords)])
df['total']=np.mean(df.iloc[:,1:], axis = 1)
df

100%|██████████| 5/5 [01:02<00:00, 12.58s/it]


Unnamed: 0,name,comfort,engine,torque,speed,design,handling,price,cost,total
0,Honda CB650R,0.1204,8.5715,3.615,0.801533,0.0,1.333333,1.836867,0.528633,2.100908
0,Moto Guzzi V7 III,0.0,2.225267,0.1468,0.1998,0.0,0.333333,1.807167,0.0,0.589046
0,Harley Davison Street Rod,0.0,0.8313,0.0,0.0,0.0,2.233433,0.0,0.0,0.383092
0,Ducati Icon,0.1204,0.5286,0.0,1.2023,0.190633,1.0,1.4673,0.130633,0.579983
0,Husqvarna Vitpilen 701,0.907467,0.590467,-0.1858,3.0711,4.134933,2.569867,0.9517,0.0,1.504967


In [115]:
df = pd.DataFrame()
for m in tqdm(carSet):
    df = pd.concat([df, getScores(m,carKeyWords)])
df['total']=np.mean(df.iloc[:,1:], axis = 1)
df

 50%|█████     | 1/2 [00:09<00:09,  9.44s/it]

!Translating captions to English


100%|██████████| 2/2 [00:16<00:00,  8.17s/it]


Unnamed: 0,name,reliability,quality,price,design,cost,engine,acceleration,speed,total
0,Kia K5,0.0,1.813467,0.0,2.186367,0.0,0.0,0.0,0.0,0.499979
0,Toyota Camry,0.0,0.545633,1.8488,2.3352,0.0,1.333333,0.0,0.0,0.757871


In [125]:
df = pd.DataFrame()
for m in tqdm(MascaraLst):
    df = pd.concat([df, getScores(m,mascaraKeyWords)])
df['total']=np.mean(df.iloc[:,1:], axis = 1)
df

 50%|█████     | 2/4 [00:05<00:05,  2.65s/it]

!Translating captions to English
!Translating captions to English
!Translating captions to English


100%|██████████| 4/4 [00:15<00:00,  3.86s/it]


Unnamed: 0,name,price,quality,color,hold,total
0,Essence Mascara,6.0091,0.0,0.0,0.528633,1.634433
0,L'Oreal Lash Paradise Mascara,0.480133,0.0,0.333333,0.0,0.203367
0,Vivienne Sabo Cabaret,0.0,0.0,0.091067,0.0,0.022767
0,Maybelline Lash Sensational,0.0,0.0,0.0,0.0,0.0


In [128]:
smartphonesSet =[{'name':'Samsung A52', 'urls':['https://www.youtube.com/watch?v=oPLZ0IjS4V8', 
                                               'https://www.youtube.com/watch?v=wdnFuV8bhP8', 
                                               'https://www.youtube.com/watch?v=GLIwBS5BnJI']},
               {'name':'Xiaomi Mi 10 Pro', 'urls':['https://www.youtube.com/watch?v=qEU1vBdP7H4',
                                                   'https://www.youtube.com/watch?v=VbFjbmB2upU',
                                                  'https://www.youtube.com/watch?v=4D7DEkIEH_g']},
               {'name':'iPhone 11 Pro','urls':['https://www.youtube.com/watch?v=nxf41fMX_Y4',
                                  'https://www.youtube.com/watch?v=0gzyuC2YlTE',
                                 'https://www.youtube.com/watch?v=DyX-QZZBgpw']}]
smartphonesKeyWords = ['price','camera','screen','surface','size','battery','sound','design']
df = pd.DataFrame()
for m in tqdm(smartphonesSet):
    df = pd.concat([df, getScores(m,smartphonesKeyWords)])
df['total']=np.mean(df.iloc[:,1:], axis = 1)
df

100%|██████████| 3/3 [00:21<00:00,  7.02s/it]


Unnamed: 0,name,price,camera,screen,surface,size,battery,sound,design,total
0,Samsung A52,2.922367,6.750033,2.3552,0.0,0.0,0.0,0.0,0.0,1.50345
0,Xiaomi Mi 10 Pro,0.0,1.209733,0.3967,0.0,0.164233,0.5309,0.409,0.0,0.338821
0,iPhone 11 Pro,1.666667,5.618833,0.274033,0.0,0.0,9.5616,5.6269,0.0,2.843504


In [6]:
keyWords = carKeyWords
dictScores = {"A":[5,["very", "extremely", "surprisingly","great", "much", "plenty","incredibly"]+veryWords], 
              "B":[4,["pretty","good", "nice"]+prettyWords],
              "C":[3, ["enough", "affordable","low","decent", "quite"]+keyWords+enoughWords],
              "D":[-2,["weak","minimum","little"]+weakWords]}

In [22]:
groupScore2(carSet, carKeyWords, dictScores)

Processing:  https://www.youtube.com/watch?v=uTsRyAtLQIE for item:  Kia K5
---------------PASSED: https://www.youtube.com/watch?v=uTsRyAtLQIE
Processing:  https://www.youtube.com/watch?v=VDlnLDppTcY for item:  Kia K5
---------------PASSED: https://www.youtube.com/watch?v=VDlnLDppTcY
Processing:  https://www.youtube.com/watch?v=DLhqrHG3138 for item:  Kia K5
---------------PASSED: https://www.youtube.com/watch?v=DLhqrHG3138
Processing:  https://www.youtube.com/watch?v=Olvj3ooBvRI for item:  Kia K5
---------------PASSED: https://www.youtube.com/watch?v=Olvj3ooBvRI


ZeroDivisionError: division by zero

In [None]:

def getSetScores(productNames, productKeyWords, plot = False):
    
    productSet = []
    for product in productNames:
        dct = defaultdict()
        dct['name']=product
        dct['urls']=getYouTubeLinksFromSearch(product)
        productSet.append(dct)
    
    df = pd.DataFrame()
    for m in productSet:
        df = pd.concat([df, getScores(m,productKeyWords)])
        
    df['total']=np.mean(df.iloc[:,1:], axis = 1)
    if plot:
        numOfPlots = int(np.sum([np.sum(df[x].sum()!=0)>0 for x in df.columns[1:]]))
        
        fig = make_subplots(rows=numOfPlots, cols=1,
                    vertical_spacing=0.02)
        
        r=1
        for n in df.columns[1:]:
            if np.sum(df[n].sum()!=0)>0:
                fig.add_trace(go.Bar(x=df['name'], y=df[n], name = n),
                              row=r, col=1,)
                r+=1
        fig.update_layout(height=200*numOfPlots)
        fig.show()
        fig.write_html("result.html")
    
    df.to_html('df.html')
    return df

In [None]:
# productKeyWords = ["reliability", "quality", 'reliable', 'design', 'cost', 'engine', 'acceleration', 'speed']
# productNames = ['volvo xc60', 'bmw x3', 'audi q5', 'cadillac xt5', 'mini countryman']
productKeyWords = ['price','camera','screen','materials','body','size','battery','sound','design']
productNames = ['iphone 11 pro','Samsung A52','Xiaomi Mi 10 Pro','iphone 12 pro', 'google pixel 5', 'google pixel 4 xl']
getSetScores(productNames,productKeyWords,plot=True)

links obtained
