In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request
import numpy as np
from charwords import veryWords, prettyWords, enoughWords, weakWords
from tqdm import tqdm
import nltk 
from nltk.corpus import wordnet, opinion_lexicon, stopwords

from nltk.sentiment import vader

from nltk.tokenize import treebank

from deepsegment import DeepSegment
from scipy.stats import hmean
import pandas as pd
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
from youtubesearchpython import VideosSearch
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

#nltk.download('opinion_lexicon')
#nltk.download('vader_lexicon')

In [4]:
pip install pipreqs

Collecting pipreqs
  Downloading pipreqs-0.4.10-py2.py3-none-any.whl (25 kB)
Collecting yarg
  Downloading yarg-0.1.9-py2.py3-none-any.whl (19 kB)
Installing collected packages: yarg, pipreqs
Successfully installed pipreqs-0.4.10 yarg-0.1.9
Note: you may need to restart the kernel to use updated packages.


In [15]:
pip install youtube_transcript_api -U

Collecting youtube_transcript_api
  Downloading youtube_transcript_api-0.4.1-py3-none-any.whl (22 kB)
Installing collected packages: youtube-transcript-api
  Attempting uninstall: youtube-transcript-api
    Found existing installation: youtube-transcript-api 0.3.1
    Uninstalling youtube-transcript-api-0.3.1:
      Successfully uninstalled youtube-transcript-api-0.3.1
Successfully installed youtube-transcript-api-0.4.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
def getYouTubeLinksFromSearch(query, maxNumber = 15):
    """
    The function returns a list of youtube urls from the query results
    
    Input:
    query:      str
                a string for searching youtube videos
                
    maxNumber:  int, default: 5
                number of urls for output,

    Output:
    list(str): list of urls
    
    """
    videosSearch = VideosSearch(query, limit = maxNumber, region='US')
    ids = []
    for i in videosSearch.result()['result']:
        ids.append('https://www.youtube.com/watch?v='+i['id'])
        #print(i['title'])
    return ids

#Test
getYouTubeLinksFromSearch('iphone 11 pro')

['https://www.youtube.com/watch?v=fSOvG-GeU48',
 'https://www.youtube.com/watch?v=9tenInPVNMM',
 'https://www.youtube.com/watch?v=45WjaQQlUoY',
 'https://www.youtube.com/watch?v=nxf41fMX_Y4',
 'https://www.youtube.com/watch?v=5jxkiqhXVYc',
 'https://www.youtube.com/watch?v=_Brp0xnmPKg',
 'https://www.youtube.com/watch?v=slMfX6FPpQo',
 'https://www.youtube.com/watch?v=kLH5MX1UiW0',
 'https://www.youtube.com/watch?v=KdqKCnUnRCY',
 'https://www.youtube.com/watch?v=s1wlPXzOoTk',
 'https://www.youtube.com/watch?v=0gzyuC2YlTE',
 'https://www.youtube.com/watch?v=u49nqZ4AbuM',
 'https://www.youtube.com/watch?v=-YyeaaDOWho',
 'https://www.youtube.com/watch?v=DyX-QZZBgpw',
 'https://www.youtube.com/watch?v=LsR2MQJIgRg']

## Fetching textual data

In [3]:
def getTextFromYoutubeCaptions(vidId):
    """
    The function gets text from captions in the YouTube video, ID of which is given as an input.

    Input:
    vidId: str
           YouTube Video ID

    Output:
    str: list of words from the captions
    """
    
    transcript_list = YouTubeTranscriptApi.list_transcripts(vidId)
    captions = ["NoText"]
    try:
        captions = YouTubeTranscriptApi.get_transcript(vidId, languages=['en'])
    except:
        #print("!Translating captions to English")
        transcript = transcript_list.find_transcript(['ru','it'])
        captions = transcript.translate('en').fetch()
        #print(captions)
        #input()
    text = ""
    #print(captions)
    for element in captions:
        text+=element['text']+" "
    text = text.replace("\n"," ")
    return text

#Testing function
getTextFromYoutubeCaptions("eOW9jgCahnk")[:100]


"what's going on everybody welcome to another episode of the beginner bike giveaway series if you've "

In [4]:
def tag_visible(element):
    """
    The function defines if element is visible
    
    Input:
    html element
    
    Output:
    bool: True if visible, False otherwise
    """
    
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def getTextFromUrl(url):
    """
    The function returns visible text from html
    
    Input:
    html 
    
    Output:
    str: text
    """
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)
#Testing
# getTextFromUrl('https://www.ilpost.it/2021/03/19/cina-stati-uniti-alaska-blinken/')[:1000]

## Scoring functions
In this sections scoring functions are defined. They return score for a given piece of text. The higher the score the more positive text is. Further these functions will be used to assess pieces of text around keywords.

In [5]:
def assessPolarity(text):
    """
    Polarity assessment based on Liu and Hu opinion lexicon
    """
    stop_words = set(stopwords.words('english')) 
    tokenizer = treebank.TreebankWordTokenizer()
    wordsList = [word.lower() for word in tokenizer.tokenize(text)]
    wordsList = [word for word in wordsList if not word in stop_words]
    scores = []
    
    for word in wordsList:
        if word in set(opinion_lexicon.positive()):
            score = 1
        elif word in set(opinion_lexicon.negative()):
            score=-1
        else:
            score=0
        scores.append(score)
#     print(words)
#     print(scores)
    return np.sum(scores)

#Testing 
# text = getTextFromYoutubeCaptions("eOW9jgCahnk")
# assessPolarity(text)

In [6]:
def assessPolarityVader(text, split_sentences = False):
    """
    Polarity assessment based on Vader
    """
    sia = vader.SentimentIntensityAnalyzer()
    #it's better to feed vader with phrases or sentences
    #but as soon as caption text does not contain punctuation i used deepsegment library 
    #to split the caption text into sentences
    score = 0
    if split_sentences:
        segmenter = DeepSegment('en')
        sentenceList = segmenter.segment_long(text) 
        
        for sentence in sentenceList:
            s = sia.polarity_scores(sentence)['compound']
#             print(sentence)
#             print('Score:', s)
#             print('-'*20)
            score+=s
    else:
        score+=sia.polarity_scores(text)['compound']
    return score

#testing
# text = getTextFromYoutubeCaptions("eOW9jgCahnk")
# assessPolarityVader(text, split_sentences = True)

In [7]:
def antiScore(score):
    """
    A function that returns a penalyzing score for word "not" according to the function
    
    Input: 
    score (float): input score
    
    Output:
    float: a penalty to be added to the initial score if word "not" is present in the word list
    
    """
    
    return 0.58 - 1.15*score


def assessPolarityCustom(text, dictScores=None):
    """
    The function returns score for the piece of text given as input list of words according to dictScores 
    
    Input:
    pieceOfText (list(str)): keywords to search for
    dictScores (dict): a dictionary of words and corresponding scores in form: 
             {"A":[5,["very", "extremely", "surprisingly","great", "much", "incredibly"]], 
              "B":[4,["pretty","good", "nice"]],
              "C":[3, ["enough","inexpensive", "cheap", "affordable","low","decent", "quite"]],
              "D":[-2,["weak","minimum","little"]]}
    
    Output:
    float: score
    """
    if dictScores == None:
        dictScores = {"A":[5,["very", "extremely", "surprisingly","great", "much", "plenty","incredibly"]+veryWords], 
              "B":[4,["pretty","good", "nice"]+prettyWords],
              "C":[3, ["enough", "affordable","low","decent", "quite"]+enoughWords],
              "D":[-2,["weak","minimum","little"]+weakWords]}
        
    stop_words = set(stopwords.words('english')) 
    tokenizer = treebank.TreebankWordTokenizer()
    wordsList = [word.lower() for word in tokenizer.tokenize(text)]
    wordsList = [word for word in wordsList if not word in stop_words]
    
    score = 0
    lastScore = 0
    wordsList = list(set(wordsList))
    #print(wordsList)
    for word in wordsList:
        for key in dictScores:
            if word in dictScores[key][1]:
                lastScore = dictScores[key][0]
                #print(word,":",lastScore)
        if word == "not":
            lastScore = lastScore + antiScore(lastScore)
        score += lastScore
        lastScore=0
    return score

#Testing
# text = getTextFromYoutubeCaptions("eOW9jgCahnk")
# assessPolarityCustom(text)

## Working with keywords 
In this section the function is defined which allows to extract a set amount of words around a key word.

In [8]:
def getPieceByKeyWords(text, keyWords, backward=5, forward=5):
    """
    The function returns a piece of caption text in a form of list with a given range.
    
    Input:
    text (str): input list of the caption text
    keyWords (list(str)): keywords to search for
    backward (int): number of words to include before found keyword
    forward (int): number of words to include before found keyword
    
    Output:
    list(str): list of words close to a given keyword
    """
    stop_words = set(stopwords.words('english')) 
    tokenizer = treebank.TreebankWordTokenizer()
    wordsList = [word.lower() for word in tokenizer.tokenize(text)]
    
    lst = []
    i = 0
    
    for word in wordsList:
        if word in set(keyWords):
            rear = max(0, i - backward)
            front = min(len(wordsList)-1, i+forward)
            lst.append(" ".join(wordsList[rear:front]))
        i+=1
    return lst

#Testing
# text = getTextFromYoutubeCaptions("eOW9jgCahnk")
# getPieceByKeyWords(text, ['engine'])

In [9]:

def getScores(dct, keyWords):
    scoresDict = defaultdict()
    dctForHist = {'name':[],'keyword':[], 'aP':[], 'aPV':[],'aPC':[]}
    
    for url in dct['urls']:
        try:
            if 'youtube' in url:
                text = getTextFromYoutubeCaptions(url.split('=')[1])
            else:
                text = getTextFromUrl(url)
                
            for keyWord in keyWords:
                textList = getPieceByKeyWords(text,[keyWord])
                aP, aPV, aPC = 0, 0, 0
                for t in textList:
                    aP+=assessPolarity(t)
                    aPV+=assessPolarityVader(t)
                    aPC+=assessPolarityCustom(t)
                scoresDict[keyWord] = [aP,aPV,aPC]
                dctForHist['name'].append(dct['name'])
                dctForHist['keyword'].append(keyWord)
                dctForHist['aP'].append(aP)
                dctForHist['aPV'].append(aPV)
                dctForHist['aPC'].append(aPC)
        except:
            pass
    df3 = pd.DataFrame(dctForHist)  
    df3['mean']=np.mean(df3[['aP', 'aPV', 'aPC']],axis=1)
    
    df = pd.DataFrame(scoresDict)
    df = df.T
    #mms = MinMaxScaler()
    #df = mms.fit_transform(df)
    df = pd.DataFrame(df.T, columns=keyWords)
    df2 = pd.DataFrame({'name':[dct['name']]})
    df2 = pd.concat([df2,pd.DataFrame(np.mean(df)).T],axis = 1)
    return df2, df3

#Testing  
product = "Honda CB650R"
dct = {'name':product,'urls':getYouTubeLinksFromSearch(product)}

df2,df3 = getScores(dct, ['comfort','engine', 'design', 'handling'])

In [11]:
df3

Unnamed: 0,name,keyword,aP,aPV,aPC,mean


In [275]:
px.histogram(df3[df3['mean']!=0],x='mean', facet_col='keyword', facet_row='name', labels={'mean':'mean score'})

In [12]:
def getSynonimList(word):
    """ 
    The function returns list of synonims base on wordnet.synsets of nltk library
    for the text provide by urlList based on key words and dictScores
    
    Input:
    word (str): input word
    
    Output:
    list(str)
    """
    
    synonyms = [] 

    for syn in wordnet.synsets(word): 
        for l in syn.lemmas(): 
            synonyms.append(l.name().lower()) 
    print(synonyms)    
    return (list(set(synonyms)))
#Testing
#getSynonimList('cost')

In [13]:
def getSetScores(productNames, productKeyWords, plot = False):
    """
    The function returns a dataframe with scores for products according to key words.
    
    Input:
    productNames    (list): input list of strings with product names
    productKeyWords (list): input list of strings with key words
    plot            (bool): default=False, if true plots barplots with scores and saves them to HTML ("result.html")
    
    Output:
    pandas dataframe with scores
    """
    productSet = []
    for product in productNames:
        dct = defaultdict()
        dct['name']=product
        dct['urls']=getYouTubeLinksFromSearch(product)
        productSet.append(dct)
    
    df = pd.DataFrame()
    for m in productSet:
        df = pd.concat([df, getScores(m,productKeyWords)])
        
    df['total']=np.mean(df.iloc[:,1:], axis = 1)
    if plot:
        numOfPlots = int(np.sum([np.sum(df[x].sum()!=0)>0 for x in df.columns[1:]]))
        
        fig = make_subplots(rows=numOfPlots, cols=1,
                    vertical_spacing=0.04)
        
        r=1
        for n in df.columns[1:]:
            if np.sum(df[n].sum()!=0)>0:
                fig.add_trace(go.Bar(x=df['name'], y=df[n], name = n),
                              row=r, col=1,)
                r+=1
        fig.update_layout(height=200*numOfPlots)
        fig.show()
        fig.write_html("result.html")
    
    df.to_html('df.html')
    return df

In [14]:
def getSetScoresText(productNames, productKeyWords, plot = False, maxNumber = 15):
    """
    The function returns a dataframe with scores for products according to key words.
    
    Input:
    productNames    (str): input string with product names separated by \n
    productKeyWords (str): input string with key words separated by \n
    plot            (bool): default=False, if true plots barplots with scores and saves them to HTML ("result.html")
    
    Output:
    pandas dataframe with scores
    """
    productNames = productNames.split('\n')
    productKeyWords = productKeyWords.split('\n')
    productSet = []
    for product in productNames:
        dct = defaultdict()
        dct['name']=product
        dct['urls']=getYouTubeLinksFromSearch(product, maxNumber=maxNumber)
        productSet.append(dct)
    
    df = pd.DataFrame()
    df2 = pd.DataFrame()
    for m in productSet:
        df = pd.concat([df, getScores(m,productKeyWords)[0]])
        df2 = pd.concat([df2, getScores(m,productKeyWords)[1]])
        
    df['total']=np.mean(df.iloc[:,1:], axis = 1)
    if plot:
        numOfPlots = int(np.sum([np.sum(df[x].sum()!=0)>0 for x in df.columns[1:]]))
        
        fig = make_subplots(rows=numOfPlots, cols=1,
                    vertical_spacing=0.04)
        
        r=1
        for n in df.columns[1:]:
            if np.sum(df[n].sum()!=0)>0:
                fig.add_trace(go.Bar(x=df['name'], y=df[n], name = n),
                              row=r, col=1,)
                r+=1
        fig.update_layout(height=200*numOfPlots)
        #fig.show()
        fig.write_html("result.html")
        
        #Histograms
        fig = px.histogram(df2[df2['mean']!=0],x='mean', facet_col='keyword', facet_row='name', labels={'mean':'mean score'})
        fig.write_html("result2.html")
    
    df.to_html('df.html')
    return df, df2

In [113]:
productNames = """volvo xc60
bmw x3
audi q5
cadillac xt5"""
productKeyWords = """reliability
quality
comfort
design
cost
engine
acceleration"""

df3 = getSetScoresText(productNames,productKeyWords,plot=True, maxNumber=30)[1]

In [114]:
df3

Unnamed: 0,name,keyword,aP,aPV,aPC,mean
0,volvo xc60,reliability,0,0.0000,0,0.000000
1,volvo xc60,quality,0,0.0000,0,0.000000
2,volvo xc60,comfort,2,1.1613,0,1.053767
3,volvo xc60,design,-1,0.0000,0,-0.333333
4,volvo xc60,cost,0,0.0000,4,1.333333
...,...,...,...,...,...,...
114,cadillac xt5,comfort,0,0.0000,0,0.000000
115,cadillac xt5,design,2,0.7402,0,0.913400
116,cadillac xt5,cost,0,0.0000,0,0.000000
117,cadillac xt5,engine,0,0.0000,0,0.000000


In [115]:
from scipy.stats import ttest_ind
from plotly.subplots import make_subplots

In [116]:
a = df3[(df3['name']=='volvo xc60') & (df3['keyword']=='design')]['mean']
b = df3[(df3['name']=='cadillac xt5') & (df3['keyword']=='design')]['mean']
print(np.var(a))
print(np.var(b))
print(ttest_ind(a,b, equal_var=False).pvalue)

2.120184581220851
1.1663294183314108
0.2404055729134673


In [117]:
thresholdPvalue = 0.1
#df5 = pd.DataFrame()
#f = make_subplots(rows=1, cols=len(df3['keyword'].unique()), 
#                  subplot_titles=df3['keyword'].unique(), shared_yaxes=True)
# c = 0
dct2 = {'name_pair':[],'keyword':[],'pvalue':[]}
for k in df3['keyword'].unique():
#     c+=1
    pvalues = []
    for n1 in df3['name'].unique():
        pv = []
        for n2 in df3['name'].unique():
            a = df3[(df3['name']==n1) & (df3['keyword']==k)]['mean']
            b = df3[(df3['name']==n2) & (df3['keyword']==k)]['mean']
            pv.append(ttest_ind(a,b,equal_var=False).pvalue)
            if n1!=n2:
                dct2['name_pair'].append(tuple(np.sort([n1,n2])))
                dct2['pvalue'].append(ttest_ind(a,b, equal_var=False).pvalue)
                dct2['keyword'].append(k)
        pvalues.append(pv)

#     dfT = pd.DataFrame(pvalues, index = df3['name'].unique(), columns=df3['name'].unique())
#     print(dfT)
#     dfT['keyword'] = [k for i in range(len(dfT))]
#     df5 = pd.concat([df5, dfT])
#     fi = px.imshow(df5.iloc[:,:-1], color_continuous_scale=[(0.00, "#ADFF2F"),   (thresholdPvalue, "#ADFF2F"),
#                                                      (0.33, "#D3D3D3"), (0.66, "#D3D3D3"),
#                                                      (0.66, "#D3D3D3"),  (1.00, "#D3D3D3")], title=k, zmin=0, zmax=1)
    
#     f.add_trace(fi.data[0], row = 1, col = c)
# f.show()
# f.update_layout(width=50*len(df3['keyword'].unique()),
#         height=50)

df7=pd.DataFrame(dct2)

In [118]:
df8 = pd.DataFrame()
for k in df7['keyword'].unique():
    df8 = pd.concat([df8,df7[df7['keyword']==k].drop_duplicates(subset='name_pair')])
df8 = df8.sort_values(by='pvalue')
df8 = df8[df8['pvalue']<0.3]
df8

Unnamed: 0,name_pair,keyword,pvalue
61,"(audi q5, volvo xc60)",engine,0.005694
60,"(bmw x3, volvo xc60)",engine,0.035448
2,"(cadillac xt5, volvo xc60)",reliability,0.070595
53,"(bmw x3, cadillac xt5)",cost,0.0727
29,"(bmw x3, cadillac xt5)",comfort,0.078093
62,"(cadillac xt5, volvo xc60)",engine,0.086974
1,"(audi q5, volvo xc60)",reliability,0.127068
68,"(audi q5, cadillac xt5)",engine,0.143047
50,"(cadillac xt5, volvo xc60)",cost,0.144544
24,"(bmw x3, volvo xc60)",comfort,0.156175


In [119]:
f = px.box(df3, y='mean', facet_row='keyword', x='name', color='keyword')
f.update_layout(height=1000)
f.show()

## Image fetching

In [57]:
dfIm2= dfIm.copy()

In [1]:
from google_images_search import GoogleImagesSearch

In [25]:
gkey='AIzaSyCM5mlR6oXN6zCD0bk-PAi5hjz6NfpBfW4'
gcx = '96941285ce448aa86'

images = []
for i in dfIm['name']:
    gis = GoogleImagesSearch(gkey, gcx)
    _search_params = {
        'q': i,
        'num': 1
    }
    gis.search(search_params=_search_params)
    images.append(gis.results()[0].url)
images
#path='/Users/inotin/Dropbox/DS/Personal/reviewAnalyzer/images'
# this will only search for images:
#gis.search(search_params=_search_params)

# this will search and download:

# i=0
# for image in gis.results()[:5]:
#     i+=1
#     image.download(path+str(i))
#     image.resize(500, 500)
# this will search, download and resize:
#gis.search(search_params=_search_params, path_to_dir=path)

# search first, then download and resize afterwards:
# gis.search(search_params=_search_params)
# for image in gis.results():
#     image.download('/path/')
#     image.resize(500, 500)

['https://www.volvocars.com/images/v/us/v/-/media/project/contentplatform/data/media/pdp/xc60-fuel/xc60-og.jpg?h=630&iar=0&w=1200',
 'https://upload.wikimedia.org/wikipedia/commons/5/5d/2018_BMW_X3_%28G01%29_xDrive30i_wagon_%282018-11-02%29_01.jpg',
 'https://thumbor.forbes.com/thumbor/fit-in/960x720/filters:format(jpg)/https://www.forbes.com/wheels/wp-content/uploads/2020/09/2021-audi-q5-sportback-coupe-5.png',
 'https://hips.hearstapps.com/hmg-prod.s3.amazonaws.com/images/2021-cadillac-xt5-mmp-1-1597436882.jpg?crop=0.796xw:0.599xh;0.0737xw,0.243xh&resize=1200:*']

In [45]:
dfIm2 = dfIm2.reset_index()
dfIm2 = dfIm2.drop(columns='index')
dfIm2

Unnamed: 0,name,reliability,quality,comfort,design,cost,engine,acceleration,total
0,volvo xc60,0.0,4.201533,0.907467,0.0,0.4244,4.207367,0.0,1.391538
1,bmw x3,0.503533,0.0,0.695233,0.4606,4.1665,-1.5909,0.0,0.604995
2,audi q5,0.0,4.4441,0.0,0.3548,0.0,0.518767,0.0,0.759667
3,cadillac xt5,0.0,0.0,1.2502,0.0,0.0,0.810267,0.0,0.294352


In [61]:
from IPython.core.display import display,HTML
from google_images_search import GoogleImagesSearch

gkey='AIzaSyCM5mlR6oXN6zCD0bk-PAi5hjz6NfpBfW4'
gcx = '96941285ce448aa86'

images = []
for i in dfIm2['name']:
    gis = GoogleImagesSearch(gkey, gcx)
    _search_params = {
        'q': i,
        'num': 1
    }
    gis.search(search_params=_search_params)
    images.append(gis.results()[0].url)
    
dfIm2 = dfIm2.reset_index()
dfIm2 = dfIm2.drop(columns='index')
# your images
colOrder = ['image']+list(dfIm2.columns)
dfIm2['image'] = images

# convert your links to html tags 
def path_to_image_html(path):
    return '<img src="'+ path + '" width="200" >'

pd.set_option('display.max_colwidth', None)
dfIm2 = dfIm2[colOrder]
#HTML(dfIm2.style.hide_index().set_properties(**{'font-size': '11pt','background-color': '#edeeef','border-color': 'black','border-style' :'solid' ,'border-width': '0px','border-collapse':'collapse'}).render())
dfIm2.to_html('dfIm.html', escape=False ,formatters=dict(image=path_to_image_html))

In [56]:
dfIm2

Unnamed: 0,image,name
0,https://www.volvocars.com/images/v/us/v/-/media/project/contentplatform/data/media/pdp/xc60-fuel/xc60-og.jpg?h=630&iar=0&w=1200,volvo xc60
1,https://upload.wikimedia.org/wikipedia/commons/5/5d/2018_BMW_X3_%28G01%29_xDrive30i_wagon_%282018-11-02%29_01.jpg,bmw x3
2,https://thumbor.forbes.com/thumbor/fit-in/960x720/filters:format(jpg)/https://www.forbes.com/wheels/wp-content/uploads/2020/09/2021-audi-q5-sportback-coupe-5.png,audi q5
3,"https://hips.hearstapps.com/hmg-prod.s3.amazonaws.com/images/2021-cadillac-xt5-mmp-1-1597436882.jpg?crop=0.796xw:0.599xh;0.0737xw,0.243xh&resize=1200:*",cadillac xt5


In [60]:
['image']+list(dfIm2.columns)

['image',
 'name',
 'reliability',
 'quality',
 'comfort',
 'design',
 'cost',
 'engine',
 'acceleration',
 'total']

In [59]:
dfIm2.columns

Index(['name', 'reliability', 'quality', 'comfort', 'design', 'cost', 'engine',
       'acceleration', 'total'],
      dtype='object')

In [62]:
df3

Unnamed: 0,name,keyword,aP,aPV,aPC,mean
0,Honda CB650R,comfort,0,0.0,0,0.0
1,Honda CB650R,engine,1,-0.296,3,1.234667
2,Honda CB650R,design,2,0.3524,4,2.117467
3,Honda CB650R,handling,0,0.0,0,0.0
4,Honda CB650R,comfort,3,0.8074,4,2.602467
5,Honda CB650R,engine,8,3.1004,13,8.033467
6,Honda CB650R,design,0,0.0,0,0.0
7,Honda CB650R,handling,3,0.8316,9,4.2772
8,Honda CB650R,comfort,0,0.0,0,0.0
9,Honda CB650R,engine,0,0.0,0,0.0
