In [1]:
from bs4 import BeautifulSoup as bs
import codecs
import json
from urllib.request import urlopen
from tqdm import tqdm
import re
import pandas as pd
from datetime import datetime

In [2]:
youtube_api_key = ""
melon_soup = bs(codecs.open("tnd.html", 'r').read(),'html.parser')
video_ids = [vid['href'].replace('/watch?v=','') for vid in melon_soup.findAll('a', {'class':'yt-simple-endpoint style-scope ytd-grid-video-renderer'}, href=True)]
melon_reviews_raw = [json.loads(urlopen(f"https://youtube.googleapis.com/youtube/v3/videos?part=snippet%2CcontentDetails%2Cstatistics%2CtopicDetails%2Cid&id={id}&key={youtube_api_key}").read()) for id in tqdm(video_ids[:2000], desc='mining_reviews')]

mining_reviews: 100%|██████████| 2000/2000 [09:28<00:00,  3.52it/s]


In [3]:
melon_reviews = []
for rev in melon_reviews_raw:
    title = rev['items'][0]['snippet']['title']
    if 'album review' in title.lower():

        orig_desc = rev['items'][0]['snippet']['description']
        clean_desc = re.sub(r'http\S+', '<URL>', orig_desc).replace('==','').replace('\n',' ').lower()

        split_title = title.split('-')
        if len(split_title)>=2:
            artist,album = split_title[0].strip(),split_title[1].strip()
        else:
            artist,album = split_title[0],split_title[0]

        score = re.compile(r'[0-9]+\/10').search(orig_desc)
        if score:
            score = int(score.group(0).split('/')[0])
            if score>10 or score<0: score=None

        genres = [p for p in orig_desc.split('\n\n') if ' / ' in p][0].split(' / ')[-1].lower().split(', ')

        melon_reviews.append({
            'id':rev['items'][0]['id'],
            'artist':artist,
            'album':album.replace(' ALBUM REVIEW',''),
            'date':rev['items'][0]['snippet']['publishedAt'],
            'duration':rev['items'][0]['contentDetails']['duration'],
            'description':clean_desc.lower(),
            'thumbnail':rev['items'][0]['snippet']['thumbnails']['high']['url'],
            # 'tags':set(rev['items'][0]['snippet']['tags']),
            'views':rev['items'][0]['statistics']['viewCount'],
            'score':score,
            'genre':genres
        })

IndexError: list index out of range

In [None]:
import cv2
from tensorflow.keras.models import load_model
import tensorflow as tf
import numpy as np
from colorthief import ColorThief
from math import sqrt

class fashion_tools(object):
    def __init__(self,model,version=1.1):
        self.model   = model
        self.version = version

    def get_dress(self,file,stack):
        """limited to top wear and full body dresses (wild and studio working)"""
        """takes input rgb----> return PNG"""
        file = tf.image.resize_with_pad(file,target_height=512,target_width=512)
        rgb  = file.numpy()
        file = np.expand_dims(file,axis=0)/ 255.
        seq = self.model.predict(file)
        seq = seq[3][0,:,:,0]
        seq = np.expand_dims(seq,axis=-1)
        c1x = rgb*seq
        c2x = rgb*(1-seq)
        cfx = c1x+c2x
        dummy = np.ones((rgb.shape[0],rgb.shape[1],1))
        rgbx = np.concatenate((rgb,dummy*255),axis=-1)
        rgbs = np.concatenate((cfx,seq*255.),axis=-1)
        if stack:
            stacked = np.hstack((rgbx,rgbs))
            return stacked
        else:
            return rgbs

    def get_patch(self):
        return None

api    = fashion_tools(load_model("save_ckp_frozen.h5"))

In [None]:
def colorSim(rgb1, rgb2):
    r, g, b = rgb1
    cr, cg, cb = rgb2
    d=sqrt((r - cr)**2 + (g - cg)**2 + (b - cb)**2)
    return d/sqrt((255)^2+(255)^2+(255)^2)

def shirtDetect(rgb1,rgb2):
    types = {'yellow_check':((178, 161, 130),(55, 43, 48)),
             'red_check':((99, 23, 34), (209, 45, 72)),
             'blue_check':((56, 71, 120), (17, 7, 19)),
             'white_check':((190, 184, 181), (42, 28, 33))
             }
    pred = sorted([(k,(colorSim(rgb1,v[0])+colorSim(rgb2,v[1]))/2) for k,v in types.items()],key=lambda x:x[1])[0]
    if 'blue' in pred[0] and pred[1]<=3:
        return pred[0]
    elif pred[1]<=2:
        return pred[0]
    else:
        return 'other'

In [None]:
for rev in tqdm(melon_reviews, desc='shirt_prediction'):
    if rev['thumbnail']:
        image = np.asarray(bytearray(urlopen(rev['thumbnail']).read()),dtype=np.uint8)
        image = cv2.imdecode(image, cv2.IMREAD_COLOR)
        isolated_img = api.get_dress(image,False)
        cv2.imwrite("out.png",isolated_img)
        try:
            rgb =  ColorThief("out.png").get_palette(color_count=2, quality=1)
        except:
            rgb = None
            continue
        rev['shirt'] =  shirtDetect(rgb[0],rgb[1])
    else:
        rev['shirt'] = None

In [None]:
melon_df = pd.DataFrame(melon_reviews)

In [None]:
from matplotlib import pyplot, dates
shirt_df = melon_df[['date','score','shirt']].dropna()
shirt_df['date2'] = dates.datestr2num(shirt_df['date'])
shirt_df['date'] = pd.to_datetime(shirt_df['date'])

shirt_df
# type(shirt_df['date'][0])

In [None]:
import seaborn as sns

g = sns.lmplot(
    data=shirt_df,
    x="date2", y="score", hue="shirt",
    height=5
)
# g