In [1]:
import urllib.request, json 
import time
import pandas as pd
from PIL import Image
import io
import random
import requests
import numpy as np
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
from IPython.display import clear_output
# clear_output(wait=True)

# Project description
The project is devoted to finding out the correlation between poster colors and movie metrics.  
#### Steps:  
1) Find an open source for movie poster images  
2) Build a function that determines the color distribution of an image, i.e. its 'palette'  
3) Build a dataset with metrics and palettes for each movie  
4) Build a random forest regression model that predicts certain metrics (such as popularity or box office) from the palette, and use its feature importances to see how the concentration of certain colors in the poster relates to the metrics of the movie.  
5) It would also be interesting to observe how the palette varies over time and across genres.

In [2]:
def formMoviesDataFrame(number = 15, idList=None):
    """
    Build a dataframe of TMDB movie records that have a poster.

    Asks for confirmation first (the periodic backup pickle is overwritten),
    then requests the details of the ids in ``idList`` one by one until
    ``number`` movies with a non-empty ``poster_path`` have been collected or
    the id list is exhausted. Ids whose request or parsing fails are skipped.

    Parameters
    ----------
    number : int
        Number of movies (with a poster) to collect.
    idList : sequence, optional
        Movie ids to try, in order. ``None`` is treated as an empty list.

    Returns
    -------
    pandas.DataFrame or None
        One row per collected movie, one column per key of the API response
        (keys missing from a response become NaN). ``None`` is returned when
        the user does not answer 'y'.
    """
    import os  # local import: only used for the optional API-key override below

    cont = input('ATTENTION! Previous backup dataframe will be overwritten. Continue? y/n \n')
    if cont.lower() != 'y':
        print('None is returned')
        return None

    print('Starting...')
    if idList is None:
        idList = []

    # NOTE(review): the key is hardcoded in the notebook. Prefer exporting
    # TMDB_API_KEY and removing the literal fallback before sharing the file.
    apiKey = os.environ.get('TMDB_API_KEY', '67fdef9e2b64f80963b786a34632effd')

    records = []      # one dict per accepted movie; avoids ragged per-key lists
    t1 = time.time()  # start of the current batch of 100 movies

    for movieId in idList:
        if len(records) >= number:
            break

        u = f'https://api.themoviedb.org/3/movie/{movieId}?api_key={apiKey}'
        try:
            with urllib.request.urlopen(u, timeout=30) as url:
                data = json.loads(url.read().decode())
            hasPoster = data['poster_path'] is not None
        except Exception:
            # Best effort: skip ids that cannot be fetched or parsed.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
            continue

        if not hasPoster:
            continue

        records.append(data)
        n = len(records)

        # Progress report and backup every 100 accepted movies. Doing this only
        # right after an append guarantees each milestone is reported once.
        if n % 100 == 0:
            t2 = time.time()
            dur = (t2 - t1) / 100  # seconds spent per accepted movie in this batch
            print(f'{time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(time.time()))} \t {n} of {number} movies.\t Mean time: {dur} s/movie')
            t1 = t2

            # Building the frame takes a long time, so keep a backup on disk.
            try:
                pd.DataFrame(records).to_pickle('backupMoviesDF.pkl')
            except Exception as err:
                print(f'Backup failed: {err}')

    return pd.DataFrame(records)

In [None]:
def getImageFromUrlAsArray(url, draw = True, timeout = 30):
    """
    Download an image and return it as a numpy array.

    Parameters
    ----------
    url : str
        Address of the image.
    draw : bool
        If True, open the downloaded image with ``Image.show()``.
    timeout : float
        Seconds to wait for the server before giving up, so that one stalled
        request cannot hang a long download loop.

    Returns
    -------
    numpy.ndarray
        ``np.asarray`` of the decoded image.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of trying to decode an error page as an image.
    response.raise_for_status()
    image = Image.open(io.BytesIO(response.content))
    if draw:
        image.show()
    return np.asarray(image)

In [None]:
def drawSq(col):
    """Show a unit square filled with the color `col` (channel values are divided by 255)."""
    side = 1
    fillColor = np.array(col)/255
    axes = plt.figure().add_subplot(111, aspect='equal')
    square = patches.Rectangle((0, 0), side, side, color=fillColor)
    axes.add_patch(square)
    plt.show()

def draw2Sq(col1,col2):
    """Show two unit squares side by side, filled with `col1` (left) and `col2` (right)."""
    side = 1
    axes = plt.figure().add_subplot(111, aspect='equal')
    # Square number `offset` starts at x = offset, so the two squares touch.
    for offset, col in enumerate((col1, col2)):
        axes.add_patch(
            patches.Rectangle((offset, 0), side, side, color=np.array(col)/255))
    plt.xlim([0,2])
    plt.show()

def drawPalette(colList,freqList, barchart = True, log = False):
    """
    Plot a palette either as a bar chart or as a row of colored rectangles.

    Parameters
    ----------
    colList : array-like of shape (n, 3)
        Colors with channel values on the 0-255 scale; lists and arrays are both accepted.
    freqList : sequence of float
        One value per color, used as bar / rectangle height.
    barchart : bool
        True draws a bar chart labelled with the color values; False draws
        rectangles of width 1 next to each other.
    log : bool
        Use a logarithmic y axis (bar chart only).
    """
    if barchart:
        # Convert first so that a plain list of colors works here as well,
        # the same way the rectangle branch below already handles it.
        barColors = np.asarray(colList)/255
        plt.bar([str(i) for i in colList], height=freqList, color=barColors, edgecolor='black')
        plt.xticks(rotation=90)
        if log:
            plt.yscale('log')
        plt.title('Color Frequency Chart')
    else:
        width = 1
        fig1 = plt.figure()
        ax1 = fig1.add_subplot(111, aspect='equal')
        for i,c in enumerate(colList):
            height = freqList[i]
            ax1.add_patch(patches.Rectangle((i, 0), width, height, color=np.array(c)/255))

        plt.xlim([0,len(colList)])
        plt.title(f'Top {len(colList)} colors (descending)')

    plt.show()

def spec(N):
    """
    Return N RGB colors running from red through green to blue.

    Taken from stackoverflow:
    https://stackoverflow.com/questions/50980810/how-to-create-a-discrete-rgb-colourmap-with-n-colours-using-numpy

    The result is a float array of shape (N, 3) with values clipped to 0-255.
    """
    ramp = np.linspace(-510, 510, N)
    red = -ramp                   # strongest at the start of the ramp
    green = 510 - np.abs(ramp)    # peaks in the middle
    blue = ramp                   # strongest at the end
    channels = np.stack([red, green, blue], axis=1)
    return np.round(np.clip(channels, 0, 255))

#### Here I try a bruteforce approach with iterating over pixels

In [None]:
def findClosestColor(col, rgbRange = 32):
    """
    Return the palette color nearest to `col` in Euclidean RGB distance.

    The palette is spec(rgbRange) followed by black and white.
    """
    palette = np.append(spec(rgbRange), [[0, 0, 0],[255,255,255]], axis=0)
    pixel = np.array(col)
    distances = [np.linalg.norm(pixel-candidate) for candidate in palette]
    return palette[np.argmin(distances)]

def convertColorsBruteIterating(originalImageArray, rgbRange = 32, draw = True):
    """
    Map every pixel to its closest palette color by iterating over the pixels.

    This is the deliberately naive reference implementation; it calls
    findClosestColor once per pixel.

    Parameters
    ----------
    originalImageArray : numpy.ndarray
        Image array indexed as [row, column, channel] with 3 channels.
    rgbRange : int
        Number of spectrum colors in the palette (black and white are added on top).
    draw : bool
        If True, show the original and the converted image side by side.

    Returns
    -------
    (colorList, freqList)
        colorList is a float array of shape (k, 3) with the colors that occur,
        most frequent first; freqList holds the matching pixel counts as floats.
    """
    convertedImage = np.zeros(originalImageArray.shape)
    for i,r in enumerate(originalImageArray):
        for j,c in enumerate(r):
            # Pass rgbRange on; without it the default palette size was always used.
            convertedImage[i,j]=np.array(findClosestColor(c, rgbRange))
    if draw:
        Image.fromarray(np.hstack((np.array(originalImageArray),np.array(convertedImage.astype(np.uint8))))).show()

    unique, counts = np.unique(convertedImage.reshape(-1,convertedImage.shape[2]),
                               return_counts=True, axis=0)

    # Most frequent first; the stable sort keeps np.unique's order for equal counts.
    order = np.argsort(-counts, kind='stable')[:rgbRange+2]
    colorList = unique[order].astype(float).reshape(-1,3)
    freqList = counts[order].astype(float)

    return colorList, freqList

In [None]:
%%timeit
clrs = convertColorsBruteIterating(getImageFromUrlAsArray('https://image.tmdb.org/t/p/w500'+'/6oom5QYQ2yQTMJIbnvbkBL9cHo6.jpg',
                                    draw = False),
             rgbRange=12)

In [None]:
# Bar chart of the palette computed above: clrs[0] are the colors, clrs[1] their values.
drawPalette(clrs[0],clrs[1])

##### Conclusion: 
Besides the fact that my notebook sounded like a jetliner at take-off during the execution, the time it took is really discouraging. Therefore I went for vectorization. 

#### Here I try to achieve the same result by means of vector operations to increase the efficiency

In [None]:
def convertColors(originalImageArray, rgbRange = 32, draw = True, top = True):
    """
    Map every pixel to its closest palette color using vectorized numpy operations.

    The palette is black, white and then spec(rgbRange); closeness is the
    Euclidean distance in RGB.

    Parameters
    ----------
    originalImageArray : numpy.ndarray
        Image array indexed as [row, column, channel] with 3 channels.
    rgbRange : int
        Number of spectrum colors in the palette (black and white are added on top).
    draw : bool
        If True, show the original and the converted image side by side.
    top : bool
        True: return only the colors that occur, most frequent first, with
        their pixel counts. False: return the whole palette in palette order
        with the share of pixels (0-1) of each color.

    Returns
    -------
    (colorList, freqList)
        colorList is a float array of shape (k, 3); freqList is a float array
        of length k (counts if `top`, otherwise fractions of all pixels).
    """
    # Palette with black and white in the beginning.
    clist = np.append([[0, 0, 0],[255,255,255]],spec(rgbRange), axis=0)
    nRows, nCols = originalImageArray.shape[0:2]
    nPixels = int(nRows*nCols)

    # (nPixels, 1, 3) against (1, nColors, 3) broadcasts to every pixel/color pair.
    # clist is a float array, so the subtraction cannot wrap around for uint8 images.
    pixels = originalImageArray.reshape(nPixels,1,3)
    sqDist = np.sum((clist[np.newaxis,:,:] - pixels) ** 2, axis = 2)
    # The square root is monotonic, so the nearest color can be picked from the
    # squared distances; argmin along the color axis replaces the per-pixel Python loop.
    inpArray = np.argmin(sqDist, axis = 1).reshape(nRows, nCols)

    if draw:
        # Palette index -> RGB value, then original and converted image side by side.
        convertedImageArray = clist[inpArray].astype(np.uint8)
        Image.fromarray(np.hstack((np.array(originalImageArray),convertedImageArray))).show()

    # Number of pixels assigned to each palette entry (0 for unused entries).
    counts = np.bincount(inpArray.ravel(), minlength=len(clist))

    if top:
        present = np.flatnonzero(counts)
        # Most frequent first; the stable sort keeps palette order for equal counts.
        order = present[np.argsort(-counts[present], kind='stable')]
        colorList = clist[order]
        freqList = counts[order].astype(float)
    else:
        colorList = clist.copy()
        freqList = counts/nPixels
    return colorList, freqList

In [None]:

# Same poster as in the brute-force run, now through the vectorized convertColors.
# top=False returns the whole palette together with the share of pixels per color.
clrs = convertColors(getImageFromUrlAsArray('https://image.tmdb.org/t/p/w500'+'/6oom5QYQ2yQTMJIbnvbkBL9cHo6.jpg',
                                    draw = False),
             rgbRange=12, top = False)

In [None]:
# Inspect the (colorList, freqList) pair returned by convertColors.
clrs

In [None]:
# Bar chart of the palette shares on a logarithmic y axis.
drawPalette(clrs[0],clrs[1], log = True)

#### Creating columns with color data of each poster

In [3]:
# My first approach was to just iterate through all ids and get info on the existing ones, but it took too long.
# So I got a list of valid IDs which I will iterate through.
MOVIE_ID_SEED = 42  # fixed seed so that the same sample of movies is drawn on every run

movieIDs = []
# The file holds one JSON object per line; JSON text is UTF-8.
with open('movie_ids_12_09_2020.json', mode='r', encoding='utf-8') as jsonfile:
    for line in jsonfile:
        if line.strip():  # tolerate blank lines, e.g. at the end of the file
            movieIDs.append(json.loads(line)['id'])

# Shuffle because the ids were ordered; a dedicated generator keeps it reproducible.
random.Random(MOVIE_ID_SEED).shuffle(movieIDs)
len(movieIDs)


565427

In [4]:
# Collect 100000 movies with a poster from the shuffled id list (asks for confirmation first).
df = formMoviesDataFrame(100000,movieIDs)

ATTENTION! Previous backup dataframe will be overwritten. Continue? y/n 
y
Starting...
Sun, 07 Mar 2021 21:56:21 	 100 of 100000 movies.	 Mean rate: 0.5267125415802002 movies/s
Sun, 07 Mar 2021 21:57:18 	 200 of 100000 movies.	 Mean rate: 0.5730008387565613 movies/s
Sun, 07 Mar 2021 21:58:16 	 300 of 100000 movies.	 Mean rate: 0.5747111296653747 movies/s
Sun, 07 Mar 2021 21:59:08 	 400 of 100000 movies.	 Mean rate: 0.5224041604995727 movies/s
Sun, 07 Mar 2021 22:00:07 	 500 of 100000 movies.	 Mean rate: 0.5902126502990722 movies/s
Sun, 07 Mar 2021 22:01:04 	 600 of 100000 movies.	 Mean rate: 0.5677782201766968 movies/s
Sun, 07 Mar 2021 22:01:59 	 700 of 100000 movies.	 Mean rate: 0.5509559774398803 movies/s
Sun, 07 Mar 2021 22:02:59 	 800 of 100000 movies.	 Mean rate: 0.6039493608474732 movies/s
Sun, 07 Mar 2021 22:03:53 	 900 of 100000 movies.	 Mean rate: 0.537384340763092 movies/s
Sun, 07 Mar 2021 22:04:43 	 1000 of 100000 movies.	 Mean rate: 0.49593047142028807 movies/s
Sun, 07 Mar 

Sun, 07 Mar 2021 23:24:59 	 9100 of 100000 movies.	 Mean rate: 0.580925612449646 movies/s
Sun, 07 Mar 2021 23:25:54 	 9200 of 100000 movies.	 Mean rate: 0.5517202472686767 movies/s
Sun, 07 Mar 2021 23:27:03 	 9300 of 100000 movies.	 Mean rate: 0.6891401600837708 movies/s
Sun, 07 Mar 2021 23:28:04 	 9400 of 100000 movies.	 Mean rate: 0.6094895935058594 movies/s
Sun, 07 Mar 2021 23:29:06 	 9500 of 100000 movies.	 Mean rate: 0.6209962487220764 movies/s
Sun, 07 Mar 2021 23:30:07 	 9600 of 100000 movies.	 Mean rate: 0.6087624788284302 movies/s
Sun, 07 Mar 2021 23:31:01 	 9700 of 100000 movies.	 Mean rate: 0.5360727691650391 movies/s
Sun, 07 Mar 2021 23:31:56 	 9800 of 100000 movies.	 Mean rate: 0.5509601402282714 movies/s
Sun, 07 Mar 2021 23:32:52 	 9900 of 100000 movies.	 Mean rate: 0.5613738203048706 movies/s
Sun, 07 Mar 2021 23:33:50 	 10000 of 100000 movies.	 Mean rate: 0.5866435813903809 movies/s
Sun, 07 Mar 2021 23:34:44 	 10100 of 100000 movies.	 Mean rate: 0.5406339693069458 movies/

Mon, 08 Mar 2021 00:51:49 	 18100 of 100000 movies.	 Mean rate: 0.5605201411247254 movies/s
Mon, 08 Mar 2021 00:52:52 	 18200 of 100000 movies.	 Mean rate: 0.6344971799850464 movies/s
Mon, 08 Mar 2021 00:53:47 	 18300 of 100000 movies.	 Mean rate: 0.5460023689270019 movies/s
Mon, 08 Mar 2021 00:54:41 	 18400 of 100000 movies.	 Mean rate: 0.544590220451355 movies/s
Mon, 08 Mar 2021 00:55:34 	 18500 of 100000 movies.	 Mean rate: 0.53104327917099 movies/s
Mon, 08 Mar 2021 00:56:30 	 18600 of 100000 movies.	 Mean rate: 0.5592932295799256 movies/s
Mon, 08 Mar 2021 00:57:24 	 18700 of 100000 movies.	 Mean rate: 0.5341511011123657 movies/s
Mon, 08 Mar 2021 00:58:16 	 18800 of 100000 movies.	 Mean rate: 0.5252727389335632 movies/s
Mon, 08 Mar 2021 00:59:16 	 18900 of 100000 movies.	 Mean rate: 0.5923799109458924 movies/s
Mon, 08 Mar 2021 01:00:01 	 19000 of 100000 movies.	 Mean rate: 0.45772172927856447 movies/s
Mon, 08 Mar 2021 01:00:52 	 19100 of 100000 movies.	 Mean rate: 0.5042251706123352

Mon, 08 Mar 2021 02:18:06 	 27100 of 100000 movies.	 Mean rate: 0.5323902320861816 movies/s
Mon, 08 Mar 2021 02:18:59 	 27200 of 100000 movies.	 Mean rate: 0.53499764919281 movies/s
Mon, 08 Mar 2021 02:19:53 	 27300 of 100000 movies.	 Mean rate: 0.5380060076713562 movies/s
Mon, 08 Mar 2021 02:20:47 	 27400 of 100000 movies.	 Mean rate: 0.5388406729698181 movies/s
Mon, 08 Mar 2021 02:21:41 	 27500 of 100000 movies.	 Mean rate: 0.5438534188270568 movies/s
Mon, 08 Mar 2021 02:22:38 	 27600 of 100000 movies.	 Mean rate: 0.5695740103721618 movies/s
Mon, 08 Mar 2021 02:23:34 	 27700 of 100000 movies.	 Mean rate: 0.5567829823493957 movies/s
Mon, 08 Mar 2021 02:24:28 	 27800 of 100000 movies.	 Mean rate: 0.5413618564605713 movies/s
Mon, 08 Mar 2021 02:25:23 	 27900 of 100000 movies.	 Mean rate: 0.5428965830802918 movies/s
Mon, 08 Mar 2021 02:26:15 	 28000 of 100000 movies.	 Mean rate: 0.5226848983764648 movies/s
Mon, 08 Mar 2021 02:27:06 	 28100 of 100000 movies.	 Mean rate: 0.5075862097740174

Mon, 08 Mar 2021 03:40:50 	 36100 of 100000 movies.	 Mean rate: 0.4999851632118225 movies/s
Mon, 08 Mar 2021 03:41:46 	 36200 of 100000 movies.	 Mean rate: 0.5534702587127686 movies/s
Mon, 08 Mar 2021 03:42:38 	 36300 of 100000 movies.	 Mean rate: 0.5234544992446899 movies/s
Mon, 08 Mar 2021 03:43:30 	 36400 of 100000 movies.	 Mean rate: 0.5186148691177368 movies/s
Mon, 08 Mar 2021 03:44:25 	 36500 of 100000 movies.	 Mean rate: 0.5482040905952453 movies/s
Mon, 08 Mar 2021 03:45:25 	 36600 of 100000 movies.	 Mean rate: 0.6056650614738465 movies/s
Mon, 08 Mar 2021 03:46:18 	 36700 of 100000 movies.	 Mean rate: 0.5268326497077942 movies/s
Mon, 08 Mar 2021 03:47:08 	 36800 of 100000 movies.	 Mean rate: 0.5017489981651306 movies/s
Mon, 08 Mar 2021 03:48:01 	 36900 of 100000 movies.	 Mean rate: 0.5234108424186706 movies/s
Mon, 08 Mar 2021 03:48:53 	 37000 of 100000 movies.	 Mean rate: 0.5250655174255371 movies/s
Mon, 08 Mar 2021 03:49:54 	 37100 of 100000 movies.	 Mean rate: 0.60477051258087

Mon, 08 Mar 2021 05:03:34 	 45100 of 100000 movies.	 Mean rate: 0.501635959148407 movies/s
Mon, 08 Mar 2021 05:04:27 	 45200 of 100000 movies.	 Mean rate: 0.5332727694511413 movies/s
Mon, 08 Mar 2021 05:05:22 	 45300 of 100000 movies.	 Mean rate: 0.5487330985069275 movies/s
Mon, 08 Mar 2021 05:06:17 	 45400 of 100000 movies.	 Mean rate: 0.5482941818237305 movies/s
Mon, 08 Mar 2021 05:07:12 	 45500 of 100000 movies.	 Mean rate: 0.5584123611450196 movies/s
Mon, 08 Mar 2021 05:08:02 	 45600 of 100000 movies.	 Mean rate: 0.49329782009124756 movies/s
Mon, 08 Mar 2021 05:08:55 	 45700 of 100000 movies.	 Mean rate: 0.5302093601226807 movies/s
Mon, 08 Mar 2021 05:09:50 	 45800 of 100000 movies.	 Mean rate: 0.5486662769317627 movies/s
Mon, 08 Mar 2021 05:10:39 	 45900 of 100000 movies.	 Mean rate: 0.4949071216583252 movies/s
Mon, 08 Mar 2021 05:11:29 	 46000 of 100000 movies.	 Mean rate: 0.502852029800415 movies/s
Mon, 08 Mar 2021 05:12:22 	 46100 of 100000 movies.	 Mean rate: 0.525804579257965

Mon, 08 Mar 2021 06:24:00 	 54100 of 100000 movies.	 Mean rate: 0.557679648399353 movies/s
Mon, 08 Mar 2021 06:24:54 	 54200 of 100000 movies.	 Mean rate: 0.5363574004173279 movies/s
Mon, 08 Mar 2021 06:25:46 	 54300 of 100000 movies.	 Mean rate: 0.5180382204055786 movies/s
Mon, 08 Mar 2021 06:26:37 	 54400 of 100000 movies.	 Mean rate: 0.5157427120208741 movies/s
Mon, 08 Mar 2021 06:27:29 	 54500 of 100000 movies.	 Mean rate: 0.5175682997703552 movies/s
Mon, 08 Mar 2021 06:28:24 	 54600 of 100000 movies.	 Mean rate: 0.5535938787460327 movies/s
Mon, 08 Mar 2021 06:29:19 	 54700 of 100000 movies.	 Mean rate: 0.5453810977935791 movies/s
Mon, 08 Mar 2021 06:30:13 	 54800 of 100000 movies.	 Mean rate: 0.5368404197692871 movies/s
Mon, 08 Mar 2021 06:31:05 	 54900 of 100000 movies.	 Mean rate: 0.5224650621414184 movies/s
Mon, 08 Mar 2021 06:32:00 	 55000 of 100000 movies.	 Mean rate: 0.5504302406311035 movies/s
Mon, 08 Mar 2021 06:32:57 	 55100 of 100000 movies.	 Mean rate: 0.567999038696289

Mon, 08 Mar 2021 07:46:34 	 63100 of 100000 movies.	 Mean rate: 0.5812328696250916 movies/s
Mon, 08 Mar 2021 07:47:26 	 63200 of 100000 movies.	 Mean rate: 0.528765070438385 movies/s
Mon, 08 Mar 2021 07:48:19 	 63300 of 100000 movies.	 Mean rate: 0.5261749100685119 movies/s
Mon, 08 Mar 2021 07:49:16 	 63400 of 100000 movies.	 Mean rate: 0.567022578716278 movies/s
Mon, 08 Mar 2021 07:50:09 	 63500 of 100000 movies.	 Mean rate: 0.5348074316978455 movies/s
Mon, 08 Mar 2021 07:51:01 	 63600 of 100000 movies.	 Mean rate: 0.5172892379760742 movies/s
Mon, 08 Mar 2021 07:51:56 	 63700 of 100000 movies.	 Mean rate: 0.5518047213554382 movies/s
Mon, 08 Mar 2021 07:52:46 	 63800 of 100000 movies.	 Mean rate: 0.4965796995162964 movies/s
Mon, 08 Mar 2021 07:53:42 	 63900 of 100000 movies.	 Mean rate: 0.5638781118392945 movies/s
Mon, 08 Mar 2021 07:54:35 	 64000 of 100000 movies.	 Mean rate: 0.525159809589386 movies/s
Mon, 08 Mar 2021 07:55:28 	 64100 of 100000 movies.	 Mean rate: 0.5300112986564636 

Mon, 08 Mar 2021 09:10:32 	 72100 of 100000 movies.	 Mean rate: 0.5465254926681519 movies/s
Mon, 08 Mar 2021 09:11:28 	 72200 of 100000 movies.	 Mean rate: 0.5538169074058533 movies/s
Mon, 08 Mar 2021 09:12:24 	 72300 of 100000 movies.	 Mean rate: 0.5598899507522583 movies/s
Mon, 08 Mar 2021 09:13:24 	 72400 of 100000 movies.	 Mean rate: 0.6050969505310059 movies/s
Mon, 08 Mar 2021 09:14:22 	 72500 of 100000 movies.	 Mean rate: 0.5766208291053772 movies/s
Mon, 08 Mar 2021 09:15:25 	 72600 of 100000 movies.	 Mean rate: 0.6267138504981995 movies/s
Mon, 08 Mar 2021 09:16:22 	 72700 of 100000 movies.	 Mean rate: 0.5778709006309509 movies/s
Mon, 08 Mar 2021 09:17:23 	 72800 of 100000 movies.	 Mean rate: 0.6090409183502197 movies/s
Mon, 08 Mar 2021 09:18:15 	 72900 of 100000 movies.	 Mean rate: 0.5207659697532654 movies/s
Mon, 08 Mar 2021 09:19:10 	 73000 of 100000 movies.	 Mean rate: 0.5460250496864318 movies/s
Mon, 08 Mar 2021 09:20:07 	 73100 of 100000 movies.	 Mean rate: 0.56895758152008

Mon, 08 Mar 2021 10:36:27 	 81100 of 100000 movies.	 Mean rate: 0.5615082788467407 movies/s
Mon, 08 Mar 2021 10:37:24 	 81200 of 100000 movies.	 Mean rate: 0.5674645495414734 movies/s
Mon, 08 Mar 2021 10:38:18 	 81300 of 100000 movies.	 Mean rate: 0.5414919233322144 movies/s
Mon, 08 Mar 2021 10:39:20 	 81400 of 100000 movies.	 Mean rate: 0.6139683675765991 movies/s
Mon, 08 Mar 2021 10:40:12 	 81500 of 100000 movies.	 Mean rate: 0.527199878692627 movies/s
Mon, 08 Mar 2021 10:41:10 	 81600 of 100000 movies.	 Mean rate: 0.5797043204307556 movies/s
Mon, 08 Mar 2021 10:42:05 	 81700 of 100000 movies.	 Mean rate: 0.5422301816940308 movies/s
Mon, 08 Mar 2021 10:43:02 	 81800 of 100000 movies.	 Mean rate: 0.5739955091476441 movies/s
Mon, 08 Mar 2021 10:44:01 	 81900 of 100000 movies.	 Mean rate: 0.588073799610138 movies/s
Mon, 08 Mar 2021 10:44:58 	 82000 of 100000 movies.	 Mean rate: 0.575215892791748 movies/s
Mon, 08 Mar 2021 10:45:55 	 82100 of 100000 movies.	 Mean rate: 0.5675356388092041 

Mon, 08 Mar 2021 12:03:39 	 90100 of 100000 movies.	 Mean rate: 0.5670547199249267 movies/s
Mon, 08 Mar 2021 12:04:39 	 90200 of 100000 movies.	 Mean rate: 0.5984208798408508 movies/s
Mon, 08 Mar 2021 12:05:37 	 90300 of 100000 movies.	 Mean rate: 0.5791932392120361 movies/s
Mon, 08 Mar 2021 12:06:36 	 90400 of 100000 movies.	 Mean rate: 0.5891470408439636 movies/s
Mon, 08 Mar 2021 12:07:42 	 90500 of 100000 movies.	 Mean rate: 0.660291669368744 movies/s
Mon, 08 Mar 2021 12:08:38 	 90600 of 100000 movies.	 Mean rate: 0.563819580078125 movies/s
Mon, 08 Mar 2021 12:09:42 	 90700 of 100000 movies.	 Mean rate: 0.6336286282539367 movies/s
Mon, 08 Mar 2021 12:10:45 	 90800 of 100000 movies.	 Mean rate: 0.6344290018081665 movies/s
Mon, 08 Mar 2021 12:11:50 	 90900 of 100000 movies.	 Mean rate: 0.650306429862976 movies/s
Mon, 08 Mar 2021 12:12:54 	 91000 of 100000 movies.	 Mean rate: 0.6391475582122803 movies/s
Mon, 08 Mar 2021 12:13:46 	 91100 of 100000 movies.	 Mean rate: 0.5209438538551331 

Mon, 08 Mar 2021 13:32:30 	 99100 of 100000 movies.	 Mean rate: 0.5507662415504455 movies/s
Mon, 08 Mar 2021 13:33:31 	 99200 of 100000 movies.	 Mean rate: 0.61700847864151 movies/s
Mon, 08 Mar 2021 13:34:29 	 99300 of 100000 movies.	 Mean rate: 0.5739913296699524 movies/s
Mon, 08 Mar 2021 13:35:24 	 99400 of 100000 movies.	 Mean rate: 0.5485168814659118 movies/s
Mon, 08 Mar 2021 13:36:27 	 99500 of 100000 movies.	 Mean rate: 0.6371235704421997 movies/s
Mon, 08 Mar 2021 13:37:22 	 99600 of 100000 movies.	 Mean rate: 0.5494974303245544 movies/s
Mon, 08 Mar 2021 13:38:22 	 99700 of 100000 movies.	 Mean rate: 0.6003619480133057 movies/s
Mon, 08 Mar 2021 13:39:20 	 99800 of 100000 movies.	 Mean rate: 0.5744800519943237 movies/s
Mon, 08 Mar 2021 13:40:17 	 99900 of 100000 movies.	 Mean rate: 0.5766168689727783 movies/s
Mon, 08 Mar 2021 13:41:19 	 100000 of 100000 movies.	 Mean rate: 0.613615620136261 movies/s


In [None]:
# NOTE(review): `backupDct` is not defined in any visible cell, so this raises
# NameError on a fresh kernel. TODO confirm whether the cell can be removed.
backupDct

In [None]:
# Look at the last collected rows.
df.tail()

In [None]:
# Persist the collected movie data so the long download does not have to be repeated.
df.to_pickle('dfMoves.pkl')

Loading previously saved dataset of movies.

In [None]:
# Reload the frame saved above. Only unpickle files you created yourself:
# loading a pickle can execute arbitrary code.
df = pd.read_pickle('dfMoves.pkl')

I decided not to save all posters in the dataframe, as that would obviously produce an enormous dataset that is hard to work with. Therefore I only add the color distributions, obtained by applying the convertColors function.

In [None]:
# Compute the palette shares of every poster. Failed downloads/conversions are
# stored as NaN so that clrsList stays aligned with the rows of df.
clrsList = []
nPosters = len(df['poster_path'])
for i,posterUrl in enumerate(df['poster_path']):
    clear_output(wait=True)
    print(f"Current url: {posterUrl}. {i+1} from {nPosters}")
    try:
        posterArray = getImageFromUrlAsArray('https://image.tmdb.org/t/p/w500'+posterUrl,
                                             draw = False)
        # Show the side-by-side comparison for every 250th poster as a visual check.
        clrs = convertColors(posterArray, rgbRange=10, top=False, draw = (i%250 == 0))
    except Exception:
        # `Exception` instead of a bare except, so the loop can still be interrupted.
        clrs = np.nan
    clrsList.append(clrs)

In [None]:
# Spot-check one entry of the per-poster results.
clrsList[20]

In [None]:
# Split every (colorList, freqList) pair into two columns; failed posters (NaN
# in clrsList) stay NaN in both. Building the columns straight from clrsList
# avoids running the slow apply(pd.Series) twice and gives the same result when
# the cell is run again.
df['colorData'] = pd.Series([c[0] if isinstance(c, tuple) else np.nan for c in clrsList],
                            index=df.index, dtype=object)
df['colorFreq'] = pd.Series([c[1] if isinstance(c, tuple) else np.nan for c in clrsList],
                            index=df.index, dtype=object)
df

In [None]:
# Save the frame including the color columns.
df.to_pickle('moviesDfColors.pkl')

In [None]:
# Every successfully processed poster carries the same palette (top=False
# returns the whole palette), so take it from the first row that has one.
# Row 0 itself may be NaN if its poster could not be processed.
colorNames = df['colorData'].dropna().iloc[0]
colorNames = [tuple(color) for color in colorNames]
colorNames

In [None]:
# Expand the per-poster frequency arrays into one column per palette color
# and label the columns with the color tuples.
dfColors = df['colorFreq'].apply(pd.Series)
dfColors.columns = colorNames
dfColors

In [None]:
# Append the color columns to the right of the movie data.
# NOTE(review): running this cell twice appends the color columns twice.
df = pd.concat([df, dfColors], axis=1)

In [None]:
# Modelling frame: the helper columns are removed and only rows without any
# missing value are kept.
# NOTE(review): dropna() looks at every remaining column, so a movie is dropped
# as soon as any field is missing. TODO confirm this is intended.
df2 = df.drop(columns=['colorData','colorFreq','poster_path']).dropna()

In [None]:
# Display the modelling frame.
df2

In [None]:
# Correlate only numeric/boolean columns: corr() no longer silently drops text
# columns in pandas 2 and raises on them instead.
sns.heatmap(df2.select_dtypes(include=['number', 'bool']).corr())

In [None]:
sns.set(rc={'figure.figsize':(16,8)})
# Correlation of every numeric column with popularity, computed once.
popularityCorr = df2.select_dtypes(include=['number', 'bool']).corr()['popularity']
# The color columns were appended last, so they are the final len(colorNames)
# entries. Selecting exactly those keeps the bars aligned with the palette.
colorCorr = popularityCorr.iloc[-len(colorNames):]
ax = sns.barplot(x = colorCorr.index,
            y = colorCorr.values,
           palette=np.array(colorNames)/255)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
# Features are the palette shares. The color columns were appended last, so
# take the final len(colorNames) columns rather than a fixed position; this
# keeps X aligned with colorNames when the feature importances are read.
X = df2.iloc[:,-len(colorNames):]
y = df2['popularity']

In [None]:
# Number of movies available for modelling.
y.shape

In [None]:
# Fixed random_state for the split and for the forest so that the scores and
# feature importances below are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 42)
rfr = RandomForestRegressor(random_state = 42)
rfr.fit(X_train,y_train)

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation scores of the random forest on the training split.
cross_val_score(rfr, X_train,y_train, cv = 5)

In [None]:
# Keep the colors whose feature importance is above the mean importance.
meanImportance = np.mean(rfr.feature_importances_)
importantColors = []
for col, imp in zip(colorNames,rfr.feature_importances_):
    if not imp>meanImportance:
        continue
    print(col,'\t:\t',imp)
    importantColors.append(col)

In [None]:
y_pred = rfr.predict(X_test)
from sklearn.metrics import mean_squared_error
# Root mean squared error on the test split. The square root is taken
# explicitly because the `squared=False` argument was removed from newer
# scikit-learn releases.
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Summary statistics of the modelling frame, for putting the error above into context.
df2.describe()