In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

# !unzip '/content/gdrive/My Drive/Movies/posters.zip'
!unzip '/content/gdrive/My Drive/Movies/Earning Prediction/Dataset.zip'

In [2]:
import requests
import time
import re
import os
import pandas as pd
from PIL import Image
import torchvision
from skimage.transform import resize
import numpy as np
from torch.utils.data.sampler import SubsetRandomSampler
import torch
import copy
import shutil
from urllib.request import urlopen
import matplotlib.pyplot as plt



In [12]:
dfwithearnings = pd.read_csv("./gdrive/My Drive/Movies/movies_metadata.csv",encoding='ISO-8859-1')
dfwithoutearnings = pd.read_csv("./gdrive/My Drive/Movies/MovieGenre.csv",encoding='ISO-8859-1')

In [13]:
dfwithearnings = dfwithearnings[dfwithearnings.poster_path.notnull()]
dfwithearnings = dfwithearnings[dfwithearnings.budget != 0]
dfwithearnings = dfwithearnings[dfwithearnings.revenue != 0]
dfwithearnings = dfwithearnings[dfwithearnings.genres.notnull()]
dfwithearnings = dfwithearnings[dfwithearnings.original_title.notnull()]
dfwithearnings = dfwithearnings[dfwithearnings.title.notnull()]
dfwithearnings = dfwithearnings[dfwithearnings.release_date.notnull()]

dfwithearnings['year'] = dfwithearnings['release_date'].str[:4]


dfwithearnings['Title'] = dfwithearnings['title'] + " (" + dfwithearnings['year'] + ")"

dffinal = pd.merge(dfwithearnings, dfwithoutearnings, on="Title")
dffinal = dffinal[['Title', 'budget', 'revenue', 'Genre', 'Poster']]
dffinal['budget'] = pd.to_numeric(dffinal['budget'])
dffinal = dffinal.loc[dffinal['budget']!= 0]
dffinal = pd.DataFrame.reset_index(dffinal)
dffinal = dffinal.drop(['index'], axis = 1)

dffinal['revtobud'] = dffinal['revenue']/dffinal['budget']
dffinal['pe_range'] = np.where(dffinal['revtobud']>=1, 'high-earnings', 'low-earnings')
print(dffinal.shape)

(4849, 7)


In [14]:
directory = 'Dataset/samplemovieposters'
directory2 = 'Dataset/samplemoviepostersprocessed'


print(dffinal.shape)
for i in range(dffinal.shape[0]):
    filename = str(i) + ".jpg"
    address = os.path.join(directory, filename)
    image = Image.open(address)
    image = image.resize((224, 224), Image.BILINEAR)
    if dffinal['pe_range'][i] == "high-earnings":
        newaddress = directory2 + "/high"
        if not os.path.isdir(newaddress):
            os.makedirs(newaddress)
        newaddress = os.path.join(newaddress, filename)
        
        image.save(newaddress)
    else:
        newaddress = directory2 + "/low"
        if not os.path.isdir(newaddress):
            os.makedirs(newaddress)
        newaddress = os.path.join(newaddress, filename)
        image.save(newaddress)
    

(4849, 7)


In [15]:
do_transforms = torchvision.transforms.Compose([
        torchvision.transforms.RandomCrop((280,280), padding=None, pad_if_needed=True, fill=0, padding_mode='constant'),
        torchvision.transforms.Resize((224), interpolation=2),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
])
directory3 = 'Dataset/samplemoviepostersprocessed'
dataset = torchvision.datasets.ImageFolder(directory3, transform=do_transforms)

In [16]:
validation_split = .2
shuffle_dataset = True
batch_size = 64

dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))

if shuffle_dataset:
    np.random.seed(0)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

dataloaders_dict = {
    'train': torch.utils.data.DataLoader(dataset, batch_size=batch_size, 
                                         sampler=train_sampler),
    'val': torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                       sampler=valid_sampler)
}


In [31]:

num_classes = len(dataset.classes)
model = torchvision.models.resnet18(pretrained=True)

num_ftrs = model.fc.in_features
model.fc = torch.nn.Linear(num_ftrs, num_classes)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model.to(device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [38]:
class Trainer(object):
    def __init__(self, model, learning_rate=0.01, early_stopping = 5, patience = 0.005):

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.model = model.to(self.device)

        params_to_update = self.get_parameters()

        self.optimizer = torch.optim.Adam(params_to_update, lr=learning_rate)

        self.criterion = torch.nn.CrossEntropyLoss()

        self.early_stopping = early_stopping

        self.patience = patience

    def _loop(self, data_loader):
        running_loss = 0.0
        running_corrects = 0
        total_data_count = 0

        for X, Y in data_loader:
            inputs = X.to(self.device)
            labels = Y.to(self.device)

            self.optimizer.zero_grad()

            outputs = self.model(inputs)
            running_corrects += torch.sum(self.computes_accuracy(outputs, labels.data))

            loss = self.criterion(outputs, labels)
            running_loss += loss.item() * inputs.size(0)

            total_data_count += len(X)

            loss.backward()
            self.optimizer.step()

        epoch_loss = running_loss / total_data_count
        epoch_acc = running_corrects.double() / total_data_count

        return epoch_loss, epoch_acc

    def computes_accuracy(self, outputs, targets):
        _, preds = outputs.topk(1, 1, True, True)
        preds = preds.t()
        correct = preds.eq(targets.view(1, -1).expand_as(preds))
        correct_k = correct[:1].view(-1).float()
        return correct_k

    def train(self, data_loader):
        self.model.train()
        return self._loop(data_loader)

    def evaluate(self, data_loader):
        running_loss = 0.0
        running_corrects = 0
        total_data_count = 0
        for X, Y in data_loader:
            inputs = X.to(self.device)
            labels = Y.to(self.device)

            outputs = self.model(inputs)
            running_corrects += torch.sum(self.computes_accuracy(outputs, labels.data))

            loss = self.criterion(outputs, labels)
            running_loss += loss.item() * inputs.size(0)

            total_data_count += len(X)
        
        eval_loss = running_loss / total_data_count
        eval_acc = running_corrects.double() / total_data_count

        return eval_loss, eval_acc

    def fit(self, dataloaders_dict, num_epochs=10):
        print("Starting training...")

        early_stopping_counter = self.early_stopping
        time_fit_start = time.time()
        train_losses, test_losses, train_accuracies, test_accuracies = [], [], [], []

        best_epoch_info = {
            'model_wts':copy.deepcopy(self.model.state_dict()),
            'loss':1e10
        }

        for epoch in range(num_epochs):
            time_epoch_start = time.time()

            train_loss, train_acc = self.train(dataloaders_dict['train'])
            val_loss, val_acc = self.evaluate(dataloaders_dict['val'])
            
            train_losses.append(train_loss)
            test_losses.append(val_loss)
            train_accuracies.append(train_acc)
            test_accuracies.append(val_acc)

            current_learning_rate = self.optimizer.param_groups[0]['lr']

            print("Epoch {:2} in {:.0f}s || Train loss={:.3f}, acc={:.3f} | Val loss={:.3f}, acc={:.3f} | LR={}".format(epoch+1, time.time() - time_epoch_start, train_loss, train_acc, val_loss, val_acc, current_learning_rate))

            if val_loss < best_epoch_info['loss']:
                best_epoch_info = {
                    'model_wts':copy.deepcopy(self.model.state_dict()),
                    'loss':val_loss,
                    'epoch':epoch,
                    'metrics':{
                        'train_loss':train_loss,
                        'val_loss':val_loss,
                        'train_acc':train_acc,
                        'val_acc':val_acc
                    }
                }

            if len(test_losses) > 1:
                if val_loss-test_losses[len(test_losses)-2] >= -1*self.patience:
                    early_stopping_counter -= 1
                else:
                    early_stopping_counter = self.early_stopping

            if early_stopping_counter == 0:
                print("Early Stop")
                break

        time_elapsed = time.time() - time_fit_start
        print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
        self.model.load_state_dict(best_epoch_info['model_wts'])

        print('Loaded best epoch : ', best_epoch_info['epoch'])
        return train_losses, test_losses

    def get_parameters(self):
        print("Layers with params to learn:")
        params_to_update = []
        for name, param in self.model.named_parameters():
            if param.requires_grad == True:
                params_to_update.append(param)

        print('\t', len(params_to_update), 'layers')
        return params_to_update




In [39]:
torch.manual_seed(1000)
trainer = Trainer(model)

trainloss, testloss = trainer.fit(dataloaders_dict, num_epochs=20)

Layers with params to learn:
	 62 layers
Starting training...
Epoch  1 in 21s || Train loss=0.604, acc=0.715 | Val loss=0.603, acc=0.710 | LR=0.01
Epoch  2 in 21s || Train loss=0.601, acc=0.715 | Val loss=0.601, acc=0.711 | LR=0.01
Epoch  3 in 21s || Train loss=0.603, acc=0.713 | Val loss=0.607, acc=0.711 | LR=0.01
Epoch  4 in 24s || Train loss=0.602, acc=0.715 | Val loss=0.601, acc=0.711 | LR=0.01
Epoch  5 in 21s || Train loss=0.599, acc=0.715 | Val loss=0.601, acc=0.711 | LR=0.01
Epoch  6 in 21s || Train loss=0.599, acc=0.715 | Val loss=0.605, acc=0.711 | LR=0.01
Epoch  7 in 21s || Train loss=0.600, acc=0.715 | Val loss=0.601, acc=0.711 | LR=0.01
Epoch  8 in 21s || Train loss=0.598, acc=0.715 | Val loss=0.601, acc=0.711 | LR=0.01
Epoch  9 in 22s || Train loss=0.598, acc=0.715 | Val loss=0.602, acc=0.711 | LR=0.01
Early Stop
Training complete in 3m 13s
Loaded best epoch :  1


In [40]:
running_loss = 0.0
running_corrects = 0
total_data_count = 0

def computes_accuracy(outputs, targets, k=1):
        _, preds = outputs.topk(k, 1, True, True)
        preds = preds.t()
        correct = preds.eq(targets.view(1, -1).expand_as(preds))
        correct_k = correct[:k].view(-1).float()
        return correct_k

for X, Y in dataloaders_dict['val']:
            inputs = X.to(device)
            labels = Y.to(device)
            inputs, labels = inputs.cuda(), labels.cuda() 

            outputs = model(inputs)
            acc = torch.sum(computes_accuracy(outputs, labels.data, 1))
            running_corrects += acc
            total_data_count += len(X)
            
ValidationAcc = running_corrects.double() / total_data_count
print("Validation Accuracy : {:.3f}".format(ValidationAcc))

Validation Accuracy : 0.711


In [41]:
print(model)


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [None]:
plt.plot(trainloss)
plt.plot(testloss)
plt.legend(['Training Loss','Validation Loss'])
plt.xticks([2,4,6,8,10,12,14,16,18])
plt.xlabel("epochs")
plt.show()

In [23]:
# count = 0
# i = 0
# for link in dffinal['Poster'].iteritems():
#   if(i < 4840):
#     i+=1
#     continue
#   if(link[1] != 'nan'):
#     save_as = "samplemovieposters/" + str(i) + ".jpg"
#     url = link[1]
#     try:
#         file = urlopen(url)
#         content = file.read()
#         count+=1
#         print(count)
#     except:
#         continue
#     # Save to file
#     with open(save_as, 'wb') as download:
#         download.write(content)
#     i+=1
# print(count)