In [None]:
!pip install scikit-multilearn

In [None]:
import torch
from typing import Tuple, Dict
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_train_test_split
import pandas as pd
import csv
import json
from matplotlib import pyplot as plt
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

In [None]:
# Show up to 300 characters per cell so long plot summaries stay readable in previews.
pd.set_option('display.max_colwidth', 300)

In [None]:
# class MultiLabelClassifier:

#     def __init__(self, X, Y, layer_dims, n_iterations = 100, learning_rate = 0.1, print_cost = False, draw_cost = False):
#         self.X = X
#         self.Y = Y
#         self.layer_dims = layer_dims
#         self.n_iterations = n_iterations
#         self.learning_rate = learning_rate
#         self.print_cost = print_cost
#         self.draw_cost = draw_cost
#         self.n_layers = len(layer_dims)
#         self.params = {}
#         self.grads = {}
    
#     def initialize_params(self) -> Dict:
#         ''' 
#         Initialize the NN according to layer dimensions

#         Returns
#         -------
#         parameters: ``Dict``
#             Dictionary containing values for weights and biases for all connections
#         '''
#         for l in range(1, self.n_layers):
#             self.params[f'w{l}'] = torch.rand((self.layer_dims[l], self.layer_dims[l-1])) * 0.01
#             self.params[f'b{l}'] = torch.zeros(self.layer_dims[l])
       

#     def forward_prop(self, AL):
#         '''
#         Carry out forward propagation for one training sample
#         '''

#         w = self.param['w']
#         z = torch.mm(AL, w) + b
#         return z
         

#     def backward_prop():
#         '''
#         Carry out backward propagation for one training sample
#         '''
#         pass

#     def split_data(self, test_size=0.2):
#         return train_test_split(self.X, self.Y, test_size)

#     def meta():
#         '''
#         Train the model by carrying out forward and backward propagation for all training samples
#         '''
#         pass

#     def predict():
#         '''
#         Make a prediction for given input data
#         '''

In [None]:
# Load the tab-separated movie metadata. The file has no header row (header=None),
# so pandas labels the columns with integers starting at 0.
meta = pd.read_csv("../../datasets/MovieSummaries/movie.metadata.tsv", sep="\t", header=None)

In [None]:
# Preview the first rows of the raw metadata table.
meta.head()

In [None]:
# Keep only the movie id (raw column 0) and the genre mapping (raw column 8).
# The file is read with header=None, so the raw columns carry integer labels.
# - rename() ignores labels that no longer exist, so re-running this cell is safe
#   (assigning a 9-name list to `meta.columns` would fail on the second run).
# - astype() returns a new frame, avoiding the SettingWithCopyWarning that
#   assigning into a column slice triggers.
meta = (
    meta
    .rename(columns={0: "movie_id", 8: "genre"})
    .loc[:, ['movie_id', 'genre']]
    .astype({'movie_id': str})  # ids are compared with string ids read from the plot file
)
meta.head()

In [None]:
# Read the plot summaries; the code below uses field 0 as the movie id and
# field 1 as the plot text.
# - encoding is pinned so parsing does not depend on the platform default
#   (assumes the file is UTF-8 - TODO confirm).
# - newline='' is what the csv module asks for when it does its own line handling.
# NOTE(review): the 'excel-tab' dialect treats '"' as a quote character; if plots
# can start with a double quote, csv.QUOTE_NONE may be the safer setting.
with open("../../datasets/MovieSummaries/plot_summaries.txt", 'r', encoding='utf-8', newline='') as f:
    # Skip blank or truncated lines instead of failing with IndexError below.
    rows = [row for row in csv.reader(f, dialect='excel-tab') if len(row) >= 2]

movie_ids = [row[0] for row in rows]
plots = [row[1] for row in rows]

In [None]:
# One row per plot summary: the movie id (as read from the file, i.e. a string) and its plot text.
movies = pd.DataFrame({"movie_id": movie_ids, "plot": plots})

In [None]:
# Attach the genre column to every plot via the shared movie id
# (default inner join: movies missing from either table are dropped).
movies = movies.merge(meta, on="movie_id")
movies.head()

In [None]:
# Each raw genre cell is a JSON object; only its values (the genre names) are kept.
cleaned_genres = [list(json.loads(raw_genre).values()) for raw_genre in movies['genre']]

movies['cleaned_genre'] = cleaned_genres

In [None]:
# Keep only movies that have at least one genre label.
has_genre = movies['cleaned_genre'].str.len() != 0
movies = movies[has_genre]

In [None]:
# Number of rows and columns left after removing movies without genres.
movies.shape

In [None]:
# The raw JSON genre column is no longer needed once 'cleaned_genre' exists.
movies = movies.drop(columns='genre')
movies.head()

In [None]:
# Count how often each genre label occurs across all movies.
genre_freq = {}
for genre_list in movies['cleaned_genre']:
    for label in genre_list:
        genre_freq[label] = genre_freq.get(label, 0) + 1

# dicts keep insertion order, so genres are listed in order of first appearance.
all_genres = list(genre_freq)

In [None]:
# Number of distinct genre labels in the data set.
len(all_genres)

In [None]:
# Rank genres by frequency (highest first) and keep the ten most common.
genres_by_count = sorted(genre_freq.items(), key=lambda item: item[1], reverse=True)
top_genres = dict(genres_by_count[:10])
top_genres

In [None]:
# Bar chart of the most frequent genres, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(18, 10), dpi=50)
ax.set_title("Most Frequent Genres")
ax.set_xlabel("Genre")
ax.set_ylabel("Count")
ax.bar(top_genres.keys(), top_genres.values())
plt.show()

In [None]:
def preprocess(text: str, stop_words=None) -> str:
    """Normalise a raw plot summary before vectorisation.

    The text is lower-cased, ``<br />`` tags and ASCII punctuation are replaced
    by spaces, digits are deleted, stop words are removed and runs of spaces
    are collapsed to a single space.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : iterable of str, optional
        Words to remove. When omitted, NLTK's English stop-word list is loaded
        once and reused on later calls.

    Returns
    -------
    str
        The cleaned text. Leading and trailing spaces are not stripped.
    """
    if stop_words is None:
        cached = getattr(preprocess, "_default_stop_words", None)
        if cached is None:
            # The corpus file id is lower-case "english"; "English" only resolves
            # on case-insensitive file systems.
            cached = frozenset(stopwords.words("english"))
            # Cache on the function so the corpus is not re-read for every plot.
            preprocess._default_stop_words = cached
        stop_words = cached
    elif not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)  # constant-time membership tests

    text = text.lower()
    text = text.replace("<br />", " ")  # Remove html
    text = text.translate(str.maketrans(string.punctuation, " " * len(string.punctuation)))  # Remove punctuations
    text = re.sub(r"\d", "", text)

    # Delete every whole word that is a stop word in a single pass over the
    # text, instead of one full-text regex substitution per stop-word token.
    text = re.sub(r"\w+", lambda match: "" if match.group(0) in stop_words else match.group(0), text)

    text = re.sub(' +', ' ', text)  # Remove extra spaces
    # text = text.strip()

    return text

In [None]:
# Use positional access for the sample row: movies without genres were filtered
# out earlier without resetting the index, so the label 0 is not guaranteed to exist.
print("Before cleanup: ", movies['plot'].iloc[0])
movies['plot'] = movies['plot'].map(preprocess)
print("After cleanup: ", movies['plot'].iloc[0])

In [None]:
def create_vocab(df):
    """Count token occurrences over a collection of texts, ignoring stop words.

    Parameters
    ----------
    df : object exposing ``.values``
        Iterated via ``df.values``; each item is tokenised with
        ``word_tokenize`` (the caller in this notebook passes a pandas Series
        of plot strings).

    Returns
    -------
    dict
        Maps each non-stop-word token to the number of times it occurs.
    """
    # The corpus file id is lower-case "english"; a set gives constant-time
    # membership tests instead of scanning a list for every token.
    stopwords_english = frozenset(stopwords.words("english"))

    vocab = {}
    for plot in df.values:
        for word in word_tokenize(plot):
            if word not in stopwords_english:
                vocab[word] = vocab.get(word, 0) + 1

    return vocab

In [None]:
# Token frequencies over all plot summaries (stop words excluded by create_vocab).
vocab = create_vocab(movies['plot'])

In [None]:
# Rank tokens by frequency (highest first) and keep the ten most common.
words_by_count = sorted(vocab.items(), key=lambda item: item[1], reverse=True)
top_words = dict(words_by_count[:10])
top_words

In [None]:
# Bar chart of the most frequent words, drawn on the current Axes.
ax = plt.gca()
ax.bar(top_words.keys(), top_words.values())
ax.set_title("Most Frequent Words")
ax.set_xlabel("Word")
ax.set_ylabel("Count")
plt.show()

In [None]:
# train, test = iterative_train_test_split(movies, test_size = 0.2)
# 70/30 random split. The seed is fixed so the split - and everything fitted on
# it afterwards - is reproducible across kernel restarts.
train = movies.sample(frac=0.7, random_state=42)
test = movies.drop(train.index)  # everything that was not sampled into train
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
# Recount genre labels, this time on the training split only.
genre_freq = {}
for genre_list in train['cleaned_genre']:
    for label in genre_list:
        genre_freq[label] = genre_freq.get(label, 0) + 1

# dicts keep insertion order, so genres are listed in order of first appearance.
all_genres = list(genre_freq)

len(all_genres)

In [None]:
# Encode each movie's genre list as a 0/1 indicator row (one column per genre
# seen in the training split) and wrap the result in a tensor.
mb = MultiLabelBinarizer()
train_y = torch.tensor(mb.fit_transform(train['cleaned_genre']))
train_y.shape

In [None]:
# Learn the TF-IDF vocabulary (at most 10000 terms) on the training plots and
# vectorise them in the same step.
tfidf = TfidfVectorizer(max_features=10000)
train_x = tfidf.fit_transform(train['plot'])

In [None]:
# Sanity check: features and labels must have the same number of rows.
train_x.shape, train_y.shape   

In [None]:
# Convert the SciPy sparse TF-IDF matrix into a dense float32 torch tensor by
# going through a torch COO tensor.
coo_train = train_x.tocoo()
values = coo_train.data
indices = np.vstack((coo_train.row, coo_train.col))  # row 0: row ids, row 1: column ids
i = torch.as_tensor(indices, dtype=torch.long)
v = torch.as_tensor(values, dtype=torch.float32)
shape = coo_train.shape

# torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor constructor.
train_x = torch.sparse_coo_tensor(i, v, torch.Size(shape)).to_dense()

In [None]:
# Compare the number of training plots with the shape of the held-out test frame.
train['plot'].shape, test.shape

In [None]:
def initialize_parameters(layer_dims):
    """Create the weight and bias tensors for a fully connected network.

    Parameters
    ----------
    layer_dims : sequence of int
        Layer sizes, input layer first.

    Returns
    -------
    dict
        For every layer ``l >= 1``: ``'W{l}'`` of shape
        ``(layer_dims[l], layer_dims[l-1])`` drawn from ``torch.rand`` and
        scaled by 0.01, and ``'b{l}'`` of shape ``(layer_dims[l], 1)`` filled
        with zeros.
    """
    params = {}
    layer_pairs = zip(layer_dims[:-1], layer_dims[1:])
    for layer, (n_in, n_out) in enumerate(layer_pairs, start=1):
        params[f'W{layer}'] = torch.rand(n_out, n_in) * 0.01
        params[f'b{layer}'] = torch.zeros((n_out, 1))
    return params

In [None]:
def linear_forward(A_prev, W, b):
    """Affine part of one layer: ``Z = W @ A_prev + b``.

    Returns
    -------
    tuple
        ``(Z, cache)`` where ``cache`` is ``(A_prev, W, b)``, kept for the
        backward pass.
    """
    pre_activation = W.mm(A_prev) + b
    return pre_activation, (A_prev, W, b)

In [None]:
def linear_activation_forward(A_prev, W, b, activation):
    """Run one layer forward: affine step followed by an activation.

    Parameters
    ----------
    A_prev, W, b
        Passed unchanged to ``linear_forward``.
    activation : str
        ``"sigmoid"`` or ``"relu"``.

    Returns
    -------
    tuple
        ``(A, cache)`` where ``A`` is the activated output and ``cache`` is
        ``(linear_cache, Z)`` for the backward pass.

    Raises
    ------
    ValueError
        If ``activation`` is not a supported name (previously this surfaced as
        an UnboundLocalError).
    """
    if activation not in ("sigmoid", "relu"):
        raise ValueError(f"Unsupported activation: {activation!r}; expected 'sigmoid' or 'relu'")

    # The affine step is identical for both activations.
    Z, linear_cache = linear_forward(A_prev, W, b)
    A = torch.sigmoid(Z) if activation == "sigmoid" else torch.relu(Z)
    cache = (linear_cache, Z)

    return A, cache

In [None]:
def L_model_forward(X, params):
    """Forward pass through the whole network.

    All layers except the last use ReLU; the last layer uses a sigmoid. The
    number of layers is taken as ``len(params) // 2`` (one ``W`` and one ``b``
    entry per layer).

    Returns
    -------
    tuple
        ``(AL, caches)``: the output of the final layer and the per-layer
        caches in layer order.
    """
    num_layers = len(params) // 2
    caches = []
    activation = X

    # Hidden layers
    for idx in range(1, num_layers):
        activation, layer_cache = linear_activation_forward(
            activation, params[f'W{idx}'], params[f'b{idx}'], "relu"
        )
        caches.append(layer_cache)

    # Output layer
    output, layer_cache = linear_activation_forward(
        activation, params[f'W{num_layers}'], params[f'b{num_layers}'], "sigmoid"
    )
    caches.append(layer_cache)

    return output, caches

In [None]:
def compute_cost(AL, Y, eps=1e-7):
    """Binary cross-entropy averaged over every (sample, label) entry.

    Parameters
    ----------
    AL : torch.Tensor
        Predicted probabilities in the transposed layout of ``Y`` (the code
        transposes ``AL`` before combining it with ``Y``).
    Y : torch.Tensor
        0/1 targets.
    eps : float, optional
        Probabilities are clamped to ``[eps, 1 - eps]`` before taking logs.
        Without this, a saturated sigmoid output of exactly 0 or 1 produces
        ``0 * log(0) = nan`` and poisons the cost.

    Returns
    -------
    torch.Tensor
        Zero-dimensional tensor holding the cost.
    """
    probs = AL.transpose(0, 1).clamp(min=eps, max=1 - eps)
    log_likelihood = (Y * torch.log(probs)) + ((1 - Y) * torch.log(1 - probs))
    cost = -torch.mean(log_likelihood)

    return torch.squeeze(cost)


In [None]:
def linear_backward(dz, cache):
    """Backward pass through the affine part of one layer.

    Parameters
    ----------
    dz : torch.Tensor
        Gradient with respect to the layer's pre-activation.
    cache : tuple
        ``(A_prev, W, b)`` as stored by ``linear_forward``.

    Returns
    -------
    tuple
        ``(dA_prev, dw, db)``. ``dw`` and ``db`` are divided by
        ``A_prev.shape[1]``; ``dA_prev`` is not.
    """
    A_prev, weights, _ = cache
    n_cols = A_prev.shape[1]

    grad_w = dz.mm(A_prev.transpose(0, 1)) / n_cols
    grad_b = dz.sum(dim=1, keepdim=True) / n_cols
    grad_a_prev = weights.transpose(0, 1).mm(dz)

    return grad_a_prev, grad_w, grad_b

In [None]:
def sigmoid_backward(dA, cache):
    """Gradient through the sigmoid activation.

    ``dA`` is transposed before use, i.e. it is expected in the transposed
    layout of the cached pre-activation ``Z``.

    Returns
    -------
    torch.Tensor
        ``dA^T * s * (1 - s)`` with ``s = sigmoid(Z)``.
    """
    pre_activation = cache
    sig = 1 / (1 + torch.exp(-pre_activation))
    return dA.transpose(0, 1) * sig * (1 - sig)

In [None]:
def relu_backward(dA, cache):
    """Gradient through the ReLU activation.

    Parameters
    ----------
    dA : torch.Tensor
        Gradient with respect to the activation output. It is left untouched.
    cache : torch.Tensor
        The pre-activation ``Z`` stored during the forward pass.

    Returns
    -------
    torch.Tensor
        A new tensor equal to ``dA`` where ``Z > 0`` and zero where ``Z <= 0``.
    """
    Z = cache
    # masked_fill is out-of-place. The previous `dz = dA; dz[Z <= 0] = 0` wrote
    # through the alias and silently zeroed the caller's gradient tensor.
    dz = dA.masked_fill(Z <= 0, 0)

    return dz

In [None]:
def linear_activation_backward(dA, cache, activation):
    """Backward pass through one layer (activation, then affine step).

    Parameters
    ----------
    dA : torch.Tensor
        Gradient with respect to this layer's activation output.
    cache : tuple
        ``(linear_cache, activation_cache)`` as produced by
        ``linear_activation_forward``.
    activation : str
        ``'relu'`` or ``'sigmoid'``.

    Returns
    -------
    tuple
        ``(dA_prev, dw, db)`` from ``linear_backward``.

    Raises
    ------
    ValueError
        If ``activation`` is not a supported name (previously this surfaced as
        an UnboundLocalError).
    """
    if activation not in ('relu', 'sigmoid'):
        raise ValueError(f"Unsupported activation: {activation!r}; expected 'relu' or 'sigmoid'")

    linear_cache, activation_cache = cache

    if activation == 'relu':
        dz = relu_backward(dA, activation_cache)
    else:
        dz = sigmoid_backward(dA, activation_cache)

    dA_prev, dw, db = linear_backward(dz, linear_cache)

    return dA_prev, dw, db

In [None]:
def L_model_backward(AL, Y, caches, eps=1e-7):
    """Backward pass through the whole network.

    Parameters
    ----------
    AL : torch.Tensor
        Output of the forward pass, in the transposed layout of ``Y`` (it is
        transposed before being combined with ``Y``).
    Y : torch.Tensor
        0/1 targets.
    caches : list
        Per-layer caches returned by ``L_model_forward``.
    eps : float, optional
        ``AL`` is clamped to ``[eps, 1 - eps]`` before it is used as a divisor.
        Without this, an output that saturates to exactly 0 or 1 yields
        ``0 / 0`` or ``x / 0`` and fills the gradients with nan/inf.

    Returns
    -------
    dict
        ``'dW{l}'`` and ``'db{l}'`` for every layer ``l`` plus ``'dA{l-1}'``,
        the gradient flowing into the previous layer.
    """
    L = len(caches)
    grads = {}

    # Derivative of the cross-entropy loss with respect to the (clamped) output.
    probs = AL.transpose(0, 1).clamp(min=eps, max=1 - eps)
    dAL = -(torch.div(Y, probs) - torch.div(1 - Y, 1 - probs))

    # Output layer (sigmoid).
    grads[f'dA{L-1}'], grads[f'dW{L}'], grads[f'db{L}'] = linear_activation_backward(dAL, caches[L-1], "sigmoid")

    # Hidden layers (ReLU), from last to first.
    for l in reversed(range(L-1)):
        dA_prev, dw, db = linear_activation_backward(grads[f'dA{l+1}'], caches[l], "relu")
        grads[f'dW{l+1}'] = dw
        grads[f'db{l+1}'] = db
        grads[f'dA{l}'] = dA_prev

    return grads

In [None]:
def update_parameters(params, grads, learning_rate):
    """Apply one gradient-descent step to every weight and bias.

    The tensors in ``params`` are updated in place and the same dictionary is
    returned. The number of layers is taken as ``len(params) // 2``.
    """
    num_layers = len(params) // 2
    for layer in range(1, num_layers + 1):
        for prefix in ('W', 'b'):
            key = f'{prefix}{layer}'
            params[key] -= learning_rate * grads[f'd{key}']

    return params

In [None]:
def predict(X_test, parameters):
    """Run the network on ``X_test`` and threshold its outputs at 0.5.

    Returns
    -------
    torch.Tensor
        The network output with values below 0.5 set to 0 and values at or
        above 0.5 set to 1, with size-1 dimensions squeezed out.
    """
    outputs, _ = L_model_forward(X_test, parameters)

    # Both masks are taken from the raw outputs before anything is overwritten.
    is_negative = outputs < 0.5
    is_positive = outputs >= 0.5
    outputs[is_negative] = int(0)
    outputs[is_positive] = int(1)

    return outputs.squeeze()

In [None]:
def L_layer_model(X, Y, layers_dims, learning_rate = 0.5, num_iterations = 10, print_cost=False):
    """Train the network with full-batch gradient descent and plot the cost curve.

    Parameters
    ----------
    X, Y
        Inputs and targets, passed unchanged to the forward, cost and backward helpers.
    layers_dims : sequence of int
        Layer sizes handed to ``initialize_parameters``.
    learning_rate : float
        Step size for ``update_parameters``.
    num_iterations : int
        Number of gradient-descent steps.
    print_cost : bool
        When true, print the cost every 10th iteration.

    Returns
    -------
    dict
        The trained parameters.
    """
    parameters = initialize_parameters(layers_dims)
    cost_history = []

    for step in range(num_iterations):
        # Forward pass and loss
        AL, caches = L_model_forward(X, parameters)
        cost = compute_cost(AL, Y)
        cost_history.append(cost)

        # Backward pass and parameter update
        grads = L_model_backward(AL, Y, caches)
        parameters = update_parameters(parameters, grads, learning_rate)

        if print_cost and step % 10 == 0:
            print (f"Cost after iteration {step}: {cost}")

    # Cost per iteration, with iterations numbered from 1.
    iteration_numbers = range(1, num_iterations + 1)
    plt.title("Learning rate = " + str(learning_rate))
    plt.plot(iteration_numbers, cost_history)
    plt.xlabel("No. of iterations")
    plt.ylabel("Cost Function")
    plt.show()

    return parameters

In [None]:
# Derive the input and output widths from the data. `max_features=10000` on the
# TF-IDF vectorizer is only an upper bound, so a hard-coded 10000 breaks the
# first matrix product whenever the learned vocabulary is smaller.
n_features = train_x.shape[1]
n_labels = train_y.shape[1]
# One hidden layer with 10 units; inputs are passed as (features, samples).
params = L_layer_model(train_x.transpose(0, 1), train_y, [n_features, 10, n_labels], num_iterations=100, print_cost=True)

In [None]:
# Vectorise the held-out plots with the vocabulary learned on the training split.
test_x = tfidf.transform(test['plot'])

# Convert the SciPy sparse matrix into a dense float32 torch tensor by going
# through a torch COO tensor.
coo_test = test_x.tocoo()
values = coo_test.data
indices = np.vstack((coo_test.row, coo_test.col))  # row 0: row ids, row 1: column ids
i = torch.as_tensor(indices, dtype=torch.long)
v = torch.as_tensor(values, dtype=torch.float32)
shape = coo_test.shape

# torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor constructor.
test_x = torch.sparse_coo_tensor(i, v, torch.Size(shape)).to_dense()

In [None]:
# Shape of the dense test feature tensor.
test_x.shape