### In this script we transform the action categories into word embeddings and conduct weakly supervised learning

In [114]:
# Load arguments
from utils.video_dataset import Dataset

import re
import torch
import string
import pickle
import pandas as pd
import utils.options
import torch.nn as nn
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

class Args:
    """Hard-coded stand-in for the command-line options of the experiment.

    Every option is stored as a plain instance attribute so the object can be
    passed wherever a parsed-arguments namespace is expected.
    """

    def __init__(self):
        # Optimisation settings (presumably the learning rate).
        self.lr = 0.0001

        # Dataset description.
        self.dataset_name = 'Thumos14reduced'
        self.num_class = 20
        self.feature_size = 2048

        # Batching / sequence limits.
        self.batch_size = 24
        self.max_seqlen = 750

        # Model identification and optional checkpoint to resume from.
        self.model_name = 'weakloc'
        self.pretrained_ckpt = None

        # Training schedule.
        self.max_iter = 50000
        self.num_similar = 3

        # Locations on disk.
        self.checkpoint_path = './checkpoint/'
        self.annotation_path = './annotations/'
        self.I3D_path = './I3D_features/'


# Category to sentence: maps each 1-based action class id to one or more
# natural-language phrases describing that action. The phrases are meant to be
# turned into word embeddings, so every word should be spelled correctly.
class_name = {
    1: ["baseball pitch", "throw a baseball", "baseball throw"],
    2: ["basketball dunk", "dunk a basketball", "slam dunk basketball"],
    3: ["billiards"],
    4: ["clean and jerk", "weight lifting movement"],
    5: ["cliff diving", "high diving", "diving"],
    6: ["cricket shot"],
    # "movment" was a typo; a misspelled token cannot match the intended word.
    7: ["cricket bowling", "cricket movement", "bowl cricket"],
    8: ["diving", "jumping into water", "falling into water"],
    9: ["frisbee catch", "catch frisbee"],
    10: ["golf swing", "golf stroke"],
    11: ["hammer throw", "throw a hammer"],
    12: ["high jump"],
    13: ["javelin throw", "throw a spear"],
    14: ["long jump"],
    15: ["pole vault", "a person uses a long flexible pole to jump over a bar"],
    16: ["shot put"],
    17: ["soccer penalty"],
    18: ["tennis swing"],
    19: ["throw discus", "discus"],
    20: ["volleyball spiking", "volleyball"],
}

In [115]:
# Instantiate the hard-coded experiment configuration.
args = Args()

# Load the dataset (the project's Dataset wrapper is built from the options).
dataset = Dataset(args)

# Word embedding loading from GloVe: the embeddings are read from a pickle
# file under ./checkpoint/.
path_to_glove = './checkpoint/glove.840B.300d.pkl'

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open(path_to_glove, "rb") as input_file:
    glove_model = pickle.load(input_file)

In [104]:
# Build up the language model and LSTM

class Visual_model(nn.Module):
    """Per-segment attention over video features with weighted temporal pooling.

    A three-layer MLP (feature_dim -> 1024 -> 256 -> 1) followed by a sigmoid
    scores every segment of a video; the scores are then used to pool the
    segment features into a single vector per video.

    Args:
    feature_dim: dimension of the feature from I3D model.
    """

    def __init__(self, feature_dim):
        super(Visual_model, self).__init__()

        self.feature_dim = feature_dim
        self.fc0 = nn.Linear(feature_dim, 1024)
        self.fc1 = nn.Linear(1024, 256)
        self.relu = nn.ReLU(inplace=True)
        self.fc2 = nn.Linear(256, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, features_list):
        """Build the attention module.

        Args:
        features_list: (batch_size, num_frame, feat_depth)

        Returns:
        A tuple ``(attention_weights, weighted_features)``:
        attention_weights is a list with one 1-D tensor per video holding the
        sigmoid score of each kept segment; weighted_features is a stacked
        tensor with one pooled feature vector per video.
        """

        attention_weights = []
        weighted_features = []

        # Iterate through batch
        for video_features in features_list:

            # Truncate the features to the real length: count the rows that
            # contain at least one non-zero value and keep that many leading
            # rows (assumes zero-padding is appended at the end — TODO confirm).
            seq_len = int((torch.abs(video_features).max(dim=1)[0] > 0).sum())
            video_features = video_features[:seq_len, :]

            # Score every remaining segment with the MLP + sigmoid.
            output = self.sigmoid(self.fc2(self.relu(self.fc1(self.relu(self.fc0(video_features))))))

            # Temporal pool: attention-weighted mean over the kept segments.
            # The divisor is clamped to 1 so that a video with no non-zero
            # segment pools to a zero vector instead of 0/0 = NaN.
            num_segments = max(video_features.shape[0], 1)
            weighted_pooling = (output * video_features).sum(0) / num_segments

            # Save weights/features (weights flattened from (n, 1) to (n,)).
            attention_weights.append(output.reshape(output.shape[0]))
            weighted_features.append(weighted_pooling)

        # Reshape to tensor
        weighted_features = torch.stack(weighted_features)

        return attention_weights, weighted_features



class Language_encoder(nn.Module):
    """Text front-end that holds the GloVe table and normalises raw sentences.

    Args:
    path_to_glove: path of the pickled GloVe embeddings to load.
    """

    def __init__(self, path_to_glove='./checkpoint/glove.840B.300d.pkl'):
        """Load the GloVe pre-trained model and create the lemmatizer."""
        super().__init__()
        # NOTE(review): pickle.load can execute arbitrary code; the file must
        # come from a trusted source.
        with open(path_to_glove, "rb") as glove_file:
            self.glove = pickle.load(glove_file)
        self.wordnet_lemmatizer = WordNetLemmatizer()

    def language_preprocess(self, input_str):
        """Turn a raw sentence into a list of lemmatized content words.

        The text is lower-cased, stripped of digits and punctuation, trimmed,
        tokenized, filtered against the English stop-word list and finally
        lemmatized with WordNet.
        """
        # Lower-case, then drop digits, then drop anything that is neither a
        # word character nor whitespace, then trim the ends.
        cleaned = re.sub(r'\d+', '', input_str.lower())
        cleaned = re.sub(r'[^\w\s]', '', cleaned).strip()

        lemmas = []
        for token in word_tokenize(cleaned):
            # Skip stop words; everything else is lemmatized (not stemmed).
            if token in ENGLISH_STOP_WORDS:
                continue
            lemmas.append(self.wordnet_lemmatizer.lemmatize(token))
        return lemmas

    def forward(self, text):
        """Return the preprocessed token list for ``text``."""
        return self.language_preprocess(text)



class Model(nn.Module):
    """Joint model combining the visual attention branch and the text encoder.

    Args:
    feature_dim: dimension of the feature from I3D model.
    """

    def __init__(self, feature_dim):
        super(Model, self).__init__()
        # Visual_model requires the feature dimension; omitting it raised a
        # TypeError as soon as a Model was constructed.
        self.visual_model = Visual_model(feature_dim)
        # Uses the encoder's default GloVe path.
        self.language_encoder = Language_encoder()

    def forward(self, features_list, text):
        """Build model architecture.
           Consists of two parts: Visual Model + Language Encoder.

        Args:
        features_list: video features forwarded unchanged to the visual model.
        text: raw sentence forwarded unchanged to the language encoder.

        Returns:
        A tuple ``(visual_feature, word_vector)`` holding the outputs of the
        visual model and of the language encoder, in that order.
        """
        visual_feature = self.visual_model(features_list)
        word_vector = self.language_encoder(text)

        return visual_feature, word_vector

In [105]:
# Instantiate the language encoder with its default GloVe path; the
# constructor unpickles the embedding file, so this cell performs file I/O.
l = Language_encoder()

In [108]:
import torch
import torch.nn as nn
from torch.autograd import Variable

# Minimal LSTM smoke test: run a random sequence through a 2-layer LSTM,
# classify from the last time step and back-propagate a cross-entropy loss.
time_steps = 10
batch_size = 3
in_size = 5
classes_no = 7

# The hidden size doubles as the number of classes, so the LSTM output can be
# fed to the loss directly.
model = nn.LSTM(in_size, classes_no, 2)

# Plain tensors are sufficient: the Variable wrapper is deprecated and no
# longer needed for autograd.
input_seq = torch.randn(time_steps, batch_size, in_size)
output_seq, _ = model(input_seq)
last_output = output_seq[-1]

loss = nn.CrossEntropyLoss()
# Draw targets from every class 0..classes_no-1. The previous
# random_(0, classes_no-1) call had an exclusive upper bound and therefore
# never produced the last class.
target = torch.randint(0, classes_no, (batch_size,))
err = loss(last_output, target)
err.backward()

In [112]:
# Here we define our model as a class
class LSTM(nn.Module):
    """Stacked LSTM followed by a linear read-out of the last time step.

    Args:
    input_dim: size of each input feature vector.
    hidden_dim: size of the LSTM hidden state.
    batch_size: fixed batch size used to reshape inputs and outputs.
    output_dim: size of the linear layer's output (default 1).
    num_layers: number of stacked LSTM layers (default 2).
    """

    def __init__(self, input_dim, hidden_dim, batch_size, output_dim=1,
                 num_layers=2):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = num_layers

        # Recurrent part.
        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers)

        # Read-out applied to the final time step.
        self.linear = nn.Linear(self.hidden_dim, output_dim)

    def init_hidden(self):
        """Return a zero-filled ``(h, c)`` pair of shape (num_layers, batch_size, hidden_dim)."""
        state_shape = (self.num_layers, self.batch_size, self.hidden_dim)
        return (torch.zeros(*state_shape), torch.zeros(*state_shape))

    def forward(self, input):
        """Run the sequence through the LSTM and project its last step.

        The input is viewed as (len(input), batch_size, -1) before entering
        the LSTM; the final hidden/cell state is stored on ``self.hidden``.
        The prediction is returned flattened to one dimension.
        """
        sequence = input.view(len(input), self.batch_size, -1)
        lstm_out, self.hidden = self.lstm(sequence)

        # Keep only the output of the final time step; the whole lstm_out
        # could be passed on instead for a seq2seq prediction.
        final_step = lstm_out[-1].view(self.batch_size, -1)
        y_pred = self.linear(final_step)
        return y_pred.view(-1)

# Hyper-parameters for the LSTM wrapper defined above.
lstm_input_size = 200
hidden_dim = 300
# Passed as the model's fixed batch size below.
num_train = 10
output_dim = 20
num_layers = 2

# Build the model; note that this rebinds the name `model`.
model = LSTM(lstm_input_size, hidden_dim, batch_size=num_train, output_dim=output_dim, num_layers=num_layers)