[View in Colaboratory](https://colab.research.google.com/github/juglar-diaz/STTD/blob/master/RepresentRNN.ipynb)

#Intro

In [0]:
!pip install -U -q PyDrive

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [3]:
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'
!pip3 install torch

Collecting torch
[?25l  Downloading https://files.pythonhosted.org/packages/49/0e/e382bcf1a6ae8225f50b99cc26effa2d4cc6d66975ccf3fa9590efcbedce/torch-0.4.1-cp36-cp36m-manylinux1_x86_64.whl (519.5MB)
[K    100% |████████████████████████████████| 519.5MB 31kB/s 
tcmalloc: large alloc 1073750016 bytes == 0x5a1c2000 @  0x7ffa5b10d1c4 0x46d6a4 0x5fcbcc 0x4c494d 0x54f3c4 0x553aaf 0x54e4c8 0x54f4f6 0x553aaf 0x54efc1 0x54f24d 0x553aaf 0x54efc1 0x54f24d 0x553aaf 0x54efc1 0x54f24d 0x551ee0 0x54e4c8 0x54f4f6 0x553aaf 0x54efc1 0x54f24d 0x551ee0 0x54efc1 0x54f24d 0x551ee0 0x54e4c8 0x54f4f6 0x553aaf 0x54e4c8
[?25hInstalling collected packages: torch
Successfully installed torch-0.4.1


In [4]:
print(accelerator)

cu80


In [0]:
from collections import Counter
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
import itertools

import torch.nn as nn
import torch.optim as optim
import random
torch.manual_seed(1)


import pickle

import os
sep = os.sep
import os.path

data = ""
import pandas as pd
import bisect
import time

In [0]:
if torch.cuda.is_available():
    inGPU = True
    _type = torch.cuda.FloatTensor
    _typelong = torch.cuda.LongTensor
    device = torch.device("cuda")
else:
    inGPU = False
    _type = torch.FloatTensor
    _typelong = torch.LongTensor
    device = torch.device("cpu")
    

#Data

In [0]:
exchangedrive = drive.CreateFile({'id':'1cMCzlvTMlUPgaYaUIdlMVtTpp8r10GnX'})
exchangedrive.GetContentFile('robosclean.p')

In [0]:
exchangedrive = drive.CreateFile({'id':'1A5Wa6LiaGs8XeW2S2qjGbcoSSlsMGW97'})
exchangedrive.GetContentFile('tweetsLAtrain.csv')
exchangedrive = drive.CreateFile({'id':'1CrUCS7oWzdvYoFwtWgikiGkDAu6w3dOF'})
exchangedrive.GetContentFile('tweetsLAtest.csv')

In [0]:
exchangedrive = drive.CreateFile({'id':'1NuSVM7-h0CCRtzi0woM4h5tVd6T7JO2C'})
exchangedrive.GetContentFile('tweetsNYtrain.csv')
exchangedrive = drive.CreateFile({'id':'1UYZY0sh1-Q8MIofMAHDGaNKv8cfDA0ui'})
exchangedrive.GetContentFile('tweetsNYtest.csv')

In [0]:
exchangedrive = drive.CreateFile({'id':'1Jv19eJTZwsZWEA_rUudvKwcn1TS-DFBa'})
exchangedrive.GetContentFile('tweets2016_2half.csv')
exchangedrive = drive.CreateFile({'id':'1E8WlhOXb3tfQbLUpizx7QXzeydv57eXN'})
exchangedrive.GetContentFile('toy_2017_Jan.csv')

In [0]:
def buildIndexData(list_elements, start_index = 0):

    idx2data = {index + start_index: discretization for index, discretization in enumerate(set(list_elements))}

    data2idx = {discretization: index for index, discretization in idx2data.items()}

    indexes = [data2idx[element] for element in list_elements]

    return indexes, data2idx, idx2data


class Discretize:
    def fit_transform(self):
        pass

    def transform(self):
        pass

    def updateIndexes(self, indexes, star_index):
        map_indexes = {value:counter+star_index for counter, value in enumerate(set(indexes))}


        new_indexes = [map_indexes[index] for index in indexes]

        idx2data = {map_indexes[index]:self.idx_data[index] for index in set(indexes)}
        data2idx = {val:key for (key, val) in idx2data.items()}

        self.idx_data = idx2data
        self.data_idx = data2idx

        return new_indexes





In [0]:
class Round(Discretize):
    def __init__(self, div= 10):
        self.div= div

    def fit_transform(self, latitudes, longitudes, start_index = 0, vocab_size = -1, vocab_min_count=0):
        lat = (latitudes * self.div).astype(int)
        lon = (longitudes * self.div).astype(int)

        discretizations = list(zip(lat, lon))

        counter = Counter(discretizations)
        if (vocab_size > 0):
            pairs = counter.most_common(vocab_size)
        else:
            pairs = list(counter.items())

        self.vocab = [keyword for keyword, count in pairs if count >= vocab_min_count]

        self.indexes, self.data_idx, self.idx_data = buildIndexData(self.vocab, start_index)



        return [self.data_idx.get(data, None) for data in discretizations], self.data_idx, self.idx_data



    def transform(self, latitudes, longitudes):
        lat = (latitudes * self.div).astype(int)
        lon = (longitudes * self.div).astype(int)

        discretizations = list(zip(lat, lon))
        #indexes = [self.data_idx.get(data, None) for data in discretizations]
        #return indexes
        return discretizations

In [0]:
class HourofDay(Discretize):
    def fit_transform(self, created_at, start_index = 0):
        indi = pd.DatetimeIndex(created_at)
        discretizations = list(indi.hour)
        self.indexes, self.data_idx, self.idx_data = buildIndexData(discretizations, start_index)
        return self.indexes, self.data_idx, self.idx_data


    def transform(self, created_at):
        indi = pd.DatetimeIndex(created_at)
        discretizations = list(indi.hour)
        #indexes = [self.data_idx.get(data,None) for data in discretizations]
        #return indexes
        return discretizations

class DayofWeak(Discretize):
    def fit_transform(self, created_at, start_index = 0):
        indi = pd.DatetimeIndex(created_at)

        discretizations = list(indi.weekday)
        self.indexes, self.data_idx, self.idx_data = buildIndexData(discretizations, start_index)
        return self.indexes, self.data_idx, self.idx_data

    def transform(self, created_at):
        indi = pd.DatetimeIndex(created_at)
        discretizations = list(indi.weekday)
        indexes = [self.data_idx.get(data,None) for data in discretizations]
        return indexes


class HourofDay_DayofWeak(Discretize):
    def fit_transform(self, created_at, start_index = 0):
        indi = pd.DatetimeIndex(created_at)

        discretizations = list(zip(indi.weekday, indi.hour))
        self.indexes, self.data_idx, self.idx_data = buildIndexData(discretizations, start_index)
        return self.indexes, self.data_idx, self.idx_data

    def transform(self, created_at):
        indi = pd.DatetimeIndex(created_at)

        discretizations = list(zip(indi.weekday, indi.hour))
        #indexes = [self.data_idx.get(data, None) for data in discretizations]
        #return indexes        
        return discretizations

In [0]:

class Indexer():
    def __init__(self):
        pass

    def fit_transform(self,
            filename,
            #time_discretizer = HourofDay,
            time_discretizer = HourofDay,
                      
            coor_discretizer = Round,
            #represent_text = RepresentText,
            dates_vocab_size = 0, dates_vocab_mincount = 0,
            places_vocab_size = 0, places_vocab_mincount = 0,
            words_vocab_size = 0, words_vocab_mincount = 0): #file_csv has columns created_at, latitude, longitude, text

        #self.datapath = "Data" + sep
        if (filename.split('.')[1] == 'csv'):
            df = pd.read_csv(filename)
        elif (filename.split('.')[1] == 'p'):
            df = pd.read_pickle(filename)

        self.time_discretizer = time_discretizer()
        self.coor_discretizer = coor_discretizer(100)
        #self.represent_text = represent_text()

        #date_out, self.date_idx, self.idx_date = self.time_discretizer.fit_transform(df['created_at'], start_index = 0)
        dates = self.time_discretizer.transform(df['created_at'])

        #self.coor_out, self.coor_idx, self.idx_coor = self.coor_discretizer.fit_transform(df['latitude'], df['longitude'],start_index=max(self.date_out) + 1)

        places = self.coor_discretizer.transform(df['latitude'], df['longitude'])

        
        texts = df['texts'].astype(str)
        words = [word for list_words in texts for word in list_words.split()]
        #print(len(texts))


        counter_dates = Counter(dates)
        if (dates_vocab_size > 0):
            pairs = counter_dates.most_common(dates_vocab_size)
        else:
            pairs = list(counter_dates.items())
        self.vocab_dates = set([date for date, count in pairs if count >= dates_vocab_mincount])


        counter_places = Counter(places)
        if (places_vocab_size > 0):
            pairs = counter_places.most_common(places_vocab_size)
        else:
            pairs = list(counter_places.items())
        self.vocab_places = set([place for place, count in pairs if count >= places_vocab_mincount])


        counter_words = Counter(words)
        if (words_vocab_size > 0):
            pairs = counter_words.most_common(words_vocab_size)
        else:
            pairs = list(counter_words.items())

        self.vocab_words = set([keyword for keyword, count in pairs if count >= words_vocab_mincount])

        filtered_dates = set([i for i in range(len(dates)) if dates[i] in self.vocab_dates ])
        filtered_places = set([i for i in range(len(places)) if places[i] in self.vocab_places])
        filtered_words = set([i for i in range(len(texts)) if any([word in self.vocab_words for word in texts[i].split()]) ])

        #print(len(filtered_dates))
        #print(len(filtered_places))
        #print(len(filtered_words))

        filtered = list(filtered_dates.intersection(filtered_places).intersection(filtered_words))
        #print(len(filtered))

        dates = [dates[i] for i in filtered]
        places = [places[i] for i in filtered]
        texts = [texts[i] for i in filtered]


        idxsdates, self.date2idx, self.idx2date = buildIndexData(dates, start_index=0)
        idxsplaces, self.place2idx, self.idx2place = buildIndexData(places, start_index=max(idxsdates) + 1)

        idxs, self.word2idx, self.idx2word = buildIndexData(self.vocab_words, start_index = max(idxsplaces) + 1)

        idxstexts = []

        for text in texts:
            indexed_text = [self.word2idx[word] for word in text.split() if word in self.vocab_words]
            idxstexts.append(indexed_text)

        self.idx2item = {}
        self.idx2item.update(self.idx2word)
        self.idx2item.update(self.idx2place)
        self.idx2item.update(self.idx2date)

        self.item2idx = {}
        self.item2idx.update(self.word2idx)
        self.item2idx.update(self.place2idx)
        self.item2idx.update(self.date2idx)

        return list(zip(idxsdates,
                             idxsplaces,
                             idxstexts))


    def transform(self, filename):
        if (filename.split('.')[1] == 'csv'):
            df = pd.read_csv( filename)
        elif (filename.split('.')[1] == 'p'):
            df = pd.read_pickle( filename)

        dates = self.time_discretizer.transform(df['created_at'])
        idxsdates = [self.date2idx.get(date, None) for date in dates]

        places = self.coor_discretizer.transform(df['latitude'], df['longitude'])
        idxsplaces = [self.place2idx.get(place, None) for place in places]

        idxstexts = []
        for text in df['texts'].astype(str):
            indexed_text = [self.word2idx[word] for word in text.split() if word in self.vocab_words]
            idxstexts.append(indexed_text)


        full_list = list(zip(idxsdates,
                             idxsplaces,
                             idxstexts))
        #print(len(full_list))
        clean_list = [(x[0], x[1], x[2]) for x in full_list if ((x[0] != None) and (x[1] != None) and (x[2] != [])) ]
        return clean_list


    def Item2index(self, item):
        return self.item2idx.get(item, -1)

    def Index2item(self, index):
        return self.idx2item.get(index, None)

    def indexes(self):
        return (self.coor_out[0], self.coor_out[-1], self.texts_out[-1])

In [15]:
%%time
indexerST = Indexer()
trainST = indexerST.fit_transform(data+'tweets2016_2half.csv', dates_vocab_mincount=0, words_vocab_mincount=100, places_vocab_mincount=10)
testST = indexerST.transform(data+'toy_2017_Jan.csv')

CPU times: user 11 s, sys: 225 ms, total: 11.2 s
Wall time: 11.2 s


In [16]:
%%time
indexerLA = Indexer()
trainLA = indexerLA.fit_transform(data+'tweetsLAtrain.csv', dates_vocab_mincount=0, words_vocab_mincount=100, places_vocab_mincount=10)
testLA = indexerLA.transform(data+'tweetsLAtest.csv')

CPU times: user 40.9 s, sys: 768 ms, total: 41.7 s
Wall time: 41.7 s


In [17]:
%%time
indexerNY = Indexer()
trainNY = indexerNY.fit_transform(data+'tweetsNYtrain.csv', dates_vocab_mincount=0, words_vocab_mincount=100, places_vocab_mincount=10)
testNY = indexerNY.transform(data+'tweetsNYtest.csv')

CPU times: user 11.5 s, sys: 83.2 ms, total: 11.6 s
Wall time: 11.6 s


In [18]:
t = torch.zeros([150,2,100])
print(t.shape)
print(torch.sum(t, dim=1).shape)

torch.Size([150, 2, 100])
torch.Size([150, 100])


In [0]:
e = nn.Embedding(10, 5)

In [20]:
a = e(torch.tensor([[1],[2]]))
a

tensor([[[-0.6092, -0.9798, -1.6091, -0.7121,  0.3037]],

        [[-0.7773, -0.2515, -0.2223,  1.6871,  0.2284]]],
       grad_fn=<EmbeddingBackward>)

In [21]:
a.shape

torch.Size([2, 1, 5])

In [22]:
c = torch.zeros(10)
c.shape

torch.Size([10])

In [23]:
t = torch.ones([3,2])
v = torch.tensor([[2.0],[1.5],[3.0]])
print(v.shape)
print(t.shape)
z = v * t
z.shape

torch.Size([3, 1])
torch.Size([3, 2])


torch.Size([3, 2])

In [24]:
emb = nn.Embedding(10, 5)
emb.weight.data[0] = torch.zeros(5)
emb.weight.data

tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0103,  0.9837,  0.8793, -0.9962, -0.8313],
        [-0.4610, -0.5601,  0.3956, -0.9823,  1.3264],
        [ 0.8547, -0.6540,  0.7317, -1.4344, -0.5008],
        [ 0.1716, -0.1600, -0.5047, -1.4746, -1.0412],
        [ 0.7323, -1.0483, -0.4709,  0.2911,  1.9907],
        [-0.9247, -0.9301,  0.8165, -0.9135,  0.2053],
        [ 0.3051,  0.5357, -0.4312,  0.1573,  1.2540],
        [ 1.3275, -0.4954, -1.9804,  1.7986,  0.1018],
        [ 0.3400, -0.6447, -0.2870,  3.3212, -0.4021]])

#Models

In [0]:
class Embed(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Embed, self).__init__()
        self.embed_dim = embedding_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
    def forward(self, input):
        return self.embedding(input)
        


class EncoderRNN(nn.Module):

    def __init__(self, input_size, hidden_size, embed, batch_size):
        super(EncoderRNN, self).__init__()
        self.embedding = embed
        self.hidden_size = hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.batch_size = batch_size
    
    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, self.batch_size, -1)
        #print(input)
        #print(embedded)
        embedded = F.relu(embedded)
        
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.batch_size, self.hidden_size, device=device)
    
      
class DecoderRNN(nn.Module):

    def __init__(self, input_size, hidden_size, embed, range_times, range_coors, range_words, batch_size):
        super(DecoderRNN, self).__init__()
        self.embedding = embed
        self.hidden_size = hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.batch_size = batch_size
        
        self.linearTimes = nn.Linear(hidden_size, max(range_times)-min(range_times)+1, bias=False)
        self.linearCoors = nn.Linear(hidden_size, max(range_coors)-min(range_coors)+1, bias=False)
        self.linearWords = nn.Linear(hidden_size, max(range_words)-min(range_words)+1, bias=False)
        
    def forward(self, inputs, hidden, target):
      
        #embeds = self.embedding(inputs).view(1, 1, -1)
        
        
        
        #embeds = embeds.view((self.batch_size,-1))
        
        
        if target == 0:
            out = self.linearTimes(hidden)
            #print(hidden)
            #print(out)

            log_probs = self.softmax(out.view(self.batch_size,-1))
            #print(log_probs)
            return log_probs#.view(log_probs.numel())  

        elif target == 1:
            out = self.linearCoors(hidden)
            log_probs = self.softmax(out.view(self.batch_size,-1))

            return log_probs#.view(log_probs.numel())            
                        

        else:#target in range_words
          
            embeds = self.embedding(inputs).view(1, self.batch_size, -1)
            #print(inputs.shape)
            #print(hidden.shape)
            embeds = F.relu(embeds)
            output, hidden = self.gru(embeds, hidden)
            #print(output.shape)
          
            output = self.softmax(self.linearWords(output[0]))
            return output, hidden
        
      
    def initHidden(self):
        return torch.zeros(1, self.batch_size, self.hidden_size, device=device)
 

In [0]:
class SimpleRNN():
    def __init__(self):
        pass

    def transform_as_nparray(self, tuples_train):
        output = []
        self.max_length = 0
        for (aux_date_idx, aux_coor_idx, text) in tuples_train:
            if(len(text) > self.max_length):
                self.max_length = len(text)
            
            for word_numb in text:
                _tuple = (aux_date_idx, aux_coor_idx, word_numb)
                output.append(_tuple)
        print(self.max_length)
        return np.array(output)
    
    def transform_as_seq(self, tuples_train):
        output = {0:[], 1:[], 2:[]}
        for _tuple in tuples_train:
                _tuple2 = _tuple[2]
                for i in range(self.max_length-len(_tuple2)):
                    _tuple2.append(self.padding)
                
                tuple = ([_tuple[0]],[_tuple[1]],_tuple[2])
                    
                for i in range(3):
                    target = [t-self.min_ranges[i] for t in tuple[i]]
                     
                    data = [k for j in range(3) if (j != i) for k in tuple[j]]
                    output[i].append((data, target))
        
        return output
    
    
    def next_batch(self):
        type_target = random.sample(range(3),1)[0]
        examples = random.sample(self.seq_train[type_target], self.batch_size)
        data = [data for data,target in examples]
        targets = [target for data,target in examples]
        if (type_target != 2):        
            return np.array(data).transpose(),np.array(targets), type_target
        else:
            return np.array(data).transpose(),np.array(targets).transpose(), type_target
          
    
    def get_ranks(self, test, predictor, predict_type = 'w'):
        self.predict_type = predict_type
        noiseList = np.random.choice(len(test), self.fake_num*len(test)).tolist()
        count = 5
        for example in test:

            scores = []
            score = predictor.predict(example[0], example[1], example[2])
            scores.append(score)


            for i in range(self.fake_num):
                noise = test[noiseList.pop()]
                if self.predict_type == 't':
                    noise_score = predictor.predict(noise[0], example[1], example[2])
                elif self.predict_type=='l':
                    noise_score = predictor.predict(example[0], noise[1], example[2])
                elif self.predict_type=='w':
                    noise_score = predictor.predict(example[0], example[1], noise[2])
                scores.append(noise_score)
            scores.sort()
    
    def fit_batch(self, tuples_train, embedding_dims, num_epochs = 1, hidden_size=100, batch_size=10):
        
        xytrain = self.transform_as_nparray(tuples_train)
        vocabulary_size = int(max(list(xytrain[:, -1]))) + 1
        
        self.padding = vocabulary_size
        vocabulary_size += 1
        
        self.batch_size = batch_size
        
        
        range_times = range(int(max(list(xytrain[:, -3]))) + 1)
        range_coors = range(int(max(list(xytrain[:, -3]))) + 1, int(max(list(xytrain[:, -2]))) + 1)
        range_words = range(int(max(list(xytrain[:, -2]))) + 1, vocabulary_size)
        self.min_ranges = [min(range_times), min(range_coors), min(range_words)]
        print(len(tuples_train))
        #tuples_train = tuples_train[:5000]
        numexamples = len(tuples_train)
        
        self.seq_train = self.transform_as_seq(tuples_train)
        
        
        
        
        losses = []
        loss_function = nn.NLLLoss()
      
        embed = Embed(vocabulary_size, embedding_dims).to(device)
        optimizer_emb = optim.Adam(embed.parameters(), lr=0.001)
        
        encoder = EncoderRNN(embedding_dims, hidden_size, embed, batch_size).to(device)
        optimizer_enc = optim.Adam(encoder.parameters(), lr=0.001)
        
        decoder = DecoderRNN(embedding_dims, hidden_size, embed, range_times, range_coors, range_words, batch_size).to(device)
        optimizer_dec = optim.Adam(decoder.parameters(), lr=0.001)
        
        
    
        for epoch in range(num_epochs):
            total_loss = 0
            #for _ in range(3):
            for _ in range(3*numexamples//self.batch_size):
                
                input_seq, target, type_target = self.next_batch()
                
                input_seq = torch.tensor(input_seq, device=device)
                target = torch.tensor(target, device=device)
                
                optimizer_dec.zero_grad()
                optimizer_enc.zero_grad() 
                #optimizer_emb.zero_grad()
                
                loss = 0
                input_length = input_seq.shape[0]
                
                #encoder_outputs = torch.zeros(input_length, encoder.hidden_size, device=device)
                
                encoder_hidden = encoder.initHidden()
                for ei in range(input_length):
                    encoder_output, encoder_hidden = encoder(input_seq[ei], encoder_hidden)
                    #encoder_outputs[ei] = encoder_output[0, 0]
                
                decoder_hidden = encoder_hidden
                decoder_input = input_seq[-1]#[input_length-1].view(1, self.batch_size, -1)
                
                if (type_target == 2):
                    target_length = input_seq.shape[0]
                    decoder_outputs = torch.zeros(target_length, decoder.hidden_size, device=device)
                    target_seq = torch.tensor(target, device=device)
                  
                  
                    for di in range(target_length):
                        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, type_target)
                        loss += loss_function(decoder_output, target_seq[di])
                        decoder_input = target[di] 
                        decoder_outputs[di] = decoder_output[0, 0]
                else:
                    log_probs = decoder(decoder_input, decoder_hidden, type_target)
                  
                    loss = loss_function(log_probs, target.view(target.numel()))
                
                loss.backward()
                total_loss += loss.item()
                optimizer_dec.step()
                optimizer_enc.step() 
                #optimizer_emb.step()
                
                encoder_hidden = encoder_hidden.detach()
            #if epoch % (num_epochs//10 +1) == 0:
            #  print(total_loss/numexamples)
            print(total_loss/numexamples)
        self.embed = embed
        self.encoder = encoder
        self.decoder = decoder
    
    def predict(self, tuples_test, batch_test=1000, predict_type = 't'):
        batch_test = self.batch_size
        self.seq_test = self.transform_as_seq(tuples_test)
        print(len(self.seq_test[0]))
        
        
        self.predict_type = predict_type
        convert = {'w':2,'l':1,'t':0}
        
        encoder = self.encoder
        decoder = self.decoder
        
        type_target = convert[predict_type]
        multiple  = (len(self.seq_test[type_target])//batch_test)*batch_test
        #multiple = batch_test
        it = iter(self.seq_test[type_target][:multiple])
        results = []
        results_targets = []
        while True:
            #examplestest = list(itertools.islice(it, 10))
            #print(examplestest)
            
            examples = list(itertools.islice(it, batch_test))
            if examples:
                
                data = [data for data,target in examples]
                targets = [target for data,target in examples]

                if (type_target != 2):        
                    input_seq = np.array(data).transpose()
                    target = np.array(targets)
                
                else:
                    input_seq = np.array(data).transpose()
                    target = np.array(targets).transpose()
                #print(input_seq)
                input_seq = torch.tensor(input_seq, device=device)
                target = torch.tensor(target, device=device)

                input_length = input_seq.shape[0]

                encoder_hidden = encoder.initHidden()
                for ei in range(input_length):
                    encoder_output, encoder_hidden = encoder(input_seq[ei], encoder_hidden)
                    #encoder_outputs[ei] = encoder_output[0, 0]

                decoder_hidden = encoder_hidden
                decoder_input = encoder_output#[input_length-1].view(1, self.batch_size, -1)

                if (type_target == 2):
                    target_length = input_seq.shape[0]
                    decoder_outputs = torch.zeros(target_length, decoder.hidden_size, device=device)
                    target_seq = torch.tensor(target, device=device)


                    for di in range(target_length):
                        input = target_seq[di]
                        decoder_output, decoder_hidden = decoder(input, decoder_hidden, type_target)
                        loss += loss_function(decoder_output, input)
                        #decoder_input = target[di] 
                        decoder_outputs[di] = decoder_output[0, 0]
                else:
                    log_probs = decoder(decoder_input, decoder_hidden, type_target)
                    results.append(log_probs)
                    #r = range(log_probs.shape[0])
                    #z = zip(r,targets)
                    #l = list(z)
                    #torch.tensor(l)
                    results_targets.append(targets)

                encoder_hidden = encoder_hidden.detach()
            
            else:
                break
   
        return results, results_targets
        
        
        
    def mrr(self,results,results_targets):
        
        all_scores = []
        all_score = []
        fake = 10
        for i in range(len(results)):
            for j in range(results[i].shape[0]):
                all_score.append(results[i][j][results_targets[i][j]].item())
                l = [results[i][j][k].item()  for k in random.choices(range(results[i].shape[1]),k=fake)]
                l.append(results[i][j][results_targets[i][j]].item())
                l.sort()
                all_scores.append(l)
        
        
        r = []
        for i in range(len(all_score)):
            score = all_score[i]
            scores = all_scores[i]
            rank = len(scores)+1-(bisect.bisect_left(scores,score)+bisect.bisect_right(scores,score)+1)/2.0
            r.append(rank)
    
        reciprocal_ranks = [1/rank for rank in r]
        mrr = sum(reciprocal_ranks)/len(reciprocal_ranks)
        mr = sum(r)/len(r)
        return round(mrr,4), round(mr,4)

In [0]:
%%time
translate = {'w':'Text','l':'Location','t':'Time'}
trainsets = [trainST, trainLA, trainNY]
testsets = [testST, testLA, testNY]
namesets = ['Santiago','LA','NY']
#results = {'Model':[],'Dataset':[],  'Text':[], 'Location':[],'Time':[]}
results = {'Model':[],'Dataset':[], 'Location':[],'Time':[]}

for train,test,name in list(zip(trainsets, testsets, namesets))[:1]:
    
    dims = [100]
    batchs = [250]
    
    
    for dim in dims:
        for batch in batchs:
            model = SimpleRNN()
            model.fit_batch(train, embedding_dims=dim, num_epochs=7, hidden_size=dim, batch_size=batch)

            for predict_type in 'lt':
                resultsp, results_targets = model.predict(test, predict_type=predict_type)
                mrr1 = model.mrr(resultsp, results_targets)[0]
                print(mrr1)
                print("mrr")

                results[translate[predict_type]].append(mrr1)

            results['Dataset'].append(name)

            results['Model'].append(str(type(model))+'_'+str())

            df = pd.DataFrame(results)

            df.to_csv('resultsemb2.df')

16
251115
0.08206666863392116
0.07065262980489889


In [57]:
%%time
translate = {'w':'Text','l':'Location','t':'Time'}
trainsets = [trainST, trainLA, trainNY]
testsets = [testST, testLA, testNY]
namesets = ['Santiago','LA','NY']
#results = {'Model':[],'Dataset':[],  'Text':[], 'Location':[],'Time':[]}
results = {'Model':[],'Dataset':[], 'Location':[],'Time':[]}

for train,test,name in list(zip(trainsets, testsets, namesets))[2:]:
    
    dims = [50,100]
    
    
    
    batchs = [1000,500,250,100]
    
    
    for dim in dims:
        for batch in batchs:
            model = SimpleRNN()
            model.fit_batch(train, embedding_dims=dim, num_epochs=40, hidden_size=dim, batch_size=batch)

            for predict_type in 'lt':
                resultsp, results_targets = model.predict(test, predict_type=predict_type)
                mrr1 = model.mrr(resultsp, results_targets)[0]
                print(mrr1)
                print("mrr")

                results[translate[predict_type]].append(mrr1)

            results['Dataset'].append(name)

            results['Model'].append(str(type(model))+'_'+str())

            df = pd.DataFrame(results)

            df.to_csv('resultsemb2.df')

14
317707
0.014784225208567564
0.009671279903598395
0.008829599864243146
0.008529818023160296
0.008030825267029962
0.007937913273384247
0.007564301380011189
0.007681143929259549
0.007110372002230767
0.00742339736437141
0.007242952342258002
0.007393056970974127
0.007092512420357726
0.0073772167003076225
0.007148606751725535
0.0071383064724915195
0.007182187239811504
0.007275316066649539
0.007184540687862065
0.006851347661944127
0.007012828193746939
0.006870763007013571
0.006773727100554582
0.0070534727674710996
0.0069991381044311055
0.007090403544054323
0.007004500174339516
0.006939810087970884
0.006769300000126148
0.006692421043047152
0.007051509418620284
0.007059902679860649
0.006967499979690332
0.006744425429981232
0.0068245530169767825
0.006968659727145203
0.007071946683234799
0.0068362002728012804
0.006796474252551612
0.006916968281852082
77746
0.9298
mrr
77746
0.4281
mrr
14
317707
0.025351865777940446


KeyboardInterrupt: ignored

In [68]:
%%time
translate = {'w':'Text','l':'Location','t':'Time'}
trainsets = [trainST, trainLA, trainNY]
testsets = [testST, testLA, testNY]
namesets = ['Santiago','LA','NY']
#results = {'Model':[],'Dataset':[],  'Text':[], 'Location':[],'Time':[]}
results = {'Model':[],'Dataset':[], 'Location':[],'Time':[]}

for train,test,name in list(zip(trainsets, testsets, namesets))[2:]:
    
    dims = [50,100]
    
    
    
    batchs = [1000,500,250,100]
    
    
    for dim in dims:
        for batch in batchs:
            model = SimpleRNN()
            model.fit_batch(train, embedding_dims=dim, num_epochs=10, hidden_size=dim, batch_size=batch)

            for predict_type in 'lt':
                resultsp, results_targets = model.predict(test, predict_type=predict_type)
                mrr1 = model.mrr(resultsp, results_targets)[0]
                print(mrr1)
                print("mrr")

                results[translate[predict_type]].append(mrr1)

            results['Dataset'].append(name)

            results['Model'].append(str(type(model))+'_'+str())

            df = pd.DataFrame(results)

            df.to_csv('resultsemb2.df')

14
317707
0.011613858806086126
0.00720379506351147
0.006666679903151712
0.005977282488135828
0.005777304285767592
0.005420811765169754
0.005332260364128921
0.005289745244325139
0.00536077715498667
0.00503271908312541
77746
0.9265
mrr
77746
0.4118
mrr
14
317707
0.020096860270693278
0.013482907833978975
0.01203330023458646
0.011510590771188668
0.010767187888385857
0.010640679807543662
0.010295348789665742
0.01019478900879058
0.009933253941012613
0.01017891483327144
77746
0.9283
mrr
77746
0.4146
mrr
14
317707
0.03381158618253653
0.02386464756136601
0.021793696180801667
0.020861264540544575
0.0204247453904059
0.019747836933905066
0.019525646088226756
0.019155810782660373
0.01923723460010268
0.018833513446348725
77746
0.9314
mrr
77746
0.4153
mrr
14
317707
0.07295304885134635
0.05499347818580456
0.05080554072981617
0.05039433144155547
0.04873439414689478
0.04837037368766092
0.047272187548358324
0.04686696527477374
0.04736675702799198
0.0461671987350877
77746
0.9329
mrr
77746
0.4145
mrr
14
31

RuntimeError: ignored

In [26]:
df.head()

Unnamed: 0,Dataset,Location,Model,Time
0,NY,0.3437,<class '__main__.SimpleRNN'>_,0.3024
