In [8]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import multiprocessing
import seaborn as sns
import pickle

from gensim.corpora.dictionary import Dictionary

# Label dimensionality reduction
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import FeatureAgglomeration
from gensim.models import LdaModel, LsiModel, RpModel
from scipy.sparse import csr_matrix

# Feature generation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# Deep Learning
import keras
import tensorflow as tf
from keras import Model
from keras.layers import Dense, Dropout, Embedding, Flatten, LSTM
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.models import Sequential
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping

# Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import jaccard_score, hamming_loss, zero_one_loss, multilabel_confusion_matrix, roc_auc_score
from sklearn.metrics import make_scorer

In [9]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [10]:
with open('gensim_dictionary.pickle', 'rb') as f:
    dictionary = pickle.load(f)
    
with open('tokenized_nostops_descriptions.pickle', 'rb') as f:
    tokenized_descriptions = pickle.load(f)

In [11]:
df = pd.read_pickle('df_cats.pickle')
df.head(1)

Unnamed: 0,id,name,description,categories_Abstract Strategy,categories_Action / Dexterity,categories_Adventure,categories_Age of Reason,categories_American Civil War,categories_American Indian Wars,categories_American Revolutionary War,...,categories_Transportation,categories_Travel,categories_Trivia,categories_Video Game Theme,categories_Vietnam War,categories_Wargame,categories_Word Game,categories_World War I,categories_World War II,categories_Zombies
0,174430,Gloomhaven,Gloomhaven is a game of Euro-inspired tactica...,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
df['list'] = df[df.columns[3:]].values.tolist()
new_df = df[['description', 'list']].copy()
new_df.head()

Unnamed: 0,description,list
0,Gloomhaven is a game of Euro-inspired tactica...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Pandemic Legacy is a co-operative campaign gam...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Through the Ages: A New Story of Civilization ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
3,"In the 2400s, mankind begins to terraform the ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"""Now the trumpet summons us again, not as a ca...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [13]:
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [14]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.description = dataframe.description
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.description)

    def __getitem__(self, index):
        description = str(self.description[index])
        description = " ".join(description.split())

        inputs = self.tokenizer.encode_plus(
            description,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [15]:

# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=new_df.sample(frac=train_size,random_state=200)
test_dataset=new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (17146, 2)
TRAIN Dataset: (13717, 2)
TEST Dataset: (3429, 2)


In [16]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [17]:

# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 83)
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [18]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [19]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [20]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [21]:
for epoch in range(EPOCHS):
    train(epoch)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


ValueError: Target size (torch.Size([8, 83])) must be the same as input size (torch.Size([8, 6]))

# Appendix

In [6]:
raw_features = df.description
labels = df.iloc[:, 3:]
print(raw_features.shape)
print(labels.shape)

(17146,)
(17146, 83)


In [7]:
# Need the tokenized descriptions as entire strings, not lists of tokens
def stringify(description):
    
    return " ".join(description)

pool = multiprocessing.Pool(multiprocessing.cpu_count()) # 

preprocessed_description_strings = pool.map(stringify, tokenized_descriptions)

pool.close()

In [8]:
# Example cleaned string
preprocessed_description_strings[0]

'gloomhaven game euro inspired tactical combat persistent world shifting motives players take role wandering adventurer special set skills reasons traveling dark corner world players must work together necessity clear menacing dungeons forgotten ruins process enhance abilities experience loot discover new locations explore plunder expand ever branching story fueled decisions make game persistent changing world ideally played many game sessions scenario players make decisions determine story continues kind like choose adventure book playing scenario cooperative affair players fight automated monsters using innovative card system determine order play player turn turn player chooses two cards play hand number top card determines initiative round card also top bottom power player turn initiative order determine whether use top power one card bottom power vice versa players must careful though time permanently lose cards hands take long clear dungeon may end exhausted forced retreat'

In [10]:
# BOW unigrams
count_vectorizer = CountVectorizer(lowercase=False, ngram_range =(1,1), max_df = .5, min_df = 100)
unigram_bow_corpus = count_vectorizer.fit_transform(preprocessed_description_strings)
unigram_bow_corpus

<17146x2261 sparse matrix of type '<class 'numpy.int64'>'
	with 911393 stored elements in Compressed Sparse Row format>

In [11]:
# BOW unigrams & bigrams
count_vectorizer = CountVectorizer(lowercase=False, ngram_range =(1,2), max_df = .5, min_df = 100)
uni_and_bigram_bow_corpus = count_vectorizer.fit_transform(preprocessed_description_strings)
uni_and_bigram_bow_corpus

<17146x2773 sparse matrix of type '<class 'numpy.int64'>'
	with 1026904 stored elements in Compressed Sparse Row format>

In [12]:
# TFIDF unigrams
tfidf_vectorizer = TfidfVectorizer(lowercase=False, ngram_range =(1,1), max_df = .5, min_df = 100)
unigram_tfidf_corpus = tfidf_vectorizer.fit_transform(preprocessed_description_strings)
unigram_tfidf_corpus

<17146x2261 sparse matrix of type '<class 'numpy.float64'>'
	with 911393 stored elements in Compressed Sparse Row format>

In [13]:
# TFIDF unigrams and bigrams
tfidf_vectorizer = TfidfVectorizer(lowercase=False, ngram_range =(1,2), max_df = .5, min_df = 100)
uni_and_bigram_tfidf_corpus = tfidf_vectorizer.fit_transform(preprocessed_description_strings)
uni_and_bigram_tfidf_corpus

<17146x2773 sparse matrix of type '<class 'numpy.float64'>'
	with 1026904 stored elements in Compressed Sparse Row format>

In [328]:
# Transforming to Pandas Series so index slicing works
preprocessed_descriptions = pd.Series(preprocessed_description_strings)

In [329]:
train_texts = preprocessed_descriptions[train_indices]
y_train = labels.iloc[train_indices, :]

test_texts = preprocessed_descriptions[test_indices]
y_test = labels.iloc[test_indices, :]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
max_length = max([len(s.split()) for s in train_texts])

In [330]:
encoded_train_texts = tokenizer.texts_to_sequences(train_texts)    
X_train = pad_sequences(encoded_train_texts, maxlen=max_length, padding='post')

encoded_test_texts = tokenizer.texts_to_sequences(test_texts)
X_test = pad_sequences(encoded_test_texts, maxlen=max_length, padding='post')

In [331]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

53868
