In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
base_path = '/content/drive/My Drive/新生訓練/新訓HW3'

## Text Preprocessing

In [4]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
from ast import literal_eval
import pandas as pd
import numpy as np
import re

In [6]:
def read_data(filename):
    data = pd.read_csv(filename, sep='\t')
    data['tags'] = data['tags'].apply(literal_eval)  # str to list
    return data

In [7]:
train = read_data(base_path+'/train.tsv')
validation = read_data(base_path+'/validation.tsv')
test = pd.read_csv(base_path+'/test.tsv', sep='\t')

In [8]:
train.head()

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,[r]
1,mysql select all records where a datetime fiel...,"[php, mysql]"
2,How to terminate windows phone 8.1 app,[c#]
3,get current time in a specific country via jquery,"[javascript, jquery]"
4,Configuring Tomcat to Use SSL,[java]


In [9]:
X_train, y_train = train['title'].values, train['tags'].values
X_val, y_val = validation['title'].values, validation['tags'].values
X_test = test['title'].values

#### Task 1 (TextPrepare).
#### Implement the function text_prepare following the instructions.text_prepare

In [10]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """

    # lowercase text
    text = text.lower()
    # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = re.sub(REPLACE_BY_SPACE_RE, ' ', text)
    # delete symbols which are in BAD_SYMBOLS_RE from text
    text = re.sub(BAD_SYMBOLS_RE, '', text)
    # delete stopwords from
    word_tokens = word_tokenize(text)
    filtered_sentence = [] 
    for w in word_tokens:
        if w not in STOPWORDS:
            filtered_sentence.append(w)
            
    return ' '.join(filtered_sentence)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [11]:
def test_text_prepare():
    examples = ["SQL Server - any equivalent of Excel's CHOOSE function?",
                "How to free c++ memory vector<int> * arr?"]
    answers = ["sql server equivalent excels choose function", 
               "free c++ memory vectorint arr"]
    for ex, ans in zip(examples, answers):
        if text_prepare(ex) != ans:
            print(text_prepare(ex))
            return "Wrong answer for the case: '%s'" % ex
    return 'Basic tests are passed.'

In [12]:
print(test_text_prepare())

Basic tests are passed.


#### Task 2 (WordsTagsCount)
#### Find 3 most popular tags and 3 most popular words in the train data 

In [13]:
# Dictionary of all tags from train corpus with their counts.
tags_counts = {}
# Dictionary of all words from train corpus with their counts.
words_counts = {}

#words_counts are dictionaries like {'some_word_or_tag': frequency}. 

str_flatten =' '.join(X_train)
list_flatten = str_flatten.split()
from collections import Counter
counter = Counter(list_flatten)
words_counts = {token:count for token,count in counter.items()}

list_tags = []
for lst in y_train:
    for ele in lst:
        list_tags.append(ele)
counter = Counter(list_tags)
tags_counts = {token:count for token,count in counter.items()}


In [14]:
list_tags[:10]

['r',
 'php',
 'mysql',
 'c#',
 'javascript',
 'jquery',
 'java',
 'ruby-on-rails',
 'ruby',
 'ruby-on-rails-3']

In [15]:
most_common_tags = sorted(tags_counts.items(), key=lambda x: x[1], reverse=True)[:3]
print(most_common_tags)
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:3]
print(most_common_words)

[('javascript', 19078), ('c#', 19077), ('java', 18661)]
[('to', 34778), ('in', 30322), ('a', 24056)]


## Transforming text to a vector
#### Machine Learning algorithms work with numeric data and we cannot use the provided text data "as is". There are many ways to transform text data to numeric vectors. In this task you will try to use two of them.

### Bag of words
#### One of the well-known approaches is a bag-of-words representation. To create this transformation, follow the steps:



1.   Find N most popular words in train corpus and numerate them. Now we have a dictionary of the most popular words.
2.  For each title in the corpora create a zero vector with the dimension equals to N.
3.  For each text in the corpora iterate over words which are in the dictionary and increase by 1 the corresponding coordinate.



In [16]:
# Implement the described encoding in the function my_bag_of_words with the size of the dictionary equals to 5000. 
DICT_SIZE = 5000
common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:DICT_SIZE] # Training Data中前5000個最常見的字及出現次數
INDEX_TO_WORDS = {index:word for index, word in enumerate(sorted(word for word,count in common_words))} # 把words做編號
WORDS_TO_INDEX = {word:index for index, word in enumerate(sorted(word for word,count in common_words))} # 反過來
ALL_WORDS = WORDS_TO_INDEX.keys() # 取得words

def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.zeros(dict_size)
    text_list = text.split()
    #print(text_list)
    for word in text_list:
        if word in words_to_index:
            result_vector[words_to_index[word]] = 1
    return result_vector

In [17]:
from itertools import islice

def take(n, iterable):
    return list(islice(iterable, n))

print(take(10, INDEX_TO_WORDS.items()))
print(take(10, WORDS_TO_INDEX.items()))

[(0, '!='), (1, '"'), (2, '"Cannot'), (3, '"Could'), (4, '"Error:'), (5, '"Failed'), (6, '"Invalid'), (7, '"No'), (8, '"Object'), (9, '"The')]
[('!=', 0), ('"', 1), ('"Cannot', 2), ('"Could', 3), ('"Error:', 4), ('"Failed', 5), ('"Invalid', 6), ('"No', 7), ('"Object', 8), ('"The', 9)]


In [18]:
def test_my_bag_of_words():
    words_to_index = {'hi': 0, 'you': 1, 'me': 2, 'are': 3}
    examples = ['hi how are you']
    answers = [[1, 1, 0, 1]]
    for ex, ans in zip(examples, answers):
        if (my_bag_of_words(ex, words_to_index, 4) != ans).any():
            return "Wrong answer for the case: '%s'" % ex
    return 'Basic tests are passed.'

In [19]:
print(test_my_bag_of_words())

Basic tests are passed.


# 把my_bag_of_words應用到training data上

In [20]:
import scipy
from scipy import sparse as sp_sparse

In [21]:
# 1. 轉vector
# 2. 轉稀疏矩陣(csr_matrix)
# 3. 拼接(vstack)
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])
print('X_train shape ', X_train_mybag.shape)
print('X_val shape ', X_val_mybag.shape)
print('X_test shape ', X_test_mybag.shape)

X_train shape  (100000, 5000)
X_val shape  (30000, 5000)
X_test shape  (20000, 5000)


### Task 3 (BagOfWords). 
#### For the 11th row in X_train_mybag find how many non-zero elements it has.

In [22]:
row = X_train_mybag[10].toarray()[0]
non_zero_elements_count = (row > 0).sum()
print(non_zero_elements_count)

9


### TF-IDF
可自己寫或CALL套件

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
def tfidf_features(X_train, X_val, X_test):
    """
        X_train, X_val, X_test — samples        
        return TF-IDF vectorized representation of each sample and vocabulary
    """
    # Create TF-IDF vectorizer with a proper parameters choice
    # Fit the vectorizer on the train set
    # Transform the train, test, and val sets and return the result

    tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2), token_pattern='(\S+)') #(\S+)是為了保留c++跟c#

    X_train = tfidf_vectorizer.fit_transform(X_train)
    X_val = tfidf_vectorizer.transform(X_val)
    X_test = tfidf_vectorizer.transform(X_test)
    return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_

In [25]:
X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, X_test)
tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}

check whether you have c++ or c# in your vocabulary, as they are obviously important tokens in our tags prediction task:

In [26]:
if 'c++' in tfidf_vocab.keys():
  print('c++')
if 'c#' in tfidf_vocab.keys():
  print('c#')

c++
c#


If you can't find it, we need to understand how did it happen that we lost them? It happened during the built-in tokenization of TfidfVectorizer. Luckily, we can influence on this process. Get back to the function above and use '(\S+)' regexp as a token_pattern in the constructor of the vectorizer.

## MultiLabel classifier


*   compare the quality of the bag-of-words and TF-IDF approaches 
- 怎麼設計都可以



In [27]:
from sklearn.preprocessing import MultiLabelBinarizer

In [28]:
# 將tag做one-hot
mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))
y_train = mlb.fit_transform(y_train)
y_val = mlb.fit_transform(y_val)

In [29]:
print(y_train)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [30]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import RidgeClassifier

In [31]:
def train_classifier(X_train, y_train):
    # Create and fit LogisticRegression wraped into OneVsRestClassifier.
    clf = OneVsRestClassifier(RidgeClassifier(normalize=True))
    clf.fit(X_train, y_train)
    return clf

In [32]:
classifier_mybag = train_classifier(X_train_mybag, y_train)
classifier_tfidf = train_classifier(X_train_tfidf, y_train)

In [33]:
y_val_predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)
y_val_predicted_labels_tfidf = classifier_tfidf.predict(X_val_tfidf)

In [34]:
print(mlb.inverse_transform(y_val_predicted_labels_tfidf))
print(mlb.inverse_transform(y_val))

[(), (), ('ruby-on-rails',), (), (), (), (), ('python',), ('javascript', 'jquery'), ('hibernate', 'java'), ('c#',), (), ('php',), (), (), ('wpf',), (), ('javascript', 'twitter-bootstrap'), (), (), ('javascript', 'php'), ('python',), (), ('android',), (), ('java',), ('django', 'python'), ('java',), (), ('java',), ('javascript',), ('java',), (), (), ('c',), (), ('r',), ('java',), ('c#', 'linq'), ('javascript',), (), ('objective-c',), (), ('c#', 'linq'), (), (), ('php',), ('java',), (), (), ('php',), (), (), ('php',), (), ('django',), ('python',), ('c++',), ('java',), (), ('ruby-on-rails',), ('javascript',), (), (), ('java',), ('c++',), (), (), ('javascript',), ('c#', 'linq'), ('pandas', 'python'), ('javascript',), ('css', 'javascript'), (), ('java',), ('php',), ('sql',), ('c#',), (), ('java',), (), ('android', 'java'), ('php',), ('ajax', 'php'), (), ('google-maps', 'javascript'), (), ('mysql',), ('java',), ('java', 'spring'), (), ('sorting',), (), (), ('java',), (), ('c#', 'wpf'), ('asp.

In [35]:
y_val_pred_inversed = mlb.inverse_transform(y_val_predicted_labels_tfidf)
y_val_inversed = mlb.inverse_transform(y_val)
for i in range(3):
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_val[i],
        ','.join(y_val_inversed[i]),
        ','.join(y_val_pred_inversed[i])
    ))

Title:	Why odbc_exec always fail?
True labels:	php,sql
Predicted labels:	


Title:	Access a base classes variable from within a child class
True labels:	javascript
Predicted labels:	


Title:	Content-Type "application/json" not required in rails
True labels:	ruby,ruby-on-rails
Predicted labels:	ruby-on-rails




#### Evaluation


*   accuracy
*   F1-score macro/micro
* Precision macro/micro



In [36]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [37]:
def print_evaluation_scores(y_val, predicted):
    print(accuracy_score(y_val, predicted))
    print(f1_score(y_val, predicted, average='weighted'))
    print(average_precision_score(y_val, predicted))

In [38]:
print('Bag-of-words')
print_evaluation_scores(y_val, y_val_predicted_labels_mybag)
print('Tfidf')
print_evaluation_scores(y_val, y_val_predicted_labels_tfidf)

Bag-of-words
0.1636
0.3564858143175229
0.10489445524252218
Tfidf
0.23706666666666668
0.4819196928016005
0.20358877147781837


## Word2Vec

#### ex: mean(word embeddings) --> MLP
#### ex: word embeddings --> LSTM
#### BoW做的同樣事情再做一次

In [39]:
train = read_data(base_path+'/train.tsv')
validation = read_data(base_path+'/validation.tsv')
test = pd.read_csv(base_path+'/test.tsv', sep='\t')

In [40]:
X_train, y_train = train['title'].values, train['tags'].values
X_val, y_val = validation['title'].values, validation['tags'].values
X_test = test['title'].values

In [41]:
import json
import pandas as pd
import numpy as np
## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
## for processing
import re
import nltk
## for bag-of-words
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing
## for word embedding
import gensim
import gensim.downloader as gensim_api
## for deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K

In [42]:
train.head()

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,[r]
1,mysql select all records where a datetime fiel...,"[php, mysql]"
2,How to terminate windows phone 8.1 app,[c#]
3,get current time in a specific country via jquery,"[javascript, jquery]"
4,Configuring Tomcat to Use SSL,[java]


In [43]:
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [44]:
lst_stopwords = nltk.corpus.stopwords.words("english")
lst_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [45]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [46]:
train["text_clean"] = train["title"].apply(lambda x: 
          utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, lst_stopwords=lst_stopwords))
validation["text_clean"] = validation["title"].apply(lambda x: 
          utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, lst_stopwords=lst_stopwords))
train.head()

Unnamed: 0,title,tags,text_clean
0,How to draw a stacked dotplot in R?,[r],draw stacked dotplot r
1,mysql select all records where a datetime fiel...,"[php, mysql]",mysql select record datetime field le specifie...
2,How to terminate windows phone 8.1 app,[c#],terminate window phone 81 app
3,get current time in a specific country via jquery,"[javascript, jquery]",get current time specific country via jquery
4,Configuring Tomcat to Use SSL,[java],configuring tomcat use ssl


In [47]:
## split dataset
#dtf_train, dtf_test = model_selection.train_test_split(result, test_size=0.3)
dtf_train = train
dtf_test = validation
## get target
y_train = dtf_train["tags"].values
y_test = dtf_test["tags"].values

In [48]:
corpus = dtf_train["text_clean"]

## create list of lists of unigrams
lst_corpus = []
for string in corpus:
   lst_words = string.split()
   lst_grams = [" ".join(lst_words[i:i+1]) 
               for i in range(0, len(lst_words), 1)]
   lst_corpus.append(lst_grams)

## detect bigrams and trigrams
bigrams_detector = gensim.models.phrases.Phrases(lst_corpus, 
                 delimiter=" ".encode(), min_count=5, threshold=10)
bigrams_detector = gensim.models.phrases.Phraser(bigrams_detector)
trigrams_detector = gensim.models.phrases.Phrases(bigrams_detector[lst_corpus], 
            delimiter=" ".encode(), min_count=5, threshold=10)
trigrams_detector = gensim.models.phrases.Phraser(trigrams_detector)

In [49]:
## tokenize text
tokenizer = kprocessing.text.Tokenizer(lower=True, split=' ', 
                     oov_token="NaN", 
                     filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(lst_corpus)
dic_vocabulary = tokenizer.word_index
## create sequence
lst_text2seq= tokenizer.texts_to_sequences(lst_corpus)
## padding sequence
X_train = kprocessing.sequence.pad_sequences(lst_text2seq, 
                    maxlen=15, padding="post", truncating="post")

In [50]:
i = 0

## list of text: ["I like this", ...]
len_txt = len(dtf_train["text_clean"].iloc[i].split())
print("from: ", dtf_train["text_clean"].iloc[i], "| len:", len_txt)

## sequence of token ids: [[1, 2, 3], ...]
len_tokens = len(X_train[i])
print("to: ", X_train[i], "| len:", len(X_train[i]))

## vocabulary: {"I":1, "like":2, "this":3, ...}
print("check: ", dtf_train["text_clean"].iloc[i].split()[0], 
      " -- idx in vocabulary -->", 
      dic_vocabulary[dtf_train["text_clean"].iloc[i].split()[0]])

print("vocabulary: ", dict(list(dic_vocabulary.items())[0:5]), "... (padding element, 0)")


from:  draw stacked dotplot r | len: 4
to:  [  572  2614 11675   101     0     0     0     0     0     0     0     0
     0     0     0] | len: 15
check:  draw  -- idx in vocabulary --> 572
vocabulary:  {'NaN': 1, 'using': 2, 'c': 3, 'file': 4, 'java': 5} ... (padding element, 0)


In [51]:
corpus = dtf_test["text_clean"]

## create list of n-grams
lst_corpus = []
for string in corpus:
    lst_words = string.split()
    lst_grams = [" ".join(lst_words[i:i+1]) for i in range(0, 
                 len(lst_words), 1)]
    lst_corpus.append(lst_grams)
    
## detect common bigrams and trigrams using the fitted detectors
lst_corpus = list(bigrams_detector[lst_corpus])
lst_corpus = list(trigrams_detector[lst_corpus])
## text to sequence with the fitted tokenizer
lst_text2seq = tokenizer.texts_to_sequences(lst_corpus)

## padding sequence
X_val = kprocessing.sequence.pad_sequences(lst_text2seq, maxlen=15,
             padding="post", truncating="post")

In [52]:
## start the matrix (length of vocabulary x vector size) with all 0s
embeddings = np.zeros((len(dic_vocabulary)+1, 300))
for word,idx in dic_vocabulary.items():
    ## update the row with vector
    try:
        embeddings[idx] =  nlp[word]
    ## if word not in model then skip and the row stays all 0s
    except:
        pass

In [53]:
word = "data"
print("dic[word]:", dic_vocabulary[word], "|idx")
print("embeddings[idx]:", embeddings[dic_vocabulary[word]].shape, 
      "|vector")

dic[word]: 15 |idx
embeddings[idx]: (300,) |vector


In [54]:
print(y_train)

[list(['r']) list(['php', 'mysql']) list(['c#']) ...
 list(['python', 'datetime', 'pandas']) list(['javascript', 'jquery'])
 list(['java', 'list', 'generics'])]


In [55]:
import itertools
temp = []
for i in y_train:
  for j in i:
    temp.append(j)
unique_y_train = set(temp)
y_train_dic = {word:index for index, word in enumerate(unique_y_train)}
y_train_onehot = []
for i in range(len(y_train)):
  y_train_onehot.append([0] * 100)
  for j in y_train[i]:
    y_train_onehot[i][y_train_dic[j]] = 1
y_train_onehot = np.array(y_train_onehot)

In [56]:
print(y_train[:10])

[list(['r']) list(['php', 'mysql']) list(['c#'])
 list(['javascript', 'jquery']) list(['java']) list(['ruby-on-rails'])
 list(['ruby', 'ruby-on-rails-3', 'json']) list(['ruby'])
 list(['java', 'spring', 'spring-mvc']) list(['php', 'codeigniter'])]


In [57]:
x = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)

In [58]:
in_dim = (x.shape[1], x.shape[2])
out_dim = y_train_onehot.shape[1]

In [59]:
from keras.models import Sequential
from keras.layers import Dense, LSTM
model = Sequential()
model.add(LSTM(64, input_shape=in_dim, activation="relu"))
model.add(Dense(out_dim))
model.compile(loss="mse", optimizer="adam")
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 64)                16896     
_________________________________________________________________
dense (Dense)                (None, 100)               6500      
Total params: 23,396
Trainable params: 23,396
Non-trainable params: 0
_________________________________________________________________


In [60]:
model.fit(x, y_train_onehot, epochs=10, batch_size=12, verbose=0)

<keras.callbacks.History at 0x7fa63cf6f190>

In [61]:
import itertools
temp = []
for i in y_test:
  for j in i:
    temp.append(j)
unique_y_test = set(temp)
y_test_dic = {word:index for index, word in enumerate(unique_y_test)}
y_test_onehot = []
for i in range(len(y_test)):
  y_test_onehot.append([0] * 100)
  for j in y_test[i]:
    y_test_onehot[i][y_test_dic[j]] = 1
y_test_onehot = np.array(y_test_onehot)

In [62]:
from sklearn.metrics import mean_squared_error

xtest = X_val.reshape(X_val.shape[0], X_val.shape[1], 1)
ypred = model.predict(xtest)
print("y1 MSE:%.4f" % mean_squared_error(y_test_onehot[:,0], ypred[:,0]))
print("y2 MSE:%.4f" % mean_squared_error(y_test_onehot[:,1], ypred[:,1]))

y1 MSE:0.0173
y2 MSE:0.0197


In [63]:
max_3 = sorted(ypred[1],reverse=True)[:4]
for i in max_3:
  print(list(ypred[1]).index(i))
max_3 = sorted(y_test_onehot[1],reverse=True)[:4]
x = y_test_onehot[1][:]
for i in max_3:
  print(list(x).index(i))
  x = np.delete(x, list(x).index(i))


47
60
17
71
47
0
0
0


In [65]:
y_test_f = []
y_predict = []
for index in range(len(y_test_onehot)):
  y_test_f.append([i for i in range(len(y_test_onehot[index])) if y_test_onehot[index][i] == 1])
  
  max_3 = sorted(ypred[index],reverse=True)[:len(y_test_f[index])]
  y_elem = []
  for i in max_3:
    y_elem.append(list(ypred[index]).index(i))
  y_predict.append((sorted(y_elem)))

In [66]:
index = 0
y_test_f.append([i for i in range(len(y_test_onehot[index])) if y_test_onehot[index][i] == 1])

max_3 = sorted(ypred[index],reverse=True)[:len(y_test_f[0])]
for i in max_3:
  print(list(ypred[index]).index(i))

71
17


In [67]:
print(y_test_f)
print(y_predict)

[[37, 64], [47], [12, 88], [54, 88], [12, 86, 88], [1, 38, 73, 74], [17], [0, 32], [47, 49, 60], [46, 71], [10, 17, 21], [13, 17], [64], [71], [13, 17], [13, 17], [16, 32], [0, 94], [71, 84], [12, 88], [5, 47, 64], [16, 32], [16, 17, 29], [47, 84], [83], [71], [0], [42, 71], [12], [71], [43, 47, 60], [71, 84], [74], [10, 17, 49], [11, 41], [64], [62, 83], [37, 71], [17, 29], [47, 49], [64], [1, 38, 91], [64], [10, 17, 29], [17, 59, 92], [47], [36, 64], [71], [60, 64, 70], [41], [49, 64], [71], [10, 17, 90], [64], [71, 96], [0, 32], [32], [74], [50, 71], [12], [12, 51], [14, 47, 60], [32, 84], [17], [3, 37, 46, 71], [74, 79], [38, 52], [74], [47, 60], [13, 17, 29], [15, 32], [47, 49], [14, 47, 49, 97], [12, 20], [48, 71], [3, 64], [3, 42, 47], [17, 80, 90], [71], [71], [32, 81, 98], [71, 84], [64], [43, 47, 60, 64], [83], [14, 47, 49, 60, 82], [71, 75], [0, 3, 32], [71], [71], [57, 64], [52, 53, 93], [0, 32], [32, 34], [71], [47, 74], [13, 17, 85], [17, 66], [47], [64, 70], [74], [10, 1

# Evaluation

In [68]:
y_predict_onehot = []
for i in y_predict:
  y_predict_onehot.append([0] * 100)
  for j in i:
    y_predict_onehot[len(y_predict_onehot)-1][j] = 1
y_predict_onehot = np.array(y_predict_onehot)

In [69]:
print(y_predict_onehot)
print(y_test_onehot)


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]]


In [70]:
from sklearn.preprocessing import MultiLabelBinarizer
print(accuracy_score(y_test_onehot, y_predict_onehot))
print(f1_score(y_test_onehot, y_predict_onehot, average='weighted'))
print(average_precision_score(y_test_onehot, y_predict_onehot))

0.12183333333333334
0.18388500399574287
0.024973752166611428
