In [1]:
import pandas as pd


In [2]:
# Load the train and test splits from the local data directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train_data.csv", "data/test_data.csv")
)

In [3]:
# Clean the raw keyword strings and derive a per-row keyword count.
# Both frames are modified in place.
for df in [train_df, test_df]:
    # Missing keywords become "" so every row holds a plain string.
    df["keywords"] = df["keywords"].fillna("").astype(str)
    # Count the elements of each split list with the vectorized `.str.len()`.
    # `Series.agg(len)` only produced per-row lengths through an element-wise
    # fallback that recent pandas releases deprecate; without that fallback it
    # would return a single scalar (the number of rows) for the whole column.
    df["keywords_count"] = df["keywords"].str.split(", ").str.len()
    # "".split(", ") gives [""] (length 1), so rows without keywords are reset to 0.
    df.loc[df["keywords"] == "", "keywords_count"] = 0

In [4]:
# Stack train on top of test and renumber the rows 0..n-1 so index labels are unique.
all_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [5]:
# Lowercase everything and turn hyphens into plain spaces so that spelling
# variants such as "Meta-Learning" and "meta learning" collapse together.
# regex=False keeps "-" a literal regardless of the pandas default.
all_df["keywords"] = all_df["keywords"].str.lower().str.replace("-", " ", regex=False)

# Plural forms that should be folded into their singular.
plural_words = [
    "models", "networks", "embeddings", "graphs", "gans", "rnns", "parameters",
    "functions", "representations", "methods", "images", "tests", "algorithms", "names",
    "records", "attributes", "coders", "recommendations", "orders", "gradients", "tasks",
    "machines", "operations", "examples"
]

# Drop the trailing "s" only when the plural ends a word. There is deliberately
# no leading boundary, so suffixed forms ("autoencoders", "hypergraphs") are
# still singularized. A plain substring replace also rewrote mid-word hits,
# e.g. "graphsage" -> "graphage" and "namespace" -> "namepace".
plural_pattern = "(" + "|".join(word[:-1] for word in plural_words) + r")s\b"

# Rows without keywords contribute nothing to the keyword list.
non_empty_keywords = all_df.loc[all_df["keywords"].str.len() > 0, "keywords"]
singular_keywords = non_empty_keywords.str.replace(plural_pattern, r"\1", regex=True)

# Flat list of every normalized keyword across train and test, in row order.
# Note: all_df["keywords"] itself keeps the plural forms; only this list is singularized.
keyword_list = [
    keyword
    for keywords in singular_keywords
    for keyword in keywords.split(", ")
]

In [6]:
import collections 
from pprint import pprint
# Tally how often each normalized keyword occurs across train and test.
counter = collections.Counter()
counter.update(keyword_list)

In [7]:
# Show the 20 most frequent keywords as (keyword, count) pairs.
pprint(counter.most_common(n=20))

[('deep learning', 1129),
 ('reinforcement learning', 922),
 ('representation learning', 420),
 ('graph neural network', 350),
 ('neural network', 332),
 ('generative model', 297),
 ('meta learning', 283),
 ('generalization', 240),
 ('unsupervised learning', 228),
 ('robustness', 218),
 ('generative adversarial network', 216),
 ('gan', 213),
 ('optimization', 207),
 ('natural language processing', 206),
 ('transfer learning', 202),
 ('self supervised learning', 194),
 ('deep reinforcement learning', 187),
 ('interpretability', 184),
 ('adversarial example', 182),
 ('computer vision', 176)]


In [8]:
# Distribution of "number of keywords per row" over train and test combined.
keywords_len_counter = collections.Counter(all_df["keywords_count"].tolist())

In [9]:
# Show the 20 most common keyword counts as (keywords_per_row, n_rows) pairs.
pprint(keywords_len_counter.most_common(n=20))

[(3, 3165),
 (4, 2675),
 (5, 1443),
 (2, 1311),
 (0, 1255),
 (6, 633),
 (7, 321),
 (1, 252),
 (8, 148),
 (9, 80),
 (11, 30),
 (10, 28),
 (12, 9),
 (13, 6),
 (15, 2),
 (21, 2),
 (14, 2),
 (18, 2),
 (24, 1),
 (22, 1)]


In [10]:
from lilac.features.nlp.word_vectorizers.w2v_vectorizer import W2VVectorizer
from lilac.features.nlp.text_vectorizers.word_vector_based_vectorizer import WordVectorBasedVectorizer


In [11]:
# Fit a word2vec-style vectorizer on the comma-separated keyword strings.
# NOTE(review): the positional arguments are presumably the vector size (10, which
# matches the 10-component output recorded below) and a random seed (42) — confirm
# against the W2VVectorizer signature.
word_vectorizer = W2VVectorizer(10, 42, sep=", ")
word_vectorizer.fit(all_df["keywords"])

# Sanity check: look up the vector for a single frequent keyword.
word_vectorizer.transform("deep learning")

array([ 0.23905903,  0.44110057,  0.04697207,  1.0514789 ,  1.3433418 ,
       -0.31591403,  0.30191618, -1.2359105 , -1.5443531 ,  0.10688017],
      dtype=float32)

In [14]:
# Build one vector per row by aggregating the row's keyword vectors with "mean".
# NOTE(review): 100 is presumably the embedding dimension — confirm against the
# WordVectorBasedVectorizer signature.
text_vectorizer = WordVectorBasedVectorizer(
    "w2v",
    100,
    how_to_aggregate="mean",
    sep=", ",
)
vecs = text_vectorizer.fit_transform(all_df["keywords"])

Extracting required params in WordVectorizerFactory.


array([[ 0.04176531, -0.08907208,  0.01203408, ..., -0.07072501,
         0.00845501, -0.15382209],
       [ 0.0608058 , -0.12903522,  0.01032884, ..., -0.08803467,
         0.01297813, -0.19574483],
       [ 0.00524634, -0.00145287, -0.00056168, ..., -0.00181542,
         0.00522755,  0.00588716],
       ...,
       [ 0.02250574, -0.05659229,  0.01666991, ..., -0.04871108,
         0.01210373, -0.08806672],
       [ 0.03838874, -0.09399089,  0.01279359, ..., -0.06652151,
         0.01237313, -0.15337162],
       [ 0.0476392 , -0.10085097,  0.01195502, ..., -0.08317403,
         0.01206388, -0.16866226]])

In [56]:
# Inspect the training keywords. The lowercasing and hyphen replacement were
# applied to all_df only, so train_df still holds the original casing.
train_df.loc[:, "keywords"]

0       generative, hierarchical, unsupervised, semisu...
1           NLU, word embeddings, representation learning
2                                                        
3       generative adversarial networks, differential ...
4       Generative Models, Latent representations, Pre...
                              ...                        
4969    Neural Processes, Deep Sets, Translation Equiv...
4970                margin, homogeneous, gradient descent
4971    adversarial examples, adversarial training, pr...
4972    Question Answering, Multi-Hop QA, Deep Learnin...
4973                                   federated learning
Name: keywords, Length: 4974, dtype: object