In [1]:
# Mount Google Drive at /content/drive so files stored on Drive can be
# accessed through the local filesystem (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install texthero

Collecting texthero
  Downloading texthero-1.1.0-py3-none-any.whl (24 kB)
Collecting nltk>=3.3
  Downloading nltk-3.6.2-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 11.2 MB/s 
Collecting unidecode>=1.1.1
  Downloading Unidecode-1.2.0-py2.py3-none-any.whl (241 kB)
[K     |████████████████████████████████| 241 kB 72.4 MB/s 
Installing collected packages: unidecode, nltk, texthero
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.6.2 texthero-1.1.0 unidecode-1.2.0


In [3]:
import os

import numpy as np
import pandas as pd

import dill
from sklearn.feature_extraction.text import TfidfVectorizer
import gc
from tqdm import tqdm
import time

import gensim
from gensim.models import Word2Vec
import hashlib
import nltk

import texthero as hero
from texthero import preprocessing

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Google Drive folder that holds the competition input files; the cleaned
# dataset is read from here and the generated artifacts are written back here.
input_path = '/content/drive/MyDrive/機械学習/Competitions/Signate/医学論文の自動仕分けチャレンジ/input'

In [6]:
# Load the dataset that already contains the cleaned text columns
# (clean_title / clean_abstract) and preview the first rows.
all_df = pd.read_feather(os.path.join(input_path,'all_clean_df.feather'))
all_df.head()

Unnamed: 0,id,title,abstract,judgement,clean_title,clean_abstract
0,0,One-year age changes in MRI brain volumes in o...,Longitudinal studies indicate that declines in...,0.0,one year age changes mri brain volumes older...,longitudinal studies indicate declines cogni...
1,1,Supportive CSF biomarker evidence to enhance t...,The present study was undertaken to validate t...,0.0,supportive csf biomarker evidence enhance na...,present study undertaken validate measurem...
2,2,Occurrence of basal ganglia germ cell tumors w...,Objective: To report a case series in which ba...,0.0,occurrence basal ganglia germ cell tumors wit...,objective report case series basal ganglia...
3,3,New developments in diagnosis and therapy of C...,The etiology and pathogenesis of idiopathic ch...,0.0,new developments diagnosis therapy crohn d...,etiology pathogenesis idiopathic chronic in...
4,4,Prolonged shedding of SARS-CoV-2 in an elderly...,,0.0,prolonged shedding sars cov elderly liver t...,


In [7]:
# Vectorize the cleaned abstracts with TF-IDF; min_df=20 drops terms that
# occur in fewer than 20 documents.
# NOTE(review): abstract_tfidf is not referenced again in the visible cells -
# confirm whether it is still needed.
vec_tfidf = TfidfVectorizer(min_df=20)
abstract_tfidf = vec_tfidf.fit_transform(all_df['clean_abstract'].values)

In [10]:
start = time.time()

print("Parsing sentences from training set ...")

# Turn every cleaned abstract into a list of tokens; each list is one
# training "sentence" for Word2Vec.
sentences = []
for review in tqdm(all_df['clean_abstract']):
  # Rows without text (None/NaN) cannot be tokenized. They are skipped
  # explicitly instead of through a bare `except`, which would also hide
  # genuine errors and swallow KeyboardInterrupt.
  if not isinstance(review, str):
    continue
  # Split on single spaces and drop the empty strings left by repeated spaces.
  sentences.append([token for token in review.split(" ") if token])

# Word2Vec hyperparameters.
num_features = 200    # dimensionality of the word vectors
min_word_count = 20   # words with a lower total frequency are ignored
num_workers = 1       # single worker thread
context = 10          # context window size
downsampling = 1e-3   # threshold for down-sampling frequent words
seed = 42

# md5-based replacement for Python's built-in hash(), which is randomized per
# process for strings; presumably used (together with the fixed seed and the
# single worker) to make training repeatable across sessions.
hashfxn = lambda x: int(hashlib.md5(str(x).encode()).hexdigest(),16)

print("Training Word2Vec model...")
# Train a skip-gram model (sg=1) with negative sampling (hs=0, negative=10).
# NOTE: `iter` and `size` are the gensim 3.x argument names; gensim 4 renamed
# them to `epochs` and `vector_size`.
model = Word2Vec(sentences,workers=num_workers,hs=0,sg=1,negative=10,iter=5,size=num_features,min_count=min_word_count,hashfxn=hashfxn,window=context,sample=downsampling,seed=seed)
# NOTE(review): model_name is not used by the visible cells (the model is
# later saved under a fixed file name) - confirm whether it is still needed.
model_name = str(num_features) + "features_" + str(min_word_count) + "minwords_" + str(context) + "context_len2alldata"
# Replace the word vectors with their L2-normalized versions in place.
model.init_sims(replace=True)
endmodeltime = time.time()

# Elapsed seconds for tokenization plus training.
print("time: ",endmodeltime-start)

Parsing sentences from training set ...


100%|██████████| 67979/67979 [00:02<00:00, 32552.68it/s]


Training Word2Vec model...
time:  1095.9276459217072


In [14]:
# Register tqdm with pandas; this provides the `progress_apply` method used below.
tqdm.pandas()

def des_to_mean_vec(text, wv=None):
  """Return the mean word vector of a space-separated text.

  Args:
    text: String of tokens separated by spaces. Any non-string value
      (e.g. None or NaN) is treated as missing text.
    wv: Optional mapping-like object supporting `word in wv` and `wv[word]`.
      When omitted, the vectors of the notebook-level Word2Vec `model`
      (`model.wv`) are used.

  Returns:
    The element-wise mean of the vectors of all tokens found in `wv`, or
    `np.nan` when `text` is not a string or none of its tokens is known.
  """
  if not isinstance(text, str):
    return np.nan
  if wv is None:
    wv = model.wv
  # `word in wv` is a hash lookup, whereas `word in model.wv.index2word`
  # scanned the whole vocabulary list once per token.
  vectors = [wv[word] for word in text.strip().split(' ') if word and word in wv]
  if not vectors:
    # Nothing to average; returning NaN directly avoids NumPy's
    # "Mean of empty slice" RuntimeWarning.
    return np.nan
  return np.mean(vectors, axis=0)

# One mean embedding (or NaN) per cleaned abstract, computed with a progress bar.
df = all_df['clean_abstract'].progress_apply(des_to_mean_vec)
df.head()


Mean of empty slice.

100%|██████████| 67979/67979 [08:24<00:00, 134.76it/s]


0    [-0.023012972, -0.04259936, 0.011505857, 0.006...
1    [-0.02108279, -0.033285636, -0.0022294275, 0.0...
2    [0.02161513, -0.018908055, -0.012818932, 0.016...
3    [0.024896286, 0.006013228, -0.013790725, -0.00...
4                                                  NaN
Name: clean_abstract, dtype: object

In [15]:
# Expand the Series of mean vectors into one column per vector dimension.
# The expansion is done in chunks of rows; the chunk boundaries are derived
# from len(df) (a hard-coded upper bound would silently drop rows of a larger
# dataset), and the pieces are concatenated once at the end instead of
# re-copying a growing frame on every iteration.
chunk_size = 1000
chunks = [
  df.iloc[begin:begin + chunk_size].apply(pd.Series)
  for begin in tqdm(range(0, len(df), chunk_size))
]
# pd.concat raises on an empty list, so fall back to an empty frame.
abstract_df = pd.concat(chunks) if chunks else pd.DataFrame()
# Prefix the integer column labels so the feature names are self-describing strings.
abstract_df.columns = ['w2v_abstract_' + str(col) for col in abstract_df.columns]
abstract_df.head()

100%|██████████| 68/68 [00:17<00:00,  3.95it/s]


Unnamed: 0,w2v_abstract_0,w2v_abstract_1,w2v_abstract_2,w2v_abstract_3,w2v_abstract_4,w2v_abstract_5,w2v_abstract_6,w2v_abstract_7,w2v_abstract_8,w2v_abstract_9,w2v_abstract_10,w2v_abstract_11,w2v_abstract_12,w2v_abstract_13,w2v_abstract_14,w2v_abstract_15,w2v_abstract_16,w2v_abstract_17,w2v_abstract_18,w2v_abstract_19,w2v_abstract_20,w2v_abstract_21,w2v_abstract_22,w2v_abstract_23,w2v_abstract_24,w2v_abstract_25,w2v_abstract_26,w2v_abstract_27,w2v_abstract_28,w2v_abstract_29,w2v_abstract_30,w2v_abstract_31,w2v_abstract_32,w2v_abstract_33,w2v_abstract_34,w2v_abstract_35,w2v_abstract_36,w2v_abstract_37,w2v_abstract_38,w2v_abstract_39,...,w2v_abstract_160,w2v_abstract_161,w2v_abstract_162,w2v_abstract_163,w2v_abstract_164,w2v_abstract_165,w2v_abstract_166,w2v_abstract_167,w2v_abstract_168,w2v_abstract_169,w2v_abstract_170,w2v_abstract_171,w2v_abstract_172,w2v_abstract_173,w2v_abstract_174,w2v_abstract_175,w2v_abstract_176,w2v_abstract_177,w2v_abstract_178,w2v_abstract_179,w2v_abstract_180,w2v_abstract_181,w2v_abstract_182,w2v_abstract_183,w2v_abstract_184,w2v_abstract_185,w2v_abstract_186,w2v_abstract_187,w2v_abstract_188,w2v_abstract_189,w2v_abstract_190,w2v_abstract_191,w2v_abstract_192,w2v_abstract_193,w2v_abstract_194,w2v_abstract_195,w2v_abstract_196,w2v_abstract_197,w2v_abstract_198,w2v_abstract_199
0,-0.023013,-0.042599,0.011506,0.006895,-0.00941,0.031182,-0.041036,-0.005212,-0.046214,0.019414,-0.025367,-0.0446,-0.008573,-0.012298,0.029186,0.061301,-0.082132,-0.026352,-0.054699,0.012613,-0.012189,0.01049,-0.003941,-0.025139,-0.008353,0.00864,0.041287,-0.027042,-0.017674,-0.065142,-0.014823,0.05236,-0.033039,0.054076,0.023901,-0.014773,0.028042,-0.11995,0.083028,0.006225,...,-0.017988,0.00095,-0.029232,-0.004621,0.018284,-0.020836,0.021267,-0.012337,-0.000791,-0.007993,-0.081959,-0.024361,0.004037,0.031299,0.009265,-0.053305,0.068005,0.042372,-0.028494,-0.04857,0.015736,0.003428,-0.033866,-0.011065,0.004296,0.01371,0.109741,-0.044698,-0.020394,0.042258,0.042389,0.002241,0.05108,-0.04521,0.075726,-0.0163,-0.007841,-0.066345,-0.020553,-0.010853
1,-0.021083,-0.033286,-0.002229,0.023917,0.017481,-0.001153,-0.050747,-0.011739,-0.059993,0.008844,-0.020285,-0.047472,-0.035521,-0.012243,0.004243,0.037987,-0.028822,-0.064587,-0.045144,0.012167,0.00659,-0.01651,-0.015556,-0.025427,0.002086,0.018336,0.040445,-0.054067,0.023478,-0.049787,-0.035006,0.015842,-0.026053,0.02154,0.028494,0.020125,0.03397,-0.083008,0.09122,-0.001653,...,-0.036936,-0.023023,-0.010904,-0.001715,-0.010857,-0.010272,-0.017272,0.008143,0.01127,-0.021379,-0.100166,-0.053968,0.038477,0.011331,0.035322,-0.039755,0.079552,0.029395,-0.027743,-0.080629,-0.01856,0.033701,-0.00895,-0.011527,-0.002784,0.02061,0.085133,-0.017299,0.001871,0.045351,0.039979,0.013887,0.011075,-0.030538,0.050568,-0.009624,0.005508,-0.079094,-0.022793,-0.029971
2,0.021615,-0.018908,-0.012819,0.016717,0.002814,0.010401,-0.027952,-0.023355,-0.028507,0.019422,-0.010451,-0.070652,-0.023796,0.008722,0.013827,0.074003,-0.010357,-0.065145,-0.06436,0.024533,-0.007694,0.022154,-0.025458,-0.052745,0.02017,0.018314,0.050832,-0.03046,-0.029347,-0.023778,-0.009353,0.037271,-0.05937,0.033474,-0.000862,-0.018556,0.046893,-0.112481,0.076221,-0.000324,...,-0.023743,-0.041498,-0.032681,-0.0216,0.048763,-0.042663,0.006216,0.007775,0.007423,-0.02614,-0.071342,-0.025271,0.000354,0.031934,-0.007404,-0.044989,0.040241,0.008072,-0.03782,-0.06559,0.008747,0.000132,-0.038268,0.022523,-0.009754,0.026615,0.101143,-0.045741,-0.016194,0.048008,0.025118,0.024208,0.037358,-0.028075,0.067678,-0.019916,-0.013342,-0.094009,-0.016633,-0.016735
3,0.024896,0.006013,-0.013791,-0.005149,0.042205,-0.023721,-0.024882,-0.031942,-0.041438,-0.016374,0.001719,-0.033245,-0.022288,0.012296,-0.004988,0.051555,-0.027128,-0.056579,-0.037614,-0.01043,0.015731,0.011539,-0.043867,-0.080254,0.033811,-0.003464,0.033126,-0.010617,0.005938,-0.013493,-0.030237,-0.008737,-0.0449,0.029685,0.009862,-0.054679,0.034529,-0.093524,0.064696,-0.005196,...,-0.020288,-0.030634,-0.007441,-0.024897,0.048567,-0.015341,-0.010957,0.003398,0.027266,-0.025162,-0.095748,-0.007347,0.017355,0.022729,-0.041823,-0.048197,0.027949,0.021046,-0.033326,-0.100764,-0.005562,0.056701,-0.02469,0.016001,-0.012373,0.018511,0.087387,-0.023725,0.003499,0.044503,0.039774,0.035051,0.010694,-0.008448,0.05645,0.007246,-0.013982,-0.11029,1.6e-05,-0.000846
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [16]:
# Sanity check: (number of abstracts, number of embedding columns).
abstract_df.shape

(67979, 200)

In [17]:
# Save the embedding feature table as a Feather file in the input folder.
abstract_df.to_feather(os.path.join(input_path,'abstract_df.feather'))

In [18]:
# Serialize the trained Word2Vec model with dill into the input folder.
# The `with` block guarantees the file handle is flushed and closed even if
# serialization fails part-way.
# NOTE: dill/pickle files execute code when loaded; only load this file from
# a trusted location.
file = os.path.join(input_path,'w2v_model_abstract.dill')
with open(file,'wb') as model_file:
  dill.dump(model, model_file)