In [1]:
# Mount Google Drive at /content/drive so files stored on Drive can be read
# and written through the regular filesystem API.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
import os
import gc

import numpy as np
import pandas as pd

import dill
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

import random

In [7]:
# Number of folds. Not referenced in the visible cells -- presumably kept for
# cross-validation elsewhere (TODO confirm or remove).
n_fold = 5
# Single seed shared by set_seed() and by KMeans(random_state=...).
SEED = 42

def set_seed(seed=42):
  """Seed the global Python and NumPy random number generators.

  The seed is also written, as a string, to the ``PYTHONHASHSEED``
  environment variable.

  NOTE(review): Python reads ``PYTHONHASHSEED`` at interpreter start-up, so
  this assignment likely only influences processes launched afterwards, not
  the hashing of the already-running kernel -- confirm if hash determinism
  matters here.

  Args:
    seed: Integer seed applied to every generator. Defaults to 42.
  """
  np.random.seed(seed)
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)

# Apply the notebook-wide seed once, before any stochastic step runs.
set_seed(SEED)

In [9]:
# Directory on the mounted Google Drive that holds the competition files.
input_path = '/content/drive/MyDrive/機械学習/Competitions/Signate/医学論文の自動仕分けチャレンジ/input'

# Read the train and test CSVs from that directory, in that order.
train_df, test_df = (
    pd.read_csv(os.path.join(input_path, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

# Stack train on top of test and renumber the rows 0..n-1.
all_df = pd.concat([train_df, test_df], ignore_index=True)

# Sanity check: shapes of the train, test and combined frames.
print(*(frame.shape for frame in (train_df, test_df, all_df)))

all_df.head()

(27145, 4) (40834, 3) (67979, 4)


Unnamed: 0,id,title,abstract,judgement
0,0,One-year age changes in MRI brain volumes in o...,Longitudinal studies indicate that declines in...,0.0
1,1,Supportive CSF biomarker evidence to enhance t...,The present study was undertaken to validate t...,0.0
2,2,Occurrence of basal ganglia germ cell tumors w...,Objective: To report a case series in which ba...,0.0
3,3,New developments in diagnosis and therapy of C...,The etiology and pathogenesis of idiopathic ch...,0.0
4,4,Prolonged shedding of SARS-CoV-2 in an elderly...,,0.0


In [13]:
# train_df and test_df are NOT released here: the statements below were
# disabled by the author and remain disabled.
#   del train_df
#   del test_df
# Only a garbage-collection pass is run; its return value is the cell output.
gc.collect()

999

In [15]:
# Load the feather file with the cleaned text; the preview below shows it
# carries clean_title / clean_abstract next to the original columns.
all_clean_df = pd.read_feather(os.path.join(input_path,'all_clean_df.feather'))
all_clean_df.head()

Unnamed: 0,id,title,abstract,judgement,clean_title,clean_abstract
0,0,One-year age changes in MRI brain volumes in o...,Longitudinal studies indicate that declines in...,0.0,one year age changes mri brain volumes older...,longitudinal studies indicate declines cogni...
1,1,Supportive CSF biomarker evidence to enhance t...,The present study was undertaken to validate t...,0.0,supportive csf biomarker evidence enhance na...,present study undertaken validate measurem...
2,2,Occurrence of basal ganglia germ cell tumors w...,Objective: To report a case series in which ba...,0.0,occurrence basal ganglia germ cell tumors wit...,objective report case series basal ganglia...
3,3,New developments in diagnosis and therapy of C...,The etiology and pathogenesis of idiopathic ch...,0.0,new developments diagnosis therapy crohn d...,etiology pathogenesis idiopathic chronic in...
4,4,Prolonged shedding of SARS-CoV-2 in an elderly...,,0.0,prolonged shedding sars cov elderly liver t...,


In [18]:
# Vectorisation: TF-IDF over the cleaned abstracts. min_df=20 drops terms that
# appear in fewer than 20 documents. Missing abstracts are mapped to '' first,
# because TfidfVectorizer rejects NaN/None documents; an empty document simply
# becomes an all-zero row.
# NOTE: despite its name, abstract_df is the sparse TF-IDF matrix returned by
# fit_transform, not a DataFrame (name kept for compatibility with other cells).
vec_tfidf = TfidfVectorizer(min_df=20)
abstract_df = vec_tfidf.fit_transform(all_clean_df['clean_abstract'].fillna('').values)

# Fit k-means with 20 clusters. KMeans.fit_transform returns, per document, the
# distance to each cluster centre, so every row gets 20 dense features.
# NOTE(review): the default n_init of KMeans differs between scikit-learn
# versions, so the exact features may not reproduce across versions even with
# random_state fixed -- consider pinning n_init once the original version is known.
kmeans = KMeans(n_clusters=20,random_state=SEED)
kmeans_abstract_df = pd.DataFrame(kmeans.fit_transform(abstract_df)).add_prefix('kmeans_tfidf_abstract_')

# Attach the ids from the SAME frame the features were computed from
# (all_clean_df), with a fresh 0..n-1 index so the column-wise concat pairs
# row i of the ids with row i of the features. Taking ids from the separately
# loaded all_df would silently depend on both frames sharing row order and index.
kmeans_abstract_df = pd.concat(
    [all_clean_df['id'].reset_index(drop=True), kmeans_abstract_df],
    axis=1,
)

In [20]:
kmeans_abstract_df.head()

Unnamed: 0,id,kmeans_tfidf_abstract_0,kmeans_tfidf_abstract_1,kmeans_tfidf_abstract_2,kmeans_tfidf_abstract_3,kmeans_tfidf_abstract_4,kmeans_tfidf_abstract_5,kmeans_tfidf_abstract_6,kmeans_tfidf_abstract_7,kmeans_tfidf_abstract_8,kmeans_tfidf_abstract_9,kmeans_tfidf_abstract_10,kmeans_tfidf_abstract_11,kmeans_tfidf_abstract_12,kmeans_tfidf_abstract_13,kmeans_tfidf_abstract_14,kmeans_tfidf_abstract_15,kmeans_tfidf_abstract_16,kmeans_tfidf_abstract_17,kmeans_tfidf_abstract_18,kmeans_tfidf_abstract_19
0,0,1.386608,1.003365,1.389208,0.992143,1.054013,1.016412,1.048969,1.396283,1.115732,1.095984,1.017924,1.037273,1.010913,0.960975,1.059318,1.056373,1.158939,1.051931,1.005981,1.112097
1,1,1.365999,1.006034,1.404026,0.994194,1.08357,1.009034,1.042399,1.398144,1.099274,1.094133,1.016916,1.033461,1.027384,0.967347,0.864663,1.037493,1.156722,1.038672,0.984402,1.129536
2,2,1.411953,1.005403,1.370214,0.987388,1.064798,0.995703,1.02605,1.359641,1.120877,1.068689,0.998698,1.029837,0.977948,0.991881,1.066061,1.044355,1.140512,1.061001,1.013964,1.131192
3,3,1.391809,1.005049,1.371373,0.987164,1.077094,1.002286,1.036391,1.380212,1.112608,1.073215,0.941248,1.021423,1.002705,1.005494,1.062657,1.049682,1.131384,1.052393,1.005406,1.14027
4,4,1.0,0.219747,1.0,0.06837,0.457727,0.277664,0.35377,0.983037,0.529567,0.477838,0.259298,0.323757,0.293467,0.247502,0.414487,0.404264,0.609567,0.394354,0.243737,0.591165


In [21]:
# Write the id + k-means feature frame to a feather file inside input_path.
kmeans_abstract_df.to_feather(os.path.join(input_path,'kmeans_tfidf_abstract_df.feather'))