In [1]:
# import all necessary libraries
import json
import pandas as pd
import numpy as np

from sklearn.datasets import make_classification
from gensim.parsing import preprocessing
from gensim.parsing.preprocessing import strip_tags, strip_punctuation,strip_numeric,remove_stopwords, stem_text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import itertools
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
df_filter = pd.read_excel('/Users/itoshiki/Documents/nlp_lab/general/filter.xlsx')

In [3]:
data = df_filter['Radiology text']

In [4]:
def clean_txt(txt):
    new_txt = txt.replace('\n',' ')
    CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation,remove_stopwords, stem_text]
    words = preprocessing.preprocess_string(txt.lower(), CUSTOM_FILTERS)
    return words

In [5]:
new_data = list()

In [6]:
for d in data:
    new_l = clean_txt(str(d))
    if new_l:
        new_data.append(new_l)

In [7]:
tagged_tr = [TaggedDocument(words=_d, tags=[str(i)]) for i, _d in enumerate(new_data)]

In [16]:
model = Doc2Vec(
    dm=0, 
    vector_size=300, 
    negative=5, 
    min_count=1, 
    alpha=0.065, 
    min_alpha=0.065
)

In [17]:
model.build_vocab(tagged_tr)

In [18]:
epochs = range(30)
for epoch in epochs:
    print(f'Epoch {epoch+1}')
    model.train(tagged_tr,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
     
model.save('doc2vec_trained.model')

Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30


In [2]:
def cpt_ext(txt):
    try:
        splited_list = txt.lower().split('\n\n')
        new_txt = splited_list[0]+" "+splited_list[1]
        new_txt = new_txt.replace('\n', ' ')
        return new_txt
    except:
        return 'N/A'


In [3]:
def clean_txt(txt):
    CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation,remove_stopwords, stem_text]
    words = preprocessing.preprocess_string(txt.lower(), CUSTOM_FILTERS)
    if not words:
        return 'N/A'
    return words


In [4]:
# data cleaning for CPT
def df_clean_CPT(df_filter):
    # general cleaning empty entry
    df_return = df_filter.fillna('N/A')
    df_return = df_return[df_return['Radiology text']!='N/A']
    
    # specific cleaning empty entry in CPT_text
    # empty entries mean failed convertion during the extraction process
    df_return['CPT_text'] = df_return['Radiology text'].apply(cpt_ext)
    df_return = df_return[df_return['CPT_text']!='N/A']
    # transferring words to sentences
    df_return['CPT_text'] = df_return['CPT_text'].apply(clean_txt)
    df_return = df_return[df_return['CPT_text']!='N/A']
    return df_return


In [5]:
def load_data(filepath):
    # load data as dataframe
    df_filter = pd.read_excel(filepath)
    # filter all data without any empty data
    df_filter = df_filter.fillna('N/A')
    df_filter = df_filter[df_filter['Radiology text']!='N/A']
    df_filter = df_clean_CPT(df_filter)
    return df_filter


In [6]:
train_df = load_data('/Users/itoshiki/Documents/nlp/data/train.xlsx')


In [11]:
tagged_tr = [TaggedDocument(words=_d, tags=[str(i)]) for i, _d in enumerate(train_df['CPT_text'])]

In [13]:
model = Doc2Vec(vector_size=300,
                window=5, 
                alpha=.025, 
                min_alpha=0.00025, 
                min_count=2, 
                dm=1, 
                workers=4)

In [14]:
model.build_vocab(tagged_tr)

In [15]:
model.corpus_count

313400

In [16]:
model.epochs

5

In [17]:
model

<gensim.models.doc2vec.Doc2Vec at 0x1384c8790>

In [None]:
epochs = range(100)
for epoch in epochs:
    print(f'Epoch {epoch+1}')
    model.train(tagged_tr,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.00025
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
     
model.save('math_lectures.model')