In [1]:
import os
import pickle
import re
from sys import getsizeof

import nltk
import pandas as pd
import progressbar
import torch
import torch.nn as nn
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import *

In [211]:
import numpy as np

## Load

In [2]:
def read_data():
    """Load the raw train and test review files.

    Both tab-separated files are read from the current working directory.

    Returns:
        tuple: ``(data_train, data_test)`` as pandas DataFrames, in that order.
    """
    file_names = ('drugsComTrain_raw.tsv', 'drugsComTest_raw.tsv')
    frames = [pd.read_csv(file_name, sep='\t') for file_name in file_names]
    return tuple(frames)

data_train,data_test=read_data()

In [3]:
def load_data():
    """Load four previously pickled objects from the current working directory.

    Reads ``cv_train.pkl``, ``cv_test.pkl``, ``y_train.pkl`` and ``y_test.pkl``.

    Returns:
        tuple: ``(cv_train, cv_test, y_train, y_test)``, i.e. the unpickled
        content of the four files in that order.

    Raises:
        OSError: if one of the files cannot be opened.
    """
    def _unpickle(path):
        # The context manager closes the handle even when unpickling fails.
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(path, 'rb') as fh:
            return pickle.load(fh)

    file_names = ('cv_train.pkl', 'cv_test.pkl', 'y_train.pkl', 'y_test.pkl')
    return tuple(_unpickle(file_name) for file_name in file_names)

In [4]:
cv_train,cv_test,y_train,y_test=load_data()

# Train Test dataframes

In [3]:
def df_train_test():
    """Build the cleaned train/test frames used for classification.

    Reads the module-level ``data_train`` and ``data_test`` frames. For each
    of them it keeps only the ``condition`` and ``review`` columns, drops rows
    with a missing value, replaces the HTML entity ``&#039;`` in the reviews
    with an apostrophe and removes rows whose condition contains ``</span>``.

    Returns:
        tuple: ``(train_df, test_df)``.
    """
    def _fix_apostrophes(text):
        return re.sub(r"&#039;", "'", text)

    def _clean(raw_frame):
        frame = raw_frame[['condition', 'review']].dropna()
        frame.review = frame.review.apply(_fix_apostrophes)
        has_markup = frame.condition.str.contains('</span>')
        return frame[~has_markup]

    return _clean(data_train), _clean(data_test)

In [4]:
train_df,test_df=df_train_test()

# Reduce number of classification items

In [5]:
def reduce_conditions(value):
    """Collapse rare conditions into a single ``'other'`` class.

    Works on the module-level ``train_df`` and ``test_df`` frames and adds a
    ``condcopy`` column to both of them:

    * a condition is kept when it occurs strictly more than ``value`` times
      in ``train_df``;
    * every other condition (including test conditions never seen in
      training) becomes ``'other'``.

    The number of distinct ``condcopy`` values and the share of ``'other'``
    rows are printed for both frames.

    Args:
        value: frequency threshold; conditions with a training count
            ``<= value`` are collapsed.

    Returns:
        None. ``train_df`` and ``test_df`` are modified in place.
    """
    counts = train_df['condition'].value_counts()
    frequent = set(counts.index[counts > value])

    def _collapse(conditions):
        # Series.where keeps the value where the mask is True and substitutes
        # 'other' elsewhere. Unlike chained indexing
        # (df['col'][mask] = ...), the result is assigned as a whole column,
        # so it also works when pandas copy-on-write is enabled.
        return conditions.where(conditions.isin(frequent), 'other')

    train_df['condcopy'] = _collapse(train_df['condition'])
    test_df['condcopy'] = _collapse(test_df['condition'])

    len_train = len(set(train_df.condcopy))
    len_test = len(set(test_df.condcopy))

    # Count the 'other' rows directly instead of indexing value_counts(),
    # which raises KeyError when no condition was collapsed.
    other_train = (train_df.condcopy == 'other').sum() / train_df.shape[0] * 100
    other_test = (test_df.condcopy == 'other').sum() / test_df.shape[0] * 100
    print('Nr conditions Train: ', len_train, '\nNr conditions Test: ', len_test)
    print('Percentage "other", Train: ', other_train, '%')
    print('Percentate "other", Test: ', other_test, '%')

In [6]:
reduce_conditions(50)

Nr conditions Train:  225 
Nr conditions Test:  225
Percentage "other", Train:  3.524182121405911 %
Percentate "other", Test:  3.505639097744361 %


In [7]:
len(set(train_df.condcopy))

225

# NN implementation using embedding layer

## Preprocess data

### Review to Words

In [186]:
def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once.

    The corpus is downloaded only when NLTK reports it as missing, and the
    resulting set is cached on the function object, so processing many
    reviews does not re-read the corpus or contact the download server.
    """
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        try:
            words = stopwords.words("english")
        except LookupError:
            # Corpus not installed yet: fetch it, then retry.
            nltk.download("stopwords", quiet=True)
            words = stopwords.words("english")
        cached = frozenset(words)
        _english_stopwords._cache = cached
    return cached


def review_to_words(review):
    """Turn a raw review string into a list of stemmed, stop-word-free tokens.

    Steps: lower-case the text, replace every character that is not an ASCII
    letter or digit with a space, split on whitespace, drop English stop
    words, and Porter-stem what is left.

    Args:
        review: the review text.

    Returns:
        list of str: the stemmed tokens, in their original order.
    """
    stop_words = _english_stopwords()  # set lookup instead of a list scan per word
    stemmer = PorterStemmer()  # one stemmer per review instead of one per word
    text = re.sub(r"[^a-zA-Z0-9]", " ", review.lower()) # Convert to lower case
    return [stemmer.stem(w) for w in text.split() if w not in stop_words]

### Preprocess data

In [184]:
!rm -r cache

In [185]:
cache_dir = os.path.join("./cache", "drugreview_analysis")  # where to store cache files
# exist_ok=True keeps this cell re-runnable: without it os.makedirs raises
# FileExistsError as soon as the directory is already there.
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

In [188]:
def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Convert each review to words; read from cache if available.

    Args:
        data_train: iterable of training review strings.
        data_test: iterable of test review strings.
        labels_train: labels belonging to ``data_train``.
        labels_test: labels belonging to ``data_test``.
        cache_dir: directory that holds the cache file.
        cache_file: file name of the cache inside ``cache_dir``; pass ``None``
            to neither read nor write a cache.

    Returns:
        tuple: ``(words_train, words_test, labels_train, labels_test)`` where
        the ``words_*`` entries are lists with one token list per review.

    Note:
        When a cache file is read successfully, the words AND the labels
        stored in it are returned; the arguments are not used in that case.
    """
    cache_path = None if cache_file is None else os.path.join(cache_dir, cache_file)

    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_path is not None:
        try:
            # NOTE: pickle can execute arbitrary code; only load caches that
            # were written by this notebook.
            with open(cache_path, "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except FileNotFoundError:
            pass  # no cache yet, but that's okay
        except Exception as exc:
            # Best effort: an unreadable cache is rebuilt below. Catching
            # Exception (not a bare except) lets Ctrl-C and SystemExit through.
            print("Ignoring unreadable cache file:", cache_file, "-", exc)
            cache_data = None

    # If cache is missing, then do the heavy lifting
    if cache_data is None:
        # Preprocess training and test data to obtain words for each review
        print('Training data:')
        words_train = [review_to_words(review)
                       for review in progressbar.progressbar(data_train)]

        print('Test data:')
        words_test = [review_to_words(review)
                      for review in progressbar.progressbar(data_test)]

        # Write to cache file for future runs
        if cache_path is not None:
            cache_data = dict(words_train=words_train, words_test=words_test,
                              labels_train=labels_train, labels_test=labels_test)
            os.makedirs(cache_dir, exist_ok=True)  # the directory may not exist yet
            with open(cache_path, "wb") as f:
                pickle.dump(cache_data, f)
            print("Wrote preprocessed data to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        words_train, words_test, labels_train, labels_test = (cache_data['words_train'],
                cache_data['words_test'], cache_data['labels_train'], cache_data['labels_test'])

    return words_train, words_test, labels_train, labels_test

In [289]:
train_X, test_X, train_y, test_y = preprocess_data(train_df.review, test_df.review, train_df.condcopy, test_df.condcopy)

Read preprocessed data from cache file: preprocessed_data.pkl


### Create Dictionary

In [190]:
from collections import Counter

In [279]:
def build_dict(data, vocab_size = 5000):
    """Construct and return a dictionary mapping each of the most frequently appearing words to a unique integer.

    Args:
        data: iterable of sentences, each sentence being a list of words.
        vocab_size: total vocabulary size, including the two reserved ids
            (0 and 1). At most ``vocab_size - 2`` words receive an id.

    Returns:
        dict: word -> integer id. The most frequent word gets id 2, the next
        one id 3, and so on. Words that did not make the cut are absent.
    """
    count = Counter()
    for sentence in data:
        count.update(sentence)

    # The -2 saves room for the 'infrequent' and 'nowords' labels. Clamp at 0:
    # a negative bound used as a slice end would keep almost every word
    # instead of none.
    n_words = max(vocab_size - 2, 0)

    # most_common(n) returns the n most frequent (word, count) pairs, most
    # frequent first, without sorting the whole vocabulary.
    return {word: idx + 2 for idx, (word, _) in enumerate(count.most_common(n_words))}

In [213]:
train_X[1];

In [293]:
word_dict=build_dict(train_X)

In [301]:
word_dict;

In [199]:
data_dir = 'pytorch_data' # The folder we will use for storing data
# exist_ok=True creates the folder only when it is missing, in one call
# (no separate exists-check that could race with another process).
os.makedirs(data_dir, exist_ok=True) # Make sure that the folder exists

In [296]:
with open(os.path.join(data_dir, 'word_dict.pkl'), "wb") as f:
    pickle.dump(word_dict, f)

### Convert to array of integers

In [283]:
# Number of tokens in every training review (used to choose the padding length).
lens = list(map(len, train_X))

In [284]:
max(lens)

942

In [285]:
def convert_and_pad(word_dict, sentence,pad=1000):
    """Encode one tokenised sentence as a fixed-length list of integer ids.

    Args:
        word_dict: mapping word -> integer id.
        sentence: list of words.
        pad: length of the returned id list; longer sentences are truncated,
            shorter ones are filled up with the 'no word' id.

    Returns:
        tuple: ``(ids, length)`` where ``ids`` has ``pad`` entries and
        ``length`` is the number of real (non-padding) positions.
    """
    NOWORD = 0 # id of the 'no word' (padding) category
    INFREQ = 1 # id shared by all words that are missing from word_dict

    kept = sentence[:pad]
    encoded = [word_dict.get(token, INFREQ) for token in kept]
    encoded.extend([NOWORD] * (pad - len(encoded)))

    return encoded, min(len(sentence), pad)

def convert_and_pad_data(word_dict, data,pad=1000):
    """Encode every sentence of ``data`` with ``convert_and_pad``.

    Args:
        word_dict: mapping word -> integer id.
        data: iterable of sentences (lists of words).
        pad: fixed length passed on to ``convert_and_pad``.

    Returns:
        tuple: two numpy arrays, the encoded sentences (one row per sentence)
        and the corresponding unpadded lengths.
    """
    encoded_pairs = [convert_and_pad(word_dict, sentence, pad) for sentence in data]
    rows = [ids for ids, _ in encoded_pairs]
    sizes = [size for _, size in encoded_pairs]

    return np.array(rows), np.array(sizes)

In [302]:
train_X=convert_and_pad_data(word_dict,train_X)
test_X=convert_and_pad_data(word_dict,test_X)

In [354]:
train_X_len=train_X[1]
test_X_len=test_X[1]

In [355]:
train_X=train_X[0]
test_X=test_X[0]

## Encoding of classes

In [338]:
# Sort the labels so the class order (and therefore the label -> id encoding
# derived from it) is identical on every run; plain set iteration order of
# strings changes between Python processes.
classes = sorted(set(train_y))

In [341]:
len(classes)

225

In [343]:
type(train_y.values)

numpy.ndarray

In [345]:
def encode_y(y):
    """Replace every label in ``y`` by its integer id.

    The id of a label is its position in the module-level ``classes`` list.

    Args:
        y: iterable of labels, all of which must appear in ``classes``.

    Returns:
        tuple: ``(encoded, mapping)`` with ``encoded`` a numpy array of ids
        and ``mapping`` the label -> id dictionary that was used.

    Raises:
        KeyError: if a label is not in ``classes``.
    """
    label_to_id = dict(zip(classes, range(len(classes))))
    encoded = [label_to_id[label] for label in y]

    return np.array(encoded), label_to_id

In [346]:
train_y,dic_encode=encode_y(train_y)

## Save data

In [357]:
# Write every object to pytorch_data/<name>.pkl. The `with` block closes each
# file, so the data is flushed to disk before later cells read or move it.
for artifact_name, artifact in (('train_X', train_X),
                                ('train_X_len', train_X_len),
                                ('train_y', train_y),
                                ('test_X', test_X),
                                ('test_X_len', test_X_len),
                                ('test_y', test_y)):
    with open('pytorch_data/' + artifact_name + '.pkl', 'wb') as fh:
        pickle.dump(artifact, fh)

Save the encoding dictionary (class label → integer id) for the target.

In [359]:
# The context manager closes the file, so the pickle is fully written to disk.
with open('pytorch_data/dic_encode.pkl', 'wb') as fh:
    pickle.dump(dic_encode, fh)

### to pandas and save csv

In [358]:
pd.concat([pd.DataFrame(train_y), pd.DataFrame(train_X_len), pd.DataFrame(train_X)], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

## Define class

In [129]:
import torch.nn.functional as F

In [585]:
class Classifier1L(nn.Module): #one hidden layer
    """Bag-of-embeddings classifier with one hidden layer.

    A review is represented by the sum of the embeddings of its first
    ``length`` tokens. That vector is passed through ``fc1`` -> ReLU ->
    ``fc2``. The output is raw logits (no softmax), so it can be fed directly
    to a cross-entropy loss.
    """

    def __init__(self,embedding_dim,vocab_size,hidden_dim,output_dim):
        """Create the layers.

        Args:
            embedding_dim: size of each word embedding vector.
            vocab_size: number of rows of the embedding table.
            hidden_dim: number of units of the hidden layer.
            output_dim: number of logits returned per review.
        """
        super().__init__()

        # Id 0 is the padding id: its embedding is a zero vector that is not trained.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self,x): #x is sequence of words (mapped to integers)
        """Compute the logits for a batch.

        Args:
            x: integer tensor of shape ``(batch_size, 1 + pad)``. Column 0
                holds the number of real tokens of each review, the remaining
                columns hold the (padded) token ids.

        Returns:
            Tensor of shape ``(batch_size, output_dim)`` with the logits.
        """
        lengths = x[:, 0]                    # (batch_size,)
        reviews = x[:, 1:]                   # (batch_size, pad)
        embedded = self.embedding(reviews)   # (batch_size, pad, embedding_dim)

        # Sum the embeddings of the first `length` positions of every review.
        # A broadcast mask does this for the whole batch at once, replacing a
        # Python loop over the samples.
        positions = torch.arange(reviews.size(1), device=x.device)    # (pad,)
        mask = positions.unsqueeze(0) < lengths.unsqueeze(1)          # (batch_size, pad)
        summed = (embedded * mask.unsqueeze(-1).to(embedded.dtype)).sum(dim=1)

        hidden = F.relu(self.fc1(summed))
        return self.fc2(hidden)

## Train function

In [501]:
def train(model, train_loader, epochs, criterion, optimizer, device):
    """Train ``model`` and print the mean batch loss after every epoch.

    Args:
        model: the torch module to train; it is moved to ``device``.
        train_loader: iterable yielding ``(batch_x, batch_y)`` tensor pairs.
        epochs: number of passes over ``train_loader``.
        criterion: loss function called as ``criterion(y_pred, batch_y)``.
        optimizer: optimizer that updates the parameters of ``model``.
        device: device on which the model and the batches are placed.

    Returns:
        None.
    """
    # The batches are moved to `device` below, so the model has to live there
    # too; otherwise the forward pass fails with a device mismatch on a GPU.
    model.to(device)

    for epoch in range(1, epochs + 1):
        model.train() # Make sure that the model is in training mode.

        total_loss = 0
        num_batches = 0

        for batch_x, batch_y in train_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            optimizer.zero_grad()

            # get predictions from model
            y_pred = model(batch_x)

            # perform backprop
            loss = criterion(y_pred, batch_y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Guard against an empty loader instead of dividing by zero.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print("Epoch: {}, Loss: {}".format(epoch, mean_loss))

In [599]:
train_sample = pd.read_csv(os.path.join(data_dir, 'train.csv'), header=None, names=None,nrows=2000)

In [600]:
train_sample_y = torch.from_numpy(train_sample[[0]].values).squeeze()
train_sample_X = torch.from_numpy(train_sample.drop([0], axis=1).values).long()

In [606]:
# Build the dataset
train_sample_ds = torch.utils.data.TensorDataset(train_sample_X, train_sample_y)
# Build the dataloader
train_sample_dl = torch.utils.data.DataLoader(train_sample_ds, batch_size=100)

In [413]:
import torch.optim as optim
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [607]:
model=Classifier1L(100,5000,100,len(classes))

In [608]:
optimizer = optim.Adam(model.parameters())

In [609]:
loss_fn = torch.nn.CrossEntropyLoss()

In [610]:
train(model, train_sample_dl, 5, loss_fn,optimizer, device)

Epoch: 1, Loss: 5.310545349121094


KeyboardInterrupt: 

# SageMaker
## Upload to S3 and Train

In [553]:
import logging
import boto3
from botocore.exceptions import ClientError

In [554]:
session = boto3.Session(profile_name='default') 

In [555]:
import sagemaker

sagemaker_session = sagemaker.Session(boto_session=session)

bucket = sagemaker_session.default_bucket()
prefix = 'drugs_pytorch/data'

role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)

In [556]:
role

'arn:aws:iam::113516067754:role/service-role/AmazonSageMaker-ExecutionRole-20200513T111076'

In [558]:
prefix = 'drugs_pytorch/data'

In [567]:
data_dir='data_s3'

In [565]:
ls pytorch_data/

dic_encode.pkl  test_X_len.pkl  train_X.pkl      train_y.pkl
test_X.pkl      test_y.pkl      train_X_len.pkl


In [561]:
!mkdir data_s3

In [562]:
!mv pytorch_data/word_dict.pkl data_s3/

In [563]:
!mv pytorch_data/train.csv data_s3/

In [564]:
ls data_s3/

train.csv  word_dict.pkl


In [568]:
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)

In [573]:
input_data

's3://sagemaker-us-east-2-113516067754/drugs_pytorch/data'

In [569]:
!mkdir train

In [570]:
!mv NN_Pytorch_AWS.py model.py train/

## Train in AWS

In [571]:
output_dim=len(classes)
print(output_dim)

225


In [618]:
from sagemaker.pytorch import PyTorch

estimator = PyTorch(entry_point="NN_Pytorch_AWS.py",
                    source_dir="train",
                    role=role,
                    framework_version='1.0',
                    train_instance_count=1,
                    train_instance_type='ml.p2.xlarge',
                    sagemaker_session=sagemaker_session,
                    hyperparameters={
                        'epochs': 10,
                        'embedding_dim':100,
                        'hidden_dim': 200,
                        'output_dim':output_dim
                    })

In [619]:
estimator.fit({'training': input_data})

'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateTrainingJob operation: The account-level service limit 'ml.p3.2xlarge for training job usage' is 0 Instances, with current utilization of 0 Instances and a request delta of 1 Instances. Please contact AWS support to request an increase for this limit.

In [612]:
torch.__version__

'1.5.1'