In [4]:
from IPython.display import display

import time
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 300)

import pickle # save file as binary, load & dump much faster than csv

def load_data(filename):
    with open(filename, 'rb') as fin:
        return pickle.load(fin)
    
def save_data(obj, filename):
    with open(filename, 'wb+') as fout:
        pickle.dump(obj, fout)

import os
root_path = 'C:/Users/yuanl/Documents/MelbDatathon2017/'

In [5]:
def read_patients(i):
    df = pd.read_csv(root_path + 'Final_summary/patient_%d.csv' % i, 
                     parse_dates = ['Dispense_Week'],
                     usecols=['Patient_ID', 'Drug_ID', 'Dispense_Week'])
    df.columns = ['PID', 'DID', 'DispWeek']
    return df

def convert_to_matrix(df):
    assert(set(df.dtypes.values) == {np.dtype('int64'), np.dtype('<M8[ns]')}) # data dtype is either date or int64

    matrix = np.zeros(df.shape, dtype=np.int32)

    year2000 = np.datetime64('2000-01-01')
    for i, (col_dtype, col_name) in enumerate(zip(df.dtypes.values, df.columns)):
        if col_dtype == np.dtype('int64'):
            matrix[:, i] = df[col_name].values
        else:
            matrix[:, i] = (df[col_name].values - year2000).astype('timedelta64[D]')
    return matrix, df.index.values

def read_data(file_range):
    transactions = []
    for i in file_range:
        print(i, end=' ')
        transactions.append(convert_to_matrix(read_patients(i))[0])
    print('finish loading')

    transactions = np.vstack(transactions)

    transactions[:, 0] -= transactions[:, 0].min() # now patient ID starts at 0
    return transactions

In [6]:
%%time
transactions = read_data(range(1, 51)) # 1~50

num_patients = transactions[-1, 0] + 1
print('patients in the training set:', num_patients)

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 finish loading
patients in the training set: 558352
Wall time: 57.6 s


In [13]:
def encode_date(string):
    year2000 = np.datetime64('2000-01-01')
    return (np.datetime64(string) - year2000).astype('timedelta64[D]').astype(int)

def decode_date(integer):
    return np.timedelta64(int(integer), 'D') + np.datetime64('2000-01-01')

In [15]:
transactions = transactions[np.in1d(transactions[:, 1], [4861, 4867, 4869])]

In [17]:
transactions = transactions[transactions[:, 2] >= encode_date('2016-01-01')]

In [18]:
transactions = transactions[transactions[:, 2] <= encode_date('2016-12-31')]

In [21]:
unique_PID, encoded_PID, count_PID = np.unique(transactions[:, 0], return_inverse=True, return_counts=True)

In [24]:
num_unique_PID = unique_PID.shape[0]

In [27]:
dose = (np.bincount(encoded_PID, transactions[:, 1]==4861) * 20 + 
        np.bincount(encoded_PID, transactions[:, 1]==4867) * 40 +
        np.bincount(encoded_PID, transactions[:, 1]==4869))

In [29]:
df = pd.DataFrame()
df['PID'] = transactions[:, 0] + 1
df['DID'] = transactions[:, 1]
df['DispWeek'] = [decode_date(d) for d in transactions[:, 2]]

In [31]:
df.to_csv(root_path+'three_drugs_transactions.csv', index=False)

In [41]:
df = pd.DataFrame()
df['PID'] = unique_PID
df['4861'] = np.bincount(encoded_PID, transactions[:, 1]==4861).astype(int)
df['4867'] = np.bincount(encoded_PID, transactions[:, 1]==4867).astype(int)
df['4869'] = np.bincount(encoded_PID, transactions[:, 1]==4869).astype(int)
df['sum'] = df['4861'] * 20 + df['4867'] * 40 + df['4869']

In [43]:
df.to_csv(root_path+'three_drugs_sum.csv', index=False)

In [44]:
plt.plot(df.sum)

NameError: name 'plt' is not defined