# CSV Splitter

Agus Gunawan, Holy Lovenia

## Importing dataset

In [77]:
from datetime import datetime
from pandas import read_csv

### Read train data

In [29]:
def date_parser(dates):
    """Parse a date string written as ``YYYY-MM-DD``.

    Args:
        dates: A date string such as ``'2018-01-01'``.

    Returns:
        The matching ``datetime`` at midnight of that day.

    Raises:
        ValueError: If ``dates`` does not match the ``'%Y-%m-%d'`` format.
    """
    # A named function rather than a lambda bound to a name (PEP 8, E731),
    # so tracebacks show ``date_parser`` instead of ``<lambda>``.
    return datetime.strptime(dates, '%Y-%m-%d')

In [30]:
# Load the training CSV; the 'date' column is parsed with `date_parser`
# and becomes the index of the resulting frame.
# NOTE(review): `date_parser` is deprecated in pandas >= 2.0 in favour of
# `date_format` -- confirm the pandas version in use before upgrading.
train_data = read_csv(
    'dataset/atm_train.csv',
    index_col='date',
    parse_dates=['date'],
    date_parser=date_parser,
)
train_data.head()

Unnamed: 0_level_0,X1,no. ATM,currency,saldo awal,Deliveries,Returns,unplanned_deliveries,unplanned_returns,deposit,Pre.Withdrawals,Withdrawals,saldo akhir,Trips,Balance.Cost,Carrier.Cost,uang_idle
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2018-01-01,1,K1,IDR,275600000,0,0,0,0,0,0,78700000,196900000,0,32367,0,45304
2018-01-02,2,K1,IDR,196900000,460000000,146200000,0,0,0,50700000,112900000,397800000,1,65392,350000,32367
2018-01-03,3,K1,IDR,397800000,0,0,0,0,0,0,108700000,289100000,0,47523,0,65392
2018-01-04,4,K1,IDR,289100000,0,0,0,0,0,0,100700000,188400000,0,30970,0,47523
2018-01-05,5,K1,IDR,188400000,460000000,138000000,0,0,0,50400000,127750000,382650000,1,62901,350000,30970


### Read test data

In [32]:
def date_parser(dates):
    """Parse a date string written as ``DD/MM/YYYY``.

    Args:
        dates: A date string such as ``'25/03/2018'``.

    Returns:
        The matching ``datetime`` at midnight of that day.

    Raises:
        ValueError: If ``dates`` does not match the ``'%d/%m/%Y'`` format.
    """
    # A named function rather than a lambda bound to a name (PEP 8, E731),
    # so tracebacks show ``date_parser`` instead of ``<lambda>``.
    return datetime.strptime(dates, '%d/%m/%Y')

In [33]:
# Load the semicolon-delimited test CSV; the 'date' column is parsed with
# `date_parser` and becomes the index of the resulting frame.
# NOTE(review): `date_parser` is deprecated in pandas >= 2.0 in favour of
# `date_format` -- confirm the pandas version in use before upgrading.
test_data = read_csv(
    'dataset/atm_test.csv',
    delimiter=';',
    index_col='date',
    parse_dates=['date'],
    date_parser=date_parser,
)
test_data.head()

Unnamed: 0_level_0,no. ATM
date,Unnamed: 1_level_1
2018-03-25,K1
2018-03-26,K1
2018-03-27,K1
2018-03-28,K1
2018-03-29,K1


## Preprocessing

In [78]:
from pandas import DataFrame

import os

### Preprocess train data

In [82]:
# Columns left out of the training features.
excluded_columns = [
    'currency',
    'unplanned_deliveries',
    'unplanned_returns',
    'Trips',
    'deposit',
    'X1',
    'Pre.Withdrawals',
    'Carrier.Cost',
    'saldo akhir',
]
# Index.difference returns the remaining column labels in sorted order.
feature_columns = train_data.columns.difference(excluded_columns)
# Remove every 'K' from string cells, so an ATM id like 'K1' becomes '1'.
x_train = train_data[feature_columns].replace('K', '', regex=True)
x_train.head()

Unnamed: 0_level_0,Balance.Cost,Deliveries,Returns,Withdrawals,no. ATM,saldo awal,uang_idle
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-01,32367,0,0,78700000,1,275600000,45304
2018-01-02,65392,460000000,146200000,112900000,1,196900000,32367
2018-01-03,47523,0,0,108700000,1,397800000,65392
2018-01-04,30970,0,0,100700000,1,289100000,47523
2018-01-05,62901,460000000,138000000,127750000,1,188400000,30970


#### Save preprocessed train data as CSV

In [83]:
# Directory that receives the per-ATM training CSV files.
train_data_dir = 'dataset/train/'

# exist_ok=True makes creation idempotent and avoids the race between a
# separate os.path.exists() check and the makedirs() call.
os.makedirs(train_data_dir, exist_ok=True)

In [84]:
# Split the training features into one frame per ATM and write each frame
# to '<train_data_dir><atm_number>_train.csv'.
atm_numbers = x_train['no. ATM'].unique()

x_trains = {}

for atm_number in atm_numbers:
    # Compare against the value exactly as stored in the column: coercing it
    # with str() would match no rows if the identifiers were not strings.
    is_this_atm = x_train['no. ATM'] == atm_number
    x_trains[atm_number] = x_train[is_this_atm].drop(['no. ATM'], axis=1)
    # str() belongs in the file name instead, so building the path cannot
    # raise a TypeError for non-string identifiers.
    x_trains[atm_number].to_csv(train_data_dir + str(atm_number) + '_train.csv')

In [85]:
# Target series: the 'Withdrawals' column of the raw training frame
# (all rows, not split per ATM).
y_train = train_data.loc[:, 'Withdrawals']

### Preprocess test data

In [48]:
# Remove every 'K' from string cells of the test frame, so an ATM id like
# 'K1' becomes '1'; replace() returns a new frame, test_data is unchanged.
x_test = test_data.replace('K', '', regex=True)
x_test.head()

Unnamed: 0_level_0,no. ATM
date,Unnamed: 1_level_1
2018-03-25,1
2018-03-26,1
2018-03-27,1
2018-03-28,1
2018-03-29,1


#### Save preprocessed test data as CSV

In [86]:
# Directory that receives the preprocessed test CSV file.
test_data_dir = 'dataset/test/'

# exist_ok=True makes creation idempotent and avoids the race between a
# separate os.path.exists() check and the makedirs() call.
os.makedirs(test_data_dir, exist_ok=True)

In [87]:
# Write the preprocessed test frame (index included) into test_data_dir.
test_csv_path = test_data_dir + 'test.csv'
x_test.to_csv(test_csv_path)