# Date create: January 8, 2022

This file concatenates the different calendar file chunks into a single, compressed .npz file.

In [1]:
# Import libraries --

import numpy as np
import pandas as pd
import os
import time
from datetime import datetime

In [2]:
# Manage directories --

# Universal directory setup
cwd1 = os.getcwd() 

# Go up one directory level
os.chdir('..')
cwd2 = os.getcwd()

data_path = cwd2 + '/Saved data/'

In [3]:
# Example of loading one chunk --

os.chdir(data_path)

# Read in one example chunk and look at it.
test_df = pd.read_csv('booked_df_v2_3.csv')
test_df.head(5)

Unnamed: 0.1,Unnamed: 0,booked,id,res_date,never_avail,price,last_date,all_prices,final_prices,seen_avail,week_yr,mo_yr
0,0,False,880611,2015-09-02,True,,,,,0,2015-35,2015-09
1,1,False,916916,2015-09-02,False,,,150.0,150.0,1,2015-35,2015-09
2,2,False,1024996,2015-09-02,True,,,,,0,2015-35,2015-09
3,3,False,921586,2015-09-02,True,,,,,0,2015-35,2015-09
4,4,False,1042814,2015-09-02,False,,,160.0,160.0,1,2015-35,2015-09


# Concatenating chunks back together

In [4]:
def chunk_simplified(file_name, keep_cols):
    
    """ This function takes a csv that has been saved through Pandas
    and returns a numpy array. The keep_cols list specifies which columns
    should be kept from the CSV file. """
    
    df = pd.read_csv(file_name)
    df = df[keep_cols]
    
    # Make booked a boolean
    df.loc[:, 'booked'] = df['booked'].values*1
    
    return df.values

In [5]:
# Concatenate chunks --

# Extract all files in the directory
dir_files = os.listdir(os.getcwd())

# Count how many chunk files we have
N_chunk_files = len([s for s in dir_files if "booked_df_v2" in s])-1

# Select columns to keep 
# Delete 'Unnamed: 0', 'never_avail', 'price', 'all_prices' columns. 
my_cols = ['id', 'res_date','booked', 'seen_avail', 'final_prices', 'last_date', 'week_yr', 'mo_yr']

file_name_base = 'booked_df_v2_'

my_loop_timer = time.time()

for ind in range(N_chunk_files):

    file_i = file_name_base + str(ind) + ".csv"
    print(file_i)
    
    arr_i = chunk_simplified(file_i, my_cols)
    
    if ind == 0:
        
        full_arr = arr_i
        
    if ind > 0:
        full_arr = np.concatenate([full_arr, arr_i], axis=0)

# =======================================================

mins_to_run = (time.time() - my_loop_timer)/60

print("Minutes for concatenation loop:")
print(np.round(mins_to_run, 4))

booked_df_v2_0.csv
booked_df_v2_1.csv
booked_df_v2_2.csv
booked_df_v2_3.csv
booked_df_v2_4.csv
booked_df_v2_5.csv
booked_df_v2_6.csv
booked_df_v2_7.csv
booked_df_v2_8.csv
booked_df_v2_9.csv
booked_df_v2_10.csv
booked_df_v2_11.csv
booked_df_v2_12.csv
booked_df_v2_13.csv
booked_df_v2_14.csv
booked_df_v2_15.csv
booked_df_v2_16.csv
booked_df_v2_17.csv
booked_df_v2_18.csv
booked_df_v2_19.csv
booked_df_v2_20.csv
booked_df_v2_21.csv
booked_df_v2_22.csv
booked_df_v2_23.csv
booked_df_v2_24.csv
booked_df_v2_25.csv
booked_df_v2_26.csv
booked_df_v2_27.csv
booked_df_v2_28.csv
booked_df_v2_29.csv
booked_df_v2_30.csv
booked_df_v2_31.csv
booked_df_v2_32.csv
booked_df_v2_33.csv
booked_df_v2_34.csv
booked_df_v2_35.csv
booked_df_v2_36.csv
booked_df_v2_37.csv
booked_df_v2_38.csv
booked_df_v2_39.csv
booked_df_v2_40.csv
booked_df_v2_41.csv
booked_df_v2_42.csv
booked_df_v2_43.csv
booked_df_v2_44.csv
booked_df_v2_45.csv
booked_df_v2_46.csv
booked_df_v2_47.csv
booked_df_v2_48.csv
booked_df_v2_49.csv
booked_df_

  arr_i = chunk_simplified(file_i, my_cols)


booked_df_v2_58.csv
booked_df_v2_59.csv
booked_df_v2_60.csv
booked_df_v2_61.csv
booked_df_v2_62.csv
booked_df_v2_63.csv
booked_df_v2_64.csv
booked_df_v2_65.csv
booked_df_v2_66.csv
booked_df_v2_67.csv
booked_df_v2_68.csv
booked_df_v2_69.csv
booked_df_v2_70.csv
booked_df_v2_71.csv
booked_df_v2_72.csv
booked_df_v2_73.csv
booked_df_v2_74.csv
booked_df_v2_75.csv
booked_df_v2_76.csv
booked_df_v2_77.csv
booked_df_v2_78.csv
booked_df_v2_79.csv
booked_df_v2_80.csv
Minutes for concatenation loop:
0.4977


# Saving compressed data

In [12]:
# Save compressed file --

npz_file_name = "Portland_booked_df"

my_date = str(datetime.now().year) + "_" + str(datetime.now().month) + "_" + str(datetime.now().day)

my_save_timer = time.time()

np.savez_compressed(npz_file_name, 
                    list_id = full_arr[:,0], 
                    res_date = full_arr[:,1],
                    book_status = full_arr[:,2],
                    seen_avail = full_arr[:, 3],
                    prices = full_arr[:, 4],
                    other_dates = full_arr[:, -3:], # These dates might end up being pretty important for merging later on.
                    col_names = my_cols,
                    date_of_file = my_date)

# =======================================================

#save_mins_to_run = (time.time() - my_save_timer)/60

#print("Minutes for concatenation loop:")
#print(np.round(save_mins_to_run, 4))

# Loading in data after it has been saved

In [7]:
quick_test = np.load('Portland_booked_df.npz', allow_pickle=True)
print(quick_test.files)

['list_id', 'res_date', 'book_status', 'seen_avail', 'prices', 'other_dates', 'col_names', 'date_of_file']


In [8]:
# Load in ids and dates

my_ids = quick_test['list_id']
my_dates = quick_test['res_date']

In [9]:
# Add a second axis to arrays

np.array([my_ids]).T.shape, np.array([my_dates]).T.shape

((12911400, 1), (12911400, 1))

In [10]:
# Use concatenation to create a new numpy array

rejoined_example = np.concatenate([np.array([my_ids]).T, np.array([my_dates]).T], axis=1)
print(rejoined_example.shape)

(12911400, 2)


In [11]:
# Can rebuild DataFrame in this way

example_df = pd.DataFrame(rejoined_example[0:10,:]) # Slice just to make it smaller. This is just illustrative
example_df.columns = quick_test['col_names'][0:2]

example_df

Unnamed: 0,id,res_date
0,9356,2015-09-02
1,39938,2015-09-02
2,156479,2015-09-02
3,41601,2015-09-02
4,187797,2015-09-02
5,145199,2015-09-02
6,203797,2015-09-02
7,240583,2015-09-02
8,263318,2015-09-02
9,42347,2015-09-02
