# Import Clipper PDF and add to database

## Imports

In [1]:
import camelot
import numpy as np
import pandas as pd
import plotly.express as px
import PyPDF2

## Functions

In [2]:
def get_trips(filename):
    """Extract the ride-history table from every page of a PDF statement.

    Page 1 is read with one ``table_areas`` region and all following pages
    with another. On each extracted table the first row is title-cased and
    promoted to the column header, then the rows of all pages are stacked.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        A DataFrame holding the rows of all pages, with a fresh 0..n-1
        index and empty strings replaced by NaN.
    """
    def promote_header(frame):
        # The header arrives as the first extracted row: use it (title-cased)
        # as the column names and drop it from the data.
        frame.columns = frame.iloc[0].str.title()
        return frame[1:]

    # read first page
    first_page = camelot.read_pdf(filename,
                                  pages='1',
                                  flavor='stream',
                                  table_areas=['0,500,800,100'])
    frames = [promote_header(first_page[0].df)]

    # check if more than one page
    with open(filename, 'rb') as file:
        # PyPDF2 3.0 removed PdfFileReader/numPages; prefer the newer
        # PdfReader when present and fall back for old installations.
        # ``len(reader.pages)`` is understood by both generations.
        reader_cls = getattr(PyPDF2, 'PdfReader', None) or PyPDF2.PdfFileReader
        page_count = len(reader_cls(file).pages)

    # read next pages if they exist
    if page_count > 1:
        later_pages = camelot.read_pdf(filename,
                                       pages='2-end',
                                       flavor='stream',
                                       table_areas=['0,560,800,90'])
        # To check table_areas, plot a table with
        # camelot.plot(table, kind='contour').show()
        frames.extend(promote_header(table.df) for table in later_pages)

    # clean up: stack all pages once, renumber rows, blank cells -> NaN
    return pd.concat(frames).reset_index(drop=True).replace('', np.nan)

In [3]:
def categorize(df_import):
    """Label each transaction row with a 'Category' value.

    Rules are applied top to bottom and each one overwrites 'Category' on the
    rows it matches, so a later rule wins over an earlier one. Rows matched by
    no rule are left without a category. The frame is modified in place and
    also returned.

    Reads the 'Location', 'Transaction Type' and 'Route' columns.
    """
    entry_no_fare = 'Dual-tag entry transaction, no fare deduction'
    exit_fare_paid = 'Dual-tag exit transaction, fare payment'
    entry_max_fare = 'Dual-tag entry transaction, maximum fare deducted (purse debit)'
    exit_rebate = 'Dual-tag exit transaction, fare adjustment (purse rebate)'
    reload_types = ['Threshold auto-load at a TransLink Device',
                    'Add value at TOT or TVM']

    def tag(rows, label):
        # Overwrite the category on every row selected by the boolean mask.
        df_import.loc[rows, 'Category'] = label

    tag(df_import['Location'] == 'ACT bus', 'AC Transit')
    tag(df_import['Transaction Type'] == entry_no_fare, 'BART Entrance')
    tag(df_import['Transaction Type'] == exit_fare_paid, 'BART Exit')
    tag(df_import['Route'] == 'CC60', 'Cable Car')

    # The same entry/exit transaction types are split by route:
    # no route -> Caltrain, route 'FERRY' -> Ferry.
    tag((df_import['Transaction Type'] == entry_max_fare)
        & df_import['Route'].isna(), 'Caltrain Entrance')
    tag((df_import['Transaction Type'] == exit_rebate)
        & df_import['Route'].isna(), 'Caltrain Exit')
    tag((df_import['Transaction Type'] == entry_max_fare)
        & (df_import['Route'] == 'FERRY'), 'Ferry Entrance')
    tag((df_import['Transaction Type'] == exit_rebate)
        & (df_import['Route'] == 'FERRY'), 'Ferry Exit')

    # Locations whose last five characters are '(GGF)'.
    tag(df_import['Location'].str[-5:] == '(GGF)', 'Ferry Exit')
    tag(df_import['Location'] == 'SFM bus', 'Muni Bus')
    tag(df_import['Route'] == 'NONE', 'Muni Metro')
    tag(df_import['Location'] == 'SAM bus', 'SamTrans')
    tag(df_import['Transaction Type'].isin(reload_types), 'Reload')

    return df_import

In [4]:
def clean_up(df_import):
    """Convert the date and money columns from text to proper dtypes.

    'Transaction Date' is parsed with ``pd.to_datetime``; 'Debit', 'Credit'
    and 'Balance' have their dollar sign removed and are cast to float
    (missing values stay NaN). The frame is modified in place and returned.
    """
    df_import['Transaction Date'] = pd.to_datetime(df_import['Transaction Date'])

    for col in ['Debit', 'Credit', 'Balance']:
        # regex=False: '$' is a regex end-of-string anchor, and the default
        # of ``regex`` differs between pandas versions; force a literal match.
        df_import[col] = (df_import[col]
                          .str.replace('$', '', regex=False)
                          .astype(float))

    return df_import

## Main

In [5]:
# Load the stored history, parse the new PDF statement, merge the two
# (newest transaction first) and overwrite the stored history.
# NOTE(review): nothing here removes duplicates, so running this cell twice
# on the same PDF stores its rows twice.
data_k = pd.read_csv('data_k.csv', parse_dates=['Transaction Date'])

new_trips = get_trips('raw_data/rideHistory_1202425091-4.pdf')
new_trips = clean_up(new_trips)
new_trips = categorize(new_trips)

df = pd.concat([new_trips, data_k])
df = df.sort_values('Transaction Date', ascending=False)
df = df.reset_index(drop=True)
df.to_csv('data_k.csv', index=False)

  df_import['Transaction Date'] = pd.to_datetime(df_import['Transaction Date'])
