In [1]:
import pandas as pd
import numpy as np
import csv
import os
from pandas import ExcelWriter
from GLSettingsByState import GLTaxSettingsByState

In [43]:
def excelSave(df_dict):    
    def write_df(dataframe, df_key):
        if dataframe is None:
            print("error w/ df")
        writer = ExcelWriter(df_key + ".xlsx", engine="xlsxwriter")
        dataframe.to_excel(writer, sheet_name=df_key + "_")
        writer.save()
    if isinstance(df_dict, type(dict())):
        for df_key in df_dict.keys():
            write_df(df_dict.get(df_key, None), df_key)
            
    
def currency_to_float(curr_str):
    """
    input -> currency: str
    output -> currency value: float
    
    Converts str of currency into a float.
    Will be used to apply to columns in DF.
    Note on behavior of round() from pyton docs:-
        'The behavior of round() for floats can be surprising: for example, 
        round(2.675, 2) gives 2.67 instead of the expected 2.68. 
        This is not a bug.'

    
    $1 -> 1.0
    ($1) -> -1.0
    $0 -> 0.00
    $50.50 -> 50.50
    $52,555.87 -> 52555.87
    float('nan') - > 0.0
    525.00 -> 525.00
    """
    # checks if val is already a float, if it is a nan float return 0.0 
    # otherwise return float val
    if isinstance(curr_str, float): 
        if isnan(curr_str): 
            return 0.0
        else: 
            return curr_str
    
    # check if str value is not valid, if so return 0
    if len(curr_str) < 1: 
        # print('Len less than 1: {}'.format(curr_str))
        return 0.0
    
    float_str = ''
    neg_val = False
    # check for negative value
    if '(' in curr_str : neg_val = True 
    # iterate over string, remove unwanted charachters
    for char in curr_str:
        if char in ['(', ')', '$', ',']:
            continue
        else:
            float_str+= char
    
    #print('str check:', float_str)
    # See notes on round() func behavior
    if neg_val == True:
        return float(float_str) * -1
    else:
        return float(float_str)

In [3]:
__author__ = 'nabeelh-dev'

class TaxRates(object):
    """
    Reads tax rate data per Alavara.com
    Dict will be contained in self.tax_rates.
    Will be able to query by zipcode.

    e.g
    TaxRates.query_by_zipcode('99501')
    >>>{
        "region_name": "ALASKA STATE",
        "state_rate" : 0.000000,
        "est_combined_rate": 0.000000,
        "est_country_rate": 0.000000,
        "est_city_rate": 0.000000,
        "est_special_rate": 0.000000,
        "risk_level": 1,
        "zipcode": "99501",
        "state": "AK"
        }
    """
    def __init__(self, csv_path):
        self.filepath = csv_path
        self.tax_rates = self.parse_csv()

    def parse_csv(self):
        """
        Takes str value for directory stored in self.filepath and will process
        all tax rate csv files within. Will return a dict that will be set to
        TaxRates objects self.tax_rates so we can query by zipcode.
        self.tax_rates will contain a dict that has two main keys:
        'state_tax_rates': stores all State -> Zipcode -> TaxRates dict
        'zipcode_to_state': will be a list of lists, sorted by all states zipcode ranges

        zipcode_to_states list will be used for fast querying when looking for zipcodes tax rates.
        It will quickly indicate the state we will need to look inside to find the zipcode tax rates,
        isntead of looping through state_tax_rates dict which would be inefficient.

        CSV file names to parse are in this format: TAXRATES_ZIP5_AK201901.csv
        We will want to extract the state name from the filename.

        e.g states_tax_rates:
        {
            'state_tax_rates': {
                                'AK': {
                                        '99501': {
                                                "region_name": "ALASKA STATE",
                                                "state_rate" : 0.000000,
                                                "est_combined_rate": 0.000000,
                                                "est_country_rate": 0.000000,
                                                "est_city_rate": 0.000000,
                                                "est_special_rate": 0.000000,
                                                "risk_level": 1
                                                },
                                        },
                                },
            'zipcode_to_state': {
                                '82397': 'WY',
                                '83414': 'WY',
                                '89001': 'NV',
                                '89883': 'NV',
                                '99501': 'AK',
                                '99950': 'AK',
                                }
        }

        :param: self.filepath: str
        :return: states_tax_rates: dict
        """
        states_tax_rates = dict()
        zipcode_to_state = dict()

        # directory input, process all .csv contained within directory
        dirpath = os.path.abspath('')
        csv_dp = os.path.join(dirpath, self.filepath)
        for filename in os.listdir(csv_dp):
            # splits filename to identify state: TAXRATES_ZIP5_AK201901.csv
            csv_state_abbr = filename.split("ZIP5_")[1][:2]
            if filename.endswith(".csv"):
                state_tax_csv = os.path.join(csv_dp, filename)
                # Read and get data from csv
                state_tax_data = TaxRates._parse_single_csv(state_tax_csv)
                states_tax_rates[csv_state_abbr] = state_tax_data['zipcode_rates']

        # Create and return a dict mapping zipcode to state
        # This reverse lookup is used by self.query_by_zipcode()
        for state, zipcode_tax_rates in states_tax_rates.items():
            for zipcode, tax_rates in zipcode_tax_rates.items():
                zipcode_to_state[zipcode] = state

        print("Zipcode Tax Rates loaded successfully.")
        return {
                "state_tax_rates": states_tax_rates,
                "zipcode_to_state": zipcode_to_state
                }

    @staticmethod
    def _parse_single_csv(csv_fp):
        """
        Parses csv at file path. Collects and stores all zipcode data as a dict.
        When being read by csv.reader, each row will be a list as follows:
        ['WY', '83118', 'LINCOLN COUNTY', '0.040000', '0.050000', '0.010000', '0', '0', '1']
        Data will be converted to a dict.

        Will also keep track of zipcode values and return
        the 'lowest' zipcode value and highest in the csv file, under the key 'low_high_zipcode'.
        This will be used to help to make query_by_zipcode class function perform faster.
        Will take advantage of the fact that the csv files are all in zipcode value descending order.

        e.g
        {
            'zipcode_rates' : {
                                '83118': {
                                        "region_name": "LINCOLN COUNTY",
                                        "state_rate" : 0.040000,
                                        "est_combined_rate": 0.050000,
                                        "est_country_rate": 0.010000,
                                        "est_city_rate": 0.000000,
                                        "est_special_rate": 0.000000,
                                        "risk_level": 1
                                        },
                                },
            'low_high_zipcode' : [82001, 83414]

        :param csv_fp: os path object
        :return: state_dict: dict
        """
        zipcode_rates = dict()
        with open(csv_fp) as csv_file:
            #print("Opened Tax CSV Successfully: {}".format(str(csv_fp)))
            csv_reader = csv.reader(csv_file)
            # Skip the first line of csv file due to header
            ## Keep header information in the future??
            for i in range(0, 1, 1):
                next(csv_reader, None)

            for zipcode_tax in csv_reader:
                zipcode = zipcode_tax[1]
                region_name = zipcode_tax[2]
                state_rate = float(zipcode_tax[3])
                est_combined_rate = float(zipcode_tax[4])
                est_country_rate = float(zipcode_tax[5])
                est_city_rate = float(zipcode_tax[6])
                est_special_rate = float(zipcode_tax[7])
                risk_level = int(zipcode_tax[8])

                zipcode_rates[zipcode] = {
                                        "region_name": region_name,
                                        "state_rate" : state_rate,
                                        "est_combined_rate": est_combined_rate,
                                        "est_country_rate": est_country_rate,
                                        "est_city_rate": est_city_rate,
                                        "est_special_rate": est_special_rate,
                                        "risk_level": risk_level
                                        }

        return {
                'zipcode_rates': zipcode_rates
                }

    def query_by_zipcode(self, zipcode_str):
        """
        Given a zipcode string, retrieve tax rates for that zipcode.
        First accesses 'zipcodes_to_state' dict contained in self.tax_rates to get
        the State the zipcode is in.
        If zipcode_str does not exist, then print error and return None.

        If it exists then uses State key and Zipcode key to obtain correct tax rates from
        'state_tax_rates' dict in self.tax_rates

        :param zipcode_str: str
        :return: zipcode_tax_rates: dict
        """

        # dict get method returns None if key is not found
        query_state = self.tax_rates['zipcode_to_state'].get(zipcode_str)
        if query_state is None:
            print("{} - zipcode not found!".format(zipcode_str))
            return None
        query_results = self.tax_rates['state_tax_rates'][query_state][zipcode_str]
        query_results['zipcode'] = zipcode_str
        query_results['state'] = str(query_state)
        return query_results

In [4]:
taxRates = TaxRates("TAXRATES_ZIP5/")
taxRates.query_by_zipcode("90247")

Zipcode Tax Rates loaded successfully.


{'region_name': 'GARDENA',
 'state_rate': 0.06,
 'est_combined_rate': 0.095,
 'est_country_rate': 0.0025,
 'est_city_rate': 0.0,
 'est_special_rate': 0.0325,
 'risk_level': 1,
 'zipcode': '90247',
 'state': 'CA'}

In [5]:
with open("REGION_TO_ZIP.csv") as csv_file:
    df = pd.read_csv(csv_file, delimiter=",").set_index("REGION")
    regionToZip = df.to_dict('index')

In [51]:
folder_dir = "test_NAPGLDATA"

df_list = list()
for file in os.listdir(folder_dir):
    if file.endswith(".csv"):
        fp_ = os.path.join(folder_dir, file)
        with open(fp_) as csv_file:
            for i in range(0, 5):
                next(csv_file, i)
            df_list.append(pd.read_csv(csv_file, delimiter=","))
        
core_df = pd.concat(df_list, axis=0, ignore_index=True)

In [52]:
"""
with open("test_NAPGLDATA/CORE_12-18.csv") as csv_file:
    for i in range(0, 5):
        next(csv_file, i)
    core_df = pd.read_csv(csv_file, delimiter=",")
"""

nap_csv_colMap = {
                    'Segment3': 'section',
                    'Segment4': 'area',
                    'Segment5': 'region',
                    'Account Description': 'glAcctDesc',
                    'Record Type::Number': 'Record Type::Number',
                    'CM Trx Type': 'refNum',
                    'TRX Timestamp Date': 'trxDate',
                    'GL Posting Date': 'glDate',
                    'Description': 'trxDesc',
                    'Main Account Segment': 'glAcct',
                    'PaidToRcvd From': 'paidToRcvd',
                    'TRX Amount': 'trxAmount',
                    'Originating Debit Amount': 'debitAmt',
                    'Originating Credit Amount': 'creditAmt'
                    }

core_df.rename(mapper=nap_csv_colMap, inplace=True, axis=1)

In [53]:
core_df["trxAmount"] = core_df["trxAmount"].apply(lambda x: currency_to_float(x))
core_df["debitAmt"] = core_df["debitAmt"].apply(lambda x: currency_to_float(x))
core_df["creditAmt"] = core_df["creditAmt"].apply(lambda x: currency_to_float(x))
core_df["glAcct"] = core_df["glAcct"].astype(np.int64)

In [54]:
taxRates.query_by_zipcode("92040")

{'region_name': 'SAN DIEGO COUNTY',
 'state_rate': 0.06,
 'est_combined_rate': 0.0775,
 'est_country_rate': 0.0025,
 'est_city_rate': 0.0,
 'est_special_rate': 0.015,
 'risk_level': 1,
 'zipcode': '92040',
 'state': 'CA'}

In [55]:
regionToZip[234]

{'CUSTNAME': 'REGION 234',
 'CNTCPRSN': 'Kriste Redman',
 'CITY': 'Lakeside',
 'STATE': 'CA',
 'ZIP': 92040.0,
 'SAR': '11-V-234'}

In [56]:
core_df.shape

(12216, 17)

In [57]:
core_df.head(1)

Unnamed: 0,Record Type::Number,refNum,trxDate,glDate,Trx Number,paidToRcvd,trxDesc,trxAmount,debitAmt,creditAmt,glAcct,Segment2,section,area,region,Segment6,glAcctDesc
0,20::8,Supplier Invoice,2/2/2019,12/9/2018,8,NOCRA,NOCRA Inv ADULT-03 Fall 2018 assignor and forf...,0.0,0.0,162.0,2010,0,0A11,Q,0,0,Accounts Payable/AP Trade


In [58]:
region_df = core_df.loc[core_df["region"] != 0]

region_df["state"] = pd.Series(region_df["region"].apply(lambda x: regionToZip[x]['STATE']))
region_df["city"] = pd.Series(region_df["region"].apply(lambda x: regionToZip[x]['CITY']))
region_df["zipcode"] = pd.Series(region_df["region"].apply(lambda x: regionToZip[x]["ZIP"]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [59]:
region_df.drop(["Segment6", "Segment2", "trxDate"], axis=1, inplace=True)
region_df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,Record Type::Number,refNum,glDate,Trx Number,paidToRcvd,trxDesc,trxAmount,debitAmt,creditAmt,glAcct,section,area,region,glAcctDesc,state,city,zipcode
9,1::8,Bank Payment,12/1/2018,505,GIOVANNI G,,30.0,30.0,0.0,5101,0A14,L,1408,Uniforms-Players-TAX PAID,FL,Wildwood,34785.0
10,1::9,Bank Payment,12/1/2018,506,VICTOR G,,30.0,30.0,0.0,5101,0A14,L,1408,Uniforms-Players-TAX PAID,FL,Wildwood,34785.0


Anything with a GL code less than 5000 is a Revenue account.
Anything with a GL code above 5000 is an Expense account

In [60]:
"""conditions = [region_df["glAcct"] < 5000,
             region_df["glAcct"] > 5000]
outputs = ["Expense", "Revenue"]

res = np.select(conditions, outputs)
region_df["Exp/Rev"] = pd.Series(res)
"""
region_df["Exp/Rev"] = np.where(region_df['glAcct']>=5000, "Expense", "Revenue")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [61]:
region_df.head(1)

Unnamed: 0,Record Type::Number,refNum,glDate,Trx Number,paidToRcvd,trxDesc,trxAmount,debitAmt,creditAmt,glAcct,section,area,region,glAcctDesc,state,city,zipcode,Exp/Rev
9,1::8,Bank Payment,12/1/2018,505,GIOVANNI G,,30.0,30.0,0.0,5101,0A14,L,1408,Uniforms-Players-TAX PAID,FL,Wildwood,34785.0,Expense


In [62]:
region_df["taxableAmt"] = region_df["debitAmt"] + (region_df["creditAmt"] * -1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [63]:
region_df.head(3)

Unnamed: 0,Record Type::Number,refNum,glDate,Trx Number,paidToRcvd,trxDesc,trxAmount,debitAmt,creditAmt,glAcct,section,area,region,glAcctDesc,state,city,zipcode,Exp/Rev,taxableAmt
9,1::8,Bank Payment,12/1/2018,505,GIOVANNI G,,30.0,30.0,0.0,5101,0A14,L,1408,Uniforms-Players-TAX PAID,FL,Wildwood,34785.0,Expense,30.0
10,1::9,Bank Payment,12/1/2018,506,VICTOR G,,30.0,30.0,0.0,5101,0A14,L,1408,Uniforms-Players-TAX PAID,FL,Wildwood,34785.0,Expense,30.0
11,1::10,Bank Payment,12/1/2018,507,MARY P,,30.0,30.0,0.0,5101,0A14,L,1408,Uniforms-Players-TAX PAID,FL,Wildwood,34785.0,Expense,30.0


In [64]:
al_df = region_df.loc[region_df["state"]=="AL"]

In [65]:
state_settings = GLTaxSettingsByState["AL"]
state_settings

{'4010': True,
 '4012': True,
 '4024': True,
 '4027': True,
 '5101': False,
 '5102': False,
 '5103': False,
 '5104': False,
 '5105': True,
 '5106': True,
 '5107': True,
 '5108': True,
 '5135': True}

In [66]:
taxable_gl = [int(x) for x,y in state_settings.items() if y]
taxable_gl

[4010, 4012, 4024, 4027, 5105, 5106, 5107, 5108, 5135]

In [67]:
# if glAcct is in Taxable_gl code list then mark as Taxable and if not then mark Non-Taxable
al_df["Taxable?"] = np.where(al_df['glAcct'].isin(taxable_gl), "Taxable", "Non-Taxable")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [68]:
parsed_al = al_df.sort_values(by=["Taxable?", "Trx Number"], ascending=False)

In [69]:
al_df[al_df["taxableAmt"]==1402.79]

Unnamed: 0,Record Type::Number,refNum,glDate,Trx Number,paidToRcvd,trxDesc,trxAmount,debitAmt,creditAmt,glAcct,section,area,region,glAcctDesc,state,city,zipcode,Exp/Rev,taxableAmt,Taxable?
11969,20::28,Supplier Invoice,12/13/2018,28,AMERICAN SOCCER COMPANY INC,UNITED 7031 - UNIFORMS / MULTIPLE INVOICES (65...,0.0,1402.79,0.0,5101,70,N,7031,Uniforms-Players-TAX PAID,AL,Madison,35756.0,Expense,1402.79,Non-Taxable
11970,22::26,Supplier Payment,12/13/2018,84,AMERICAN SOCCER COMPANY INC,UNITED 7031 - UNIFORMS,1402.79,1402.79,0.0,2015,70,N,7031,Accounts Payable/AP Trade,AL,Madison,35756.0,Revenue,1402.79,Non-Taxable
11972,0::6,Journal Entry,12/13/2018,26,,UNITED 7031 - CODING ADJUSTMENT,0.0,1402.79,0.0,5105,70,N,7031,Uniforms-Players-NO TAX PAID,AL,Madison,35756.0,Expense,1402.79,Taxable


In [71]:
excelSave({"al_test_all": parsed_al})