# Sales Transaction Report - Transformation

In [1]:
import pandas as pd
import numpy as np

## Custom Functions

In [14]:
def convert_object_to_cat_type(df, columns):
    """
    Cast the given columns of a pandas DataFrame to the ``category`` dtype.

    The DataFrame is modified in place; nothing is returned.

    df      -- DataFrame whose columns are converted
    columns -- iterable of column labels to convert
    """
    for name in columns:
        as_category = df[name].astype('category')
        df[name] = as_category

In [3]:
def convert_dtype_to_float_type(df, columns):
    """
    Cast the given columns of a pandas DataFrame to float.

    The DataFrame is modified in place; nothing is returned. Values that
    cannot be converted make ``astype`` raise.

    df      -- DataFrame whose columns are converted
    columns -- iterable of column labels to convert
    """
    for name in columns:
        as_float = df[name].astype(float)
        df[name] = as_float

In [4]:
def convert_dtype_to_boolean_type(df, columns):
    """
    Cast the given columns of a pandas DataFrame to bool.

    The DataFrame is modified in place; nothing is returned.

    Note: the cast is a plain ``astype(bool)``, i.e. truthiness. On object
    columns every non-empty string becomes True -- including "No" -- so use
    map_boolean for Yes/No text instead.

    df      -- DataFrame whose columns are converted
    columns -- iterable of column labels to convert
    """
    for name in columns:
        as_bool = df[name].astype(bool)
        df[name] = as_bool

In [5]:
def map_boolean(df, columns):
    """
    Replace 'Yes' with 1 and 'No' with 0 in the given columns.

    The DataFrame is modified in place; nothing is returned. The match is
    exact, so any value other than 'Yes' or 'No' ends up as NaN.

    df      -- DataFrame whose columns are remapped
    columns -- iterable of column labels to remap
    """
    yes_no_codes = {'Yes': 1, 'No': 0}
    for name in columns:
        df[name] = df[name].map(yes_no_codes)

In [6]:
def convert_custom_date(df, columns):
    """
    Rewrite date strings from dd/mm/yyyy to dd-mm-yyyy.

    The day/month/year order is left as it is; only the '/' separator is
    swapped for '-'. The DataFrame is modified in place and nothing is
    returned. The columns stay string columns.

    df      -- DataFrame whose columns are rewritten
    columns -- iterable of labels of string columns holding the dates
    """
    for name in columns:
        dashed = df[name].str.replace('/', '-', regex=True)
        df[name] = dashed

## Read Data

In [7]:
# Decode the file as ISO-8859-1 - encoding reference: https://stackoverflow.com/a/18172249
# skiprows=1 skips the file's first line (presumably its own header row, since
# column names are supplied here - confirm); nrows=20 loads only 20 rows.
df = pd.read_csv(
    'output/PPR-ALL.csv',
    encoding="ISO-8859-1",
    names=[
        'sales_date',
        'address',
        'postal_code',
        'county',
        'sales_value',
        'not_full_market_price_ind',
        'vat_exclusion_ind',
        'property_desc',
        'property_size_desc',
    ],
    skiprows=1,
    nrows=20,
)

## Preprocessing data

In [43]:
# Drop, in place, the columns that are not used further in this notebook.
df.drop(columns=['address', 'postal_code', 'property_size_desc'], inplace=True)

In [8]:
# Strip every character that is not a digit or a decimal point
# (e.g. the leading currency symbol and the thousands-separator commas).
# The pattern is a raw string: in a plain literal "\d" is an invalid escape
# sequence, which Python warns about (SyntaxWarning on 3.12+).
df['sales_value'] = df['sales_value'].str.replace(r'[^\d.]', '', regex=True)

In [9]:
# Cast the cleaned sales_value strings to float.
convert_dtype_to_float_type(df=df, columns=['sales_value'])

In [10]:
# Encode the two indicator columns as 1/0 (the helper matches 'Yes'/'No' exactly).
map_boolean(df=df, columns=['not_full_market_price_ind', 'vat_exclusion_ind'])

In [11]:
# Rewrite sales_date from dd/mm/yyyy to dd-mm-yyyy.
# The helper only swaps the '/' separator for '-'; day/month order is untouched.
convert_custom_date(df=df, columns=['sales_date'])

In [15]:
# Store county as a pandas category column.
convert_object_to_cat_type(df=df, columns=['county'])

In [38]:
# Derive a 1/0 flag from property_desc: new dwelling -> 1, second-hand -> 0.
# Any description other than the two listed below maps to NaN.
df['new_home_ind'] = df['property_desc'].map(
    {
        'New Dwelling house /Apartment': 1,
        'Second-Hand Dwelling house /Apartment': 0,
    }
)

In [16]:
# Display the dtype of every column to check the conversions above.
# NOTE(review): the captured output below still lists address, postal_code and
# property_size_desc and has no new_home_ind, so it appears to predate the
# In [38] and In [43] cells - re-run to refresh.
df.dtypes

sales_date                     object
address                        object
postal_code                   float64
county                       category
sales_value                   float64
not_full_market_price_ind       int64
vat_exclusion_ind               int64
property_desc                  object
property_size_desc             object
dtype: object