## Imports

In [1]:
import pandas as pd

## Helper methods

In [2]:
def missing_values_counts(df: pd.DataFrame) -> pd.Series:
    """Count the missing entries of every column.

    The result is indexed by column label and holds one count per column.
    """
    null_mask = df.isna()
    return null_mask.sum()

def missing_values_percent(df: pd.DataFrame) -> pd.Series:
    """Compute, per column, the share of missing values as a percentage.

    The result is indexed by the frame's column labels; values lie in 0-100.
    """
    total_rows = len(df)
    null_counts = df.isna().sum()
    share = null_counts / total_rows * 100
    return pd.Series(share, index=df.columns)

def remove_null_columns(df: pd.DataFrame, limit=50, zero_is_null=True) -> pd.DataFrame:
    """Return a copy of ``df`` without its mostly-empty columns.

    A column is dropped when strictly more than ``limit`` percent of its
    values are null.  When ``zero_is_null`` is true, a column is also dropped
    when strictly more than ``limit`` percent of its values equal ``0``.  The
    two shares are checked independently; they are not added together.

    Args:
        df: Frame to filter; it is not modified.
        limit: Percentage threshold, on a 0-100 scale.
        zero_is_null: Whether columns dominated by zeros are dropped as well.

    Returns:
        A new DataFrame holding the surviving columns in their original order.
    """
    # Vectorised per-column shares: the mean of a boolean mask is the fraction
    # of True cells, so no Python-level call per cell is needed.
    too_sparse = df.isnull().mean() * 100 > limit
    if zero_is_null:
        mostly_zero = df.eq(0).mean() * 100 > limit
        too_sparse = too_sparse | mostly_zero

    # Keyword form on purpose: passing the axis positionally to ``drop``
    # is no longer accepted by pandas 2.0 and later.
    return df.drop(columns=too_sparse[too_sparse].index)


## Reading Data

In [3]:
customer_path = "data/customer.csv"
inventory_path = "data/inventory.csv"
invoice_path = "data/invoice.csv"

# Load the three raw tables, in the same order as the paths above.
customer, inventory, invoice = (
    pd.read_csv(csv_path)
    for csv_path in (customer_path, inventory_path, invoice_path)
)

# One (rows, columns) tuple per line.
print(customer.shape, inventory.shape, invoice.shape, sep="\n")

(555338, 9)
(1048575, 10)
(492314, 59)


## Dropping columns with more than 50% null or zero values

In [4]:
# Prune the mostly-empty columns of each table using the helper's defaults.
customer = customer.pipe(remove_null_columns)
inventory = inventory.pipe(remove_null_columns)
invoice = invoice.pipe(remove_null_columns)

# One (rows, columns) tuple per line, to compare against the raw shapes.
print(customer.shape, inventory.shape, invoice.shape, sep="\n")

(555338, 5)
(1048575, 9)
(492314, 29)


## Dropping unnecessary columns

In [5]:
# 'Unnamed: 0' is presumably a row index that was saved into the CSV - verify.
# Keyword form on purpose: passing the axis positionally to ``drop`` is no
# longer accepted by pandas 2.0 and later.
inventory = inventory.drop(columns=['Unnamed: 0'])
invoice = invoice.drop(columns=['Unnamed: 0'])


## Renaming columns to Python-friendly names

In [6]:
# Original CSV header -> snake_case name for the customer table.
columns = {
    'Business Partner': 'business_partner',
    'Customer No.': 'customer_no',
    'Partner Type': 'partner_type',
    'Data Origin': 'data_origin',
    'Title': 'title',
}

customer = customer.rename(columns, axis='columns')
customer.columns

Index(['business_partner', 'customer_no', 'partner_type', 'data_origin',
       'title'],
      dtype='object')

In [7]:
# Original CSV header -> snake_case name for the inventory table.
columns = {
    # Named 'job_card_no' so it lines up with the invoice table's join key.
    'DBM Order': 'job_card_no',
    'Order Item': 'order_item',
    'Material': 'material',
    'Description': 'description',
    'Item Category': 'item_category',
    'Order Quantity': 'order_quantity',
    'Target quantity UoM': 'target_quantity_unit',
    'Net value': 'net_value',
}

inventory = inventory.rename(columns, axis='columns')
inventory.columns

Index(['job_card_no', 'order_item', 'material', 'description', 'item_category',
       'order_quantity', 'target_quantity_unit', 'net_value'],
      dtype='object')

In [8]:
# Original CSV header -> snake_case name for the invoice table.
columns = {
    'Area / Locality': 'locality',
    'CITY': 'city',
    'Cust Type': 'customer_type',
    'Customer No.': 'customer_no',
    'District': 'district',
    'Gate Pass Time': 'gate_pass_time',
    'Invoice Date': 'invoice_date',
    'Invoice No': 'invoice_no',
    'Invoice Time': 'invoice_time',
    'Job Card No': 'job_card_no',
    'JobCard Date': 'job_card_date',
    'JobCard Time': 'job_card_time',
    'KMs Reading': 'km_reading',
    'Labour Total': 'labour_total',
    'Make': 'make',
    'Model': 'model',
    'ODN No.': 'odn_no',
    'Order Type': 'order_type',
    'Parts Total': 'parts_total',
    'Pin code': 'pin_code',
    'Plant': 'plant_code',
    'Plant Name1': 'plant_name',
    'Print Status': 'print_status',
    'Regn No': 'regn_no',
    'Technician Name': 'technician_name',
    'Total Amt Wtd Tax.': 'total_amt',
    'Total Value': 'total_value',
    'User ID': 'user_id',
}

invoice = invoice.rename(columns, axis='columns')
invoice.columns

Index(['locality', 'city', 'customer_type', 'customer_no', 'district',
       'gate_pass_time', 'invoice_date', 'invoice_no', 'invoice_time',
       'job_card_no', 'job_card_date', 'job_card_time', 'km_reading',
       'labour_total', 'make', 'model', 'odn_no', 'order_type', 'parts_total',
       'pin_code', 'plant_code', 'plant_name', 'print_status', 'regn_no',
       'technician_name', 'total_amt', 'total_value', 'user_id'],
      dtype='object')

## Merging customer df with invoice df

In [9]:
# Left join: every customer row is kept; invoice columns are NaN where no invoice matches.
customer_invoice = customer.merge(invoice, on='customer_no', how='left')

In [10]:
# (rows, columns) of the customer/invoice frame after the left join on customer_no.
customer_invoice.shape

(571416, 32)

## Merging customer&invoice df with inventory df

In [11]:
# Left join: every customer/invoice row is kept; inventory columns are NaN where no job card matches.
clean_df = customer_invoice.merge(inventory, on='job_card_no', how='left')

In [12]:
# (rows, columns) of the combined frame after the left join on job_card_no.
clean_df.shape

(593935, 39)

In [14]:

# Percentage of missing values in each column of the merged frame.
missing_values_percent(clean_df)

business_partner         0.000000
customer_no              0.000000
partner_type             0.620270
data_origin              1.555726
title                   13.240506
locality                99.892918
city                    92.887100
customer_type           92.887100
district                92.887100
gate_pass_time          92.887100
invoice_date            92.887100
invoice_no              92.887100
invoice_time            92.887100
job_card_no             92.887100
job_card_date           92.887100
job_card_time           92.887100
km_reading              92.887100
labour_total            92.887100
make                    92.887100
model                   92.892320
odn_no                  93.109515
order_type              92.887100
parts_total             92.887100
pin_code                92.887100
plant_code              92.887100
plant_name              92.887100
print_status            92.887100
regn_no                 92.887100
technician_name         94.852635
total_amt     