## Communication Graph

This notebook is dedicated to exploration and pattern finding in my cell phone bills which are in PDF formats. The ultimate goal is to make a graph out of it.

After developing a pattern, I'll make a function or a class to do everything for me.

#### Exploration and Pattern Finding

The first section is just exploration.

In [6]:
# Set up.
%matplotlib inline
%pdb on

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import matplotlib as plt
import numpy as np
import os
import pandas as pd
import seaborn

#from tmobile_bill_parser import dictify_bill

import PyPDF2

Automatic pdb calling has been turned ON


In [3]:
"""Module for parsing tmobile bill PDFs."""


def parse_bill(filename):
    """Master function to parse T-Mobile PDFs. Going to be broken up."""

    # ====== init ======
    pdf_bill = PyPDF2.PdfFileReader(open(filename, 'rb'))
    # So far, the relevant information starts on the 3rd (index) page
    bill_dict = {}
    section_dict = {}
    # ====== init ======
    
    for page in range(3, pdf_bill.numPages):

    # ======= Method for preparation of the text. ========
        raw_page = pdf_bill.getPage(page)
        text_page = raw_page.extractText()
        split_text_page = text_page.split('\n')
        while '' in split_text_page:
            split_text_page.remove('')
    # ====================================================

        if 'Total:' in split_text_page: # main function
    
    # ======= Method to handle discontinuous records =======
            # Either the end of the pdf or switching to a new section
            header = split_text_page.index('Date and time')
            end_of_section = split_text_page.index('Total:')
            second_dict = {}
            section_label = split_text_page[header - 2]
            for i, column in enumerate(split_text_page[header:header + 6]):
                column_index = header + i
                second_dict[column] = split_text_page[column_index + 6:end_of_section:6]
            bill_dict[section_label] = {key: section_dict.get(key, []) + second_dict[key] for key in second_dict.keys()}
            if end_of_section + 2 == 'Data':
                start_of_next_section = end_of_section + 4
            else:
                start_of_next_section = end_of_section + 5
            next_section = split_text_page[start_of_next_section::]
            section_dict = {}
            second_dict = {}
            for column in next_section[:6]:
                column_index = next_section.index(column)
                section_dict[column] = next_section[column_index + 6::6]
    # =======================================================

        else:
    # ======= Method to handle continuous list of records =======
            pivot_index = split_text_page.index('Date and time')
            for i, column in enumerate(split_text_page[pivot_index:pivot_index + 6]):
                column_index = pivot_index + i
                values = split_text_page[column_index + 6::6]
                if column in section_dict:
                    section_dict[column] = section_dict[column] + values
                else:
                    section_dict[column] = values
    # ===============================================================

    return bill_dict

In [4]:
def parse_multiple_bills(directory):
    """Takes a list of filenames or a directory and returns several bills."""
    # Input validation -- check if anything other than a list of files or directory name.
    list_of_bills = os.listdir(directory)
    bill_directory = {}
    for bill in list_of_bills:
        path = 'bills/' + bill
        bill_as_key = bill[:-4]
        bill_directory[bill_as_key] = parse_bill(path)
    
    return bill_directory
    
    

In [1]:
# from tmobile_bill_parser import parse_bill

# big_dict = parse_bill('bills/PastBills.pdf')

In [2]:
big_dict

{'Data': {'Amount': ['-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-'],
  'Date and time': ['08/18/17, 5:13 PM', '08/18/17, 11:17 PM'],
  'MB': ['0.0097',
   '0.0097',
   '0.0185',
   '6.2626',
   '0.0263',
   '20.3457',
   '0.6552',
   '24.3515',
   '0.0527',
   '0.0097',
   '0.0097',
   '0.0097',
   '0.0185',
   '0.2890',
   '0.2900',
   '0.0097',
   '0.0722',
   '2.0566',
   '2.2333',
   '2.9843',
   '14.4062',
   '0.0097',
   '0.0097',
   '1.0556',
   '0.0468',
   '0.5644',
   '8.9843',
   '0.4785',
   '0.1494',
   '0.0029',
   '32.6210',
   '4.6445',
   '0.0087',
   '0.2441',
   '6.3115',
   '0.4111',
   '0.3232',
   '0.0097',
   '0.0097'],
  'Origin': ['-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-

In [7]:
big_dict = parse_bill('bills/PastBills.pdf')

In [8]:
big_dict

{'Data': {'Amount': ['-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-'