## Communication Graph

This notebook is dedicated to exploring and finding patterns in my cell phone bills, which are in PDF format. The ultimate goal is to build a graph from them.

After developing a pattern, I'll make a function or a class to do everything for me.

#### Exploration and Pattern Finding

The first section is just exploration.

In [1]:
# Set up.
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import matplotlib as plt
import numpy as np
import os
import pandas as pd
import seaborn

import PyPDF2

In [12]:
# I'll write a completely different function for handling directories

def dictify_bill(filename, first_page=3, n_columns=6):
    """Parse the usage tables of a PDF phone bill into nested dictionaries.

    Each page's extracted text is split into lines. A table starts at the
    line 'Date and time' (its first column header); the following
    ``n_columns - 1`` lines are the remaining headers, and every later group
    of ``n_columns`` lines is one row. A section ends at the line 'Total:'.

    Parameters
    ----------
    filename : str
        Path of the PDF bill.
    first_page : int, optional
        Zero-based index of the first page to read (default 3, where the
        usage tables have started so far).
    n_columns : int, optional
        Number of columns in every usage table (default 6).

    Returns
    -------
    dict
        ``{section label: {column header: [cell text, ...]}}``. The section
        label is the line two positions before the table header.

    Raises
    ------
    ValueError
        If a page holds more than one table but no 'Total:' line after the
        first header.

    Notes
    -----
    A section is stored only when a page containing both its end and the
    start of the next section is reached, so the rows of the last section in
    the bill are collected but never added to the result.
    NOTE(review): ``PdfFileReader``/``numPages``/``getPage``/``extractText``
    are the legacy PyPDF2 names; confirm the installed version still has them.
    """
    header_text = 'Date and time'
    total_text = 'Total:'

    bill_dict = {}
    # Columns of the section currently being read; it may span several pages.
    section_dict = {}

    # Pages are read from the open stream, so all parsing happens inside the
    # `with` block, which also guarantees the file is closed afterwards.
    with open(filename, 'rb') as pdf_file:
        pdf_bill = PyPDF2.PdfFileReader(pdf_file)
        for page in range(first_page, pdf_bill.numPages):
            text_page = pdf_bill.getPage(page).extractText()
            split_text_page = [line for line in text_page.split('\n') if line != '']

            header_count = split_text_page.count(header_text)
            if header_count == 0:
                # No usage table on this page; nothing to collect.
                continue

            header = split_text_page.index(header_text)
            columns = split_text_page[header:header + n_columns]

            if header_count > 1:
                # The page finishes one section and starts the next one.
                # Look for 'Total:' after the header so an earlier match
                # elsewhere on the page cannot truncate the table.
                end_of_section = split_text_page.index(total_text, header)
                section_label = split_text_page[header - 2]
                finished_section = {}
                for i, column in enumerate(columns):
                    first_value = header + i + n_columns
                    values = split_text_page[first_value:end_of_section:n_columns]
                    # .get() covers a bill whose first parsed page already
                    # holds two tables (nothing accumulated yet).
                    finished_section[column] = section_dict.get(column, []) + values
                bill_dict[section_label] = finished_section

                # The header of the next table sits one line closer to
                # 'Total:' when the line two after 'Total:' is 'Data'.
                # A slice is used so a short page cannot raise IndexError.
                if split_text_page[end_of_section + 2:end_of_section + 3] == ['Data']:
                    start_of_next_section = end_of_section + 4
                else:
                    start_of_next_section = end_of_section + 5
                next_section = split_text_page[start_of_next_section:]
                # Start accumulating the next section from scratch.
                section_dict = {
                    column: next_section[i + n_columns::n_columns]
                    for i, column in enumerate(next_section[:n_columns])
                }
            else:
                # A single table: append its rows to the current section.
                for i, column in enumerate(columns):
                    values = split_text_page[header + i + n_columns::n_columns]
                    section_dict[column] = section_dict.get(column, []) + values

    return bill_dict

In [13]:
# Parse the bill PDF into {section label: {column header: [values]}}.
dicty = dictify_bill('bills/PastBills.pdf')

In [14]:
# Display the whole parsed dictionary (the output is long).
dicty

{'Talk': {'Amount': ['-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-',
   '-'],
  'Date and time': ['07/20/17, 4:36 PM',
   '07/20/17, 8:55 PM',
   '07/20/17, 8:56 PM',
   '07/21/17, 12:41 PM',
   '07/21/17, 6:47 PM',
   '07/22/17, 9:58 AM',
   '07/24/17, 12:21 PM',
   '07/24/17, 6:40 PM',
   '07/25/17, 7:47 PM',
   '07/26/17, 6:11

In [None]:
# Open the bill for step-by-step exploration. The file handle is never closed
# in this notebook; the cells below keep calling `bill.getPage`.
bill = PyPDF2.PdfFileReader(open('bills/PastBills.pdf', 'rb'))

In [None]:
# Total number of pages in the PDF.
bill.numPages

In [None]:
# Page index 3 is the first page that dictify_bill reads.
usage_page_1 = bill.getPage(3)

In [None]:
# Extract the page's text as one string.
text_page_1 = usage_page_1.extractText()

In [None]:
# One list entry per line of the extracted text.
split_text_1 = text_page_1.split('\n')

In [None]:
# Position of the first 'Date and time' entry, i.e. the first table header cell.
# This runs before the blank entry is removed in the next cell.
split_text_1.index('Date and time')

In [None]:
# list.remove drops only the FIRST '' entry and mutates the list in place, so
# re-running this cell removes another blank (or raises ValueError when none
# is left). Later cells loop until no '' remains.
split_text_1.remove('')

In [None]:
# Collect every column of the usage table on the first usage page.
# The header row starts at 'Date and time' and has 6 cells; each later group
# of 6 entries is one table row, so a column is every 6th entry after its
# header cell. The header position is looked up rather than hard-coded, and
# columns are addressed by position so a header name that also occurs earlier
# in the page text cannot be picked up by mistake.
bill_dict = {}
header_index_1 = split_text_1.index('Date and time')
for offset, column in enumerate(split_text_1[header_index_1:header_index_1 + 6]):
    bill_dict[column] = split_text_1[header_index_1 + offset + 6::6]

In [None]:
# Number of values collected under the 'Date and time' column.
len(bill_dict['Date and time'])

In [None]:
# Print the number of values collected for each column, one per line.
for column_values in bill_dict.values():
    print(len(column_values))

In [None]:
# One DataFrame column per table header. pandas requires all value lists to
# have the same length, which the previous cell prints for inspection.
bill_df = pd.DataFrame(bill_dict)

In [None]:
# Prepare page index 4 the same way: extract the text, split it into lines
# and keep only the non-empty entries.
usage_page_2 = bill.getPage(4)
text_page_2 = usage_page_2.extractText()
split_text_page_2 = [entry for entry in text_page_2.split('\n') if entry != '']

In [None]:
# Search for another 'Date and time' header from index 37 onwards, i.e. a
# second table on the same page. The result is relative to the slice start.
split_text_page_2[37::].index('Date and time')

In [None]:
# if the count of 'Date and time' > 1:
# first_header is the index of the first table's header cell.
# second_header, despite its name, is the index of the first 'Total:' entry.
first_header = split_text_page_2.index('Date and time')
second_header = split_text_page_2.index('Total:')

In [None]:
# Both values are displayed because ast_node_interactivity is set to 'all'.
first_header
second_header

In [None]:
# Alias the 'Total:' index under the name used in the slicing cells below.
end_of_section = second_header

`end_of_section` is the index of the `Total:` line, which marks where a call, text, or data section stops.

In [None]:
# Collect the columns of the first table on the page, stopping at 'Total:'.
# Each column is addressed by its offset from the header cell instead of
# searching the whole page for the column name, which could match an
# identical string that appears before the table.
second_dict = {}
for offset, column in enumerate(split_text_page_2[first_header:first_header + 6]):
    second_dict[column] = split_text_page_2[first_header + offset + 6:end_of_section:6]

In [None]:
# The next table's header row is taken to start 5 entries after 'Total:'.
start_of_next_section = end_of_section + 5
next_section = split_text_page_2[start_of_next_section:]
# The first 6 entries are the headers; every 6th entry after a header cell
# belongs to that column. Columns are addressed by position so a repeated
# header name cannot resolve to the wrong column.
new_text_dict = {}
for offset, column in enumerate(next_section[:6]):
    new_text_dict[column] = next_section[offset + 6::6]

In [None]:
# Prepare page index 5: extract the text, split it into lines and keep only
# the non-empty entries.
usage_page_3 = bill.getPage(5)
text_page_3 = usage_page_3.extractText()
split_text_page_3 = [entry for entry in text_page_3.split('\n') if entry != '']

In [None]:
# Number of table headers on this page; more than one means the page holds
# the end of one section and the start of another.
split_text_page_3.count('Date and time')

In [None]:
# Collect the columns of the page's table, from the header to the page end.
# Each column is addressed by its offset from the header cell instead of
# searching the whole page for the column name, which could match an
# identical string that appears before the table.
pivot_index = split_text_page_3.index('Date and time')
text_dict_3 = {}
for offset, column in enumerate(split_text_page_3[pivot_index:pivot_index + 6]):
    text_dict_3[column] = split_text_page_3[pivot_index + offset + 6::6]

In [None]:
# Display the columns collected from page index 5.
text_dict_3