## Communication Graph

This notebook is dedicated to exploration and pattern finding in my cell phone bills which are in PDF formats. The ultimate goal is to make a graph out of it.

After developing a pattern, I'll make a function or a class to do everything for me.

#### Exploration and Pattern Finding

The first section is just exploration.

In [1]:
# Set up.

import numpy as np
import os
import pandas as pd
import PyPDF2
import re
import sys

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from src.tmobile_bill_parser import (parse_bill, parse_multiple_bills)
%load_ext autoreload
%autoreload 2

### Introduction

- [x] Make all data the proper datatype.
- [n/a] Separate destination into city and state.
- [x] Numbers must be in roughly the same format.
- [x] Treat Text, Data, and Talk as separate tables or graphs.

In [2]:
kurts_bills = parse_multiple_bills('../bills/Kurt/')

In [3]:
mom_and_dad_bills = parse_multiple_bills('../bills/Mom_and_Dad/')

In [4]:
for key, value in mom_and_dad_bills['aug16-sep16'][0]['Text'].items():
    print(key, len(value))

Date and time 31
Number 31
Destination 31
Direction 31
Type 31
Amount 31


In [5]:
kurt_text_dfs = [pd.DataFrame(bill_period[0]['Text']) for bill_period in kurts_bills.values()]
kurt_data_dfs = [pd.DataFrame(bill_period[0]['Data']) for bill_period in kurts_bills.values()]
kurt_talk_dfs = [pd.DataFrame(bill_period[0]['Talk']) for bill_period in kurts_bills.values()]
dad_text_dfs = [pd.DataFrame(billy[0]['Text']) for billy in mom_and_dad_bills.values()]
dad_data_dfs = [pd.DataFrame(billy[0]['Data']) for billy in mom_and_dad_bills.values()]
dad_talk_dfs = [pd.DataFrame(billy[0]['Talk']) for billy in mom_and_dad_bills.values()]
mom_text_dfs = [pd.DataFrame(billy[1]['Text']) for billy in mom_and_dad_bills.values()]
mom_data_dfs = [pd.DataFrame(billy[1]['Data']) for billy in mom_and_dad_bills.values()]
mom_talk_dfs = [pd.DataFrame(billy[1]['Talk']) for billy in mom_and_dad_bills.values()]
kurt_text_df = pd.concat(kurt_text_dfs).reset_index()
kurt_data_df = pd.concat(kurt_data_dfs).reset_index()
kurt_talk_df = pd.concat(kurt_talk_dfs).reset_index()
dad_text_df = pd.concat(dad_text_dfs).reset_index()
dad_data_df = pd.concat(dad_data_dfs).reset_index()
dad_talk_df = pd.concat(dad_talk_dfs).reset_index()
mom_text_df = pd.concat(mom_text_dfs).reset_index()
mom_data_df = pd.concat(mom_data_dfs).reset_index()
mom_talk_df = pd.concat(mom_talk_dfs).reset_index()

In [6]:
kurt_text_df['Amount'].value_counts()
dad_text_df['Amount'].value_counts()
mom_text_df['Amount'].value_counts()
kurt_text = kurt_text_df.drop(['Amount'], axis=1)
dad_text = dad_text_df.drop(['Amount'], axis=1)
mom_text = mom_text_df.drop(['Amount'], axis=1)

-    13621
Name: Amount, dtype: int64

-    901
Name: Amount, dtype: int64

-    1643
Name: Amount, dtype: int64

In [7]:
kurt_data_df.info()
mom_data_df.info()
dad_data_df.info()
kurt_data = kurt_data_df.drop(['Amount', 'Origin', 'Type', 'Service'], axis=1)
dad_data = dad_data_df.drop(['Amount', 'Origin', 'Type', 'Service'], axis=1)
mom_data = mom_data_df.drop(['Amount', 'Origin', 'Type', 'Service'], axis=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4345 entries, 0 to 4344
Data columns (total 7 columns):
index            4345 non-null int64
Amount           4345 non-null object
Date and time    4345 non-null object
MB               4345 non-null object
Origin           4345 non-null object
Service          4345 non-null object
Type             4345 non-null object
dtypes: int64(1), object(6)
memory usage: 237.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6055 entries, 0 to 6054
Data columns (total 7 columns):
index            6055 non-null int64
Amount           6055 non-null object
Date and time    6055 non-null object
MB               6055 non-null object
Origin           6055 non-null object
Service          6055 non-null object
Type             6055 non-null object
dtypes: int64(1), object(6)
memory usage: 331.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6209 entries, 0 to 6208
Data columns (total 7 columns):
index            6209 non-null int64
Amount     

In [8]:
kurt_talk_df['Amount'].value_counts()
dad_talk_df['Amount'].value_counts()
mom_talk_df['Amount'].value_counts()
kurt_talk = kurt_talk_df.drop(['Amount', 'Type'], axis=1)
dad_talk = dad_talk_df.drop(['Amount', 'Type'], axis=1)
mom_talk = mom_talk_df.drop(['Amount', 'Type'], axis=1)

-    1683
Name: Amount, dtype: int64

-    1828
Name: Amount, dtype: int64

-    5442
Name: Amount, dtype: int64

In [9]:
kurt_talk.head()
kurt_text.head()
kurt_data.head()
dad_talk.head()
dad_text.head()
dad_data.head()
mom_talk.head()
mom_text.head()
mom_data.head()

Unnamed: 0,index,Date and time,Description,Min,Number
0,0,"04/19/16, 10:01 AM",to GRAND PRAR/TX,1,(469) 531-9999
1,1,"04/19/16, 11:53 AM",to AUBURN/AL,7,(334) 728-0615
2,2,"04/19/16, 3:32 PM",to AUBURN/AL,14,(334) 728-0615
3,3,"04/20/16, 6:30 AM",Incoming,7,(334) 728-0615
4,4,"04/20/16, 6:48 AM",to OPELIKA/AL,7,(334) 559-0212


Unnamed: 0,index,Date and time,Destination,Direction,Number,Type
0,0,"04/19/16, 12:14 PM","Auburn, AL",Incoming,(334) 703-1602,Text
1,1,"04/19/16, 12:14 PM","Auburn, AL",Incoming,(334) 703-1602,Text
2,2,"04/19/16, 12:59 PM","Auburn, AL",Outgoing,(334) 703-1602,Text
3,3,"04/19/16, 1:02 PM","Auburn, AL",Incoming,(334) 703-1602,Text
4,4,"04/19/16, 1:22 PM","Auburn, AL",Incoming,(334) 703-1602,Text


Unnamed: 0,index,Date and time,MB
0,0,"04/19/16, 12:15 AM",0.0097
1,1,"04/19/16, 5:15 AM",0.0087
2,2,"04/19/16, 6:20 AM",0.0644
3,3,"04/19/16, 6:22 AM",0.1093
4,4,"04/19/16, 6:26 AM",0.2158


Unnamed: 0,index,Date and time,Description,Min,Number
0,0,"04/14/17, 6:27 PM",Incoming,2,(334) 745-6271
1,1,"04/16/17, 12:02 PM",Incoming,6,(904) 844-1772
2,2,"04/17/17, 1:40 PM",to AUBURN/AL,1,(334) 728-0615
3,3,"04/17/17, 1:53 PM",Incoming,3,(334) 728-0615
4,4,"04/17/17, 4:08 PM",to AUBURN/AL,2,(334) 501-3080


Unnamed: 0,index,Date and time,Destination,Direction,Number,Type
0,0,"04/17/17, 3:36 AM","Columbus, GA",Incoming,(706) 315-2203,Text
1,1,"04/17/17, 3:37 AM","Columbus, GA",Outgoing,(706) 315-2203,Text
2,2,"04/17/17, 3:51 AM","Columbus, GA",Incoming,(706) 615-2869,Text
3,3,"04/17/17, 2:44 PM","New York, NY",Incoming,(646) 760-4418,Text
4,4,"04/18/17, 5:11 AM","Montgomery, AL",Incoming,(334) 721-5239,Text


Unnamed: 0,index,Date and time,MB
0,0,"04/14/17, 1:15 AM",0.33
1,1,"04/14/17, 3:15 AM",0.4755
2,2,"04/14/17, 5:15 AM",1.3935
3,3,"04/14/17, 5:54 AM",0.8281
4,4,"04/14/17, 7:54 AM",0.2216


Unnamed: 0,index,Date and time,Description,Min,Number
0,0,"04/14/17, 6:26 AM",to FERNNDNBCH/FL,5,(904) 844-1772
1,1,"04/14/17, 6:40 AM",Incoming,7,(904) 844-1772
2,2,"04/14/17, 7:59 AM",1-877 #,4,8773631303
3,3,"04/14/17, 8:03 AM",1-877 #,12,8773631303
4,4,"04/14/17, 10:18 AM",Incoming,3,(267) 954-9509


Unnamed: 0,index,Date and time,Destination,Direction,Number,Type
0,0,"04/15/17, 10:11 AM","Fernndnbch, FL",Incoming,(904) 844-1772,Text
1,1,"04/15/17, 4:26 PM","Opelika, AL",Outgoing,(334) 524-9020,Text
2,2,"04/16/17, 9:10 AM","Fernndnbch, FL",Incoming,(904) 844-1772,Text
3,3,"04/17/17, 10:32 AM","Fernndnbch, FL",Outgoing,(904) 844-1772,Text
4,4,"04/17/17, 10:35 AM","Fernndnbch, FL",Incoming,(904) 844-1772,Text


Unnamed: 0,index,Date and time,MB
0,0,"04/14/17, 1:24 AM",0.2373
1,1,"04/14/17, 3:24 AM",0.4091
2,2,"04/14/17, 4:47 AM",0.4199
3,3,"04/14/17, 6:15 AM",0.2832
4,4,"04/14/17, 8:15 AM",0.5019


In [10]:
kurt_data['Date and time'] = pd.to_datetime(kurt_data['Date and time'])
kurt_text['Date and time'] = pd.to_datetime(kurt_text['Date and time'])
kurt_talk['Date and time'] = pd.to_datetime(kurt_talk['Date and time'])
kurt_data['MB'] = pd.to_numeric(kurt_data['MB'])
kurt_talk['Min'] = pd.to_numeric(kurt_talk['Min'])

dad_data['Date and time'] = pd.to_datetime(dad_data['Date and time'])
dad_text['Date and time'] = pd.to_datetime(dad_text['Date and time'])
dad_talk['Date and time'] = pd.to_datetime(dad_talk['Date and time'])
dad_data['MB'] = pd.to_numeric(dad_data['MB'])
dad_talk['Min'] = pd.to_numeric(dad_talk['Min'])

mom_data['Date and time'] = pd.to_datetime(mom_data['Date and time'])
mom_text['Date and time'] = pd.to_datetime(mom_text['Date and time'])
mom_talk['Date and time'] = pd.to_datetime(mom_talk['Date and time'])
mom_data['MB'] = pd.to_numeric(mom_data['MB'])
mom_talk['Min'] = pd.to_numeric(mom_talk['Min'])

I think the Data column is good for a seeing usage over a period of time, maybe seeing if there's a pattern in the my activity over the course of a day, or days of the week I'm more active. Otherwise, I may cache that for later.

I think there's a number of graphs to be made form the Text and Talk sets.

#### Text 

- A graph between me and identifiable phone numbers, outgoing.
- A graph between me and identifiable phone numbers, incoming.
- A graph between me (Seattle) and destinations, though this may not be accurate since the destination seems to be based on the area code of the phone number.
- Activity over a day, week, or month.

#### Talk
- A weighted graph showing calls between phone numbers (people) and time talking.
- A graph between me and identifiable phone numbers, outgoing.
- A graph between me and identifiable phone numbers, incoming.

In [11]:
# data.columns
# for column in data.columns:
#     try:
#         data[column].value_counts()['-']
#     except KeyError:
#         print(0)

- [x] TODO: text['Destination'] contains 629 '-'.  
- [x] TODO: phone numbers need to be normalized.  
- [x] TODO: destination needs to be normalized.

In [12]:
#  Worth noting that area codes will never begin with 1.
phone_number_re = re.compile(r'''1?(-|\s|\.)?(\d{3}|\(\d{3}\))(-|\s|\.)?\d{3}(-|\s|\.)?\d{4}''', re.VERBOSE)
phone_str = r'1?(-|\s|\.)?(\d{3}|\(\d{3}\))(-|\s|\.)?\d{3}(-|\s|\.)?\d{4}'
destination_re = re.compile(r'(\w), (\w)')
destination_str = r'([\w\s]+), (\w+)'

In [13]:
kurt_text_num_bool = kurt_text['Number'].str.match(phone_str)
kurt_text_dest_bool = kurt_text['Destination'].str.match(destination_str)
kurt_talk_num_bool = kurt_talk['Number'].str.match(phone_str)
dad_text_num_bool = dad_text['Number'].str.match(phone_str)
dad_text_dest_bool = dad_text['Destination'].str.match(destination_str)
dad_talk_num_bool = dad_talk['Number'].str.match(phone_str)
mom_text_num_bool = mom_text['Number'].str.match(phone_str)
mom_text_dest_bool = mom_text['Destination'].str.match(destination_str)
mom_talk_num_bool = mom_talk['Number'].str.match(phone_str)

In [14]:
# dest_negatives = text[text_dest_bool == False]['Destination'].value_counts()
# text_negatives = text[text_num_bool == False]['Number'].value_counts()
# talk_negatives = talk[talk_num_bool == False]['Number'].value_counts()

In [15]:
def format_phone(number):
    numb = number.string
    if numb[0] == '1':
        numb = numb[1:]
    return '({}) {}-{}'.format(numb[:3], numb[3:6], numb[6:])

In [16]:
kurt_text_norm_numbers = kurt_text[kurt_text_num_bool == True]['Number'].str.replace(r'\d{10,11}', format_phone)
kurt_talk_norm_numbers = kurt_talk[kurt_talk_num_bool == True]['Number'].str.replace(r'\d{10,11}', format_phone)

dad_text_norm_numbers = dad_text[dad_text_num_bool == True]['Number'].str.replace(r'\d{10,11}', format_phone)
dad_talk_norm_numbers = dad_talk[dad_talk_num_bool == True]['Number'].str.replace(r'\d{10,11}', format_phone)

mom_text_norm_numbers = mom_text[mom_text_num_bool == True]['Number'].str.replace(r'\d{10,11}', format_phone)
mom_talk_norm_numbers = mom_talk[mom_talk_num_bool == True]['Number'].str.replace(r'\d{10,11}', format_phone)

In [17]:
kurt_final_text = kurt_text[kurt_text_num_bool == True]
kurt_final_text = kurt_final_text[kurt_final_text['Destination'].str.match(destination_str) == True]
kurt_final_talk = kurt_talk[kurt_talk_num_bool == True]

mom_final_text = mom_text[mom_text_num_bool == True]
mom_final_text = mom_final_text[mom_final_text['Destination'].str.match(destination_str) == True]
mom_final_talk = mom_talk[mom_talk_num_bool == True]

dad_final_text = dad_text[dad_text_num_bool == True]
dad_final_text = dad_final_text[dad_final_text['Destination'].str.match(destination_str) == True]
dad_final_talk = dad_talk[dad_talk_num_bool == True]

In [18]:
kurt_final_text['Number'] = kurt_final_text['Number'].str.replace(r'\d{10,11}', format_phone)
kurt_final_talk['Number'] = kurt_final_talk['Number'].str.replace(r'\d{10,11}', format_phone)

dad_final_text['Number'] = dad_final_text['Number'].str.replace(r'\d{10,11}', format_phone)
dad_final_talk['Number'] = dad_final_talk['Number'].str.replace(r'\d{10,11}', format_phone)

mom_final_text['Number'] = mom_final_text['Number'].str.replace(r'\d{10,11}', format_phone)
mom_final_talk['Number'] = mom_final_talk['Number'].str.replace(r'\d{10,11}', format_phone)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:5]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:8]


### Initial Analysis

#### Data:
The data information is not suitable for a graph data structure. I think it would be interesting to visualize it using bar or line graphs to measure activity over time. Some questions that could be answered are:

- Which days am I more active?
- Which times throughout the day am I more active?
- What's different about days that there is more data consumption via T-Mobile's data services vice wifi?
- How has my consumption changed over time?

It's worth noting that this doesn't measure my total internet activity. It is merely seeing my data consumption through T-Mobile services. This leads to some additional inquiries:

- Does my phone track data consumption over time?
- Does Comcast track my data consumption at home and is it available to me?

#### Talk:
The are several angles to take with this data set, and much of it will be similar to __Text__. First, the graph data structure. I'd be interested in seeing total incoming and outgoing calls between me and all other nodes over the entire time period available. It could then be split into just incoming calls and just outgoing calls. I could make a dictionary of all known or easily identifiable phone numbers and make the nodes names of contacts rather than numbers. As far as node/edge weights, there's two things to consider: call frequency and call duration. This distinction is important because duration/frequency represent different measures of significance of a given contact. For example, I may have hundreds of calls to Melissa, but we usually only talk briefly to discuss logistical stuff because we are so heavily involved in each other's lives. However, I have much longer calls to my mother because she lives far away (and loves to talk). This may be a visualization problem. 

#### Text:
All the same problems of talk apply to text with exception of call duration. In addition, we know have an opportunity to graph the nodes by location (The phone number's area code really). So we could make the graph just like __Talk__. Thinking about it further, the destination isn't useful at all. Many contacts will have no affiliation (anymore) with the area code from which they first received their phone number. I think the use in the Destination column would be to have other data available by which we could compare people's actual location to the destination logged by T-Mobile.

### Additional things I'd like to do.
Looking at __Talk__, I think it would be smart to do the following:

For the graph:

- We won't care about the date and time.
- Divide the table into incoming and outgoing.
- Group by number, sum the minutes for each number, and add a 'Frequency' (Count) column to each number.

- Make a seperate graph that ignores the 'Description' distinction.

Visualization:

- I don't think it would be productive to get too granular into the time (for now), so we can further split the data weeks and months. 
- Choose the top ten most called numbers and make histograms by week and month (total, outgoing, and incoming, each). 
- Do the same as above except with call duration ('Min').

__Text__:

- Same for __Talk__, but there will be no 'Min' sum.

__Data__:

- See above for guidance.

### Other thoughts:

- I'd like to adapt the weighted graph we made in school for this project. Two issues arise with this: - It inherits from dict, thus I'd rather have it be a composition of a dict so we don't methods available that could screw up the graph. - I need to have a better understanding of graph databases to understand how this would work. 

- This graph is going to be one node(me) with a ton of leaves(my contacts). To really make this a better product, I should think about obtaining my girlfriend's bill as well as my parents. That will really make this more interesting.

In [19]:
#final_talk.head()

In [20]:
#final_text.head()

In [21]:
with open('Contacts.vcf') as f:
    splitLine = f.read().split('END:VCARD')

In [22]:
def split_it(split):
    for line in splitLine:
        split = line.split()
        for word in split:
            finished = ''
            if ';;;' in word:
                name = word.strip(';N:')
            if 'CELL' in word:
                stripped_word = word.split(':')[-1]
                groomed_num = stripped_word.strip('#+')
                if len(groomed_num) == 11:
                    finished = '(' + groomed_num[1:4] + ') ' + groomed_num[4:7] + '-' + groomed_num[7:]
                elif len(groomed_num) == 10:
                    finished = '(' + groomed_num[0:3] + ') ' + groomed_num[3:6] + '-' + groomed_num[6:]
                else:
                    continue
                yield (finished, name)

In [23]:
gen = split_it(splitLine)
contacts_basic = {tup[0]: tup[1] for tup in gen}

In [24]:
for key, value in contacts_basic.items():
    if ';' in value:
        contact = value.split(';')
        contacts_basic[key] = contact[1] + ' ' + contact[0]
        
del contacts_basic['(573) 864-5068']
del contacts_basic['(901) 874-4845']
del contacts_basic['(832) 671-6190']
del contacts_basic['(334) 524-3608']
contacts_basic['(334) 524-9020'] = 'Kurt'

In [25]:
kurt_final_talk.replace({'Number': contacts_basic}, inplace=True)
kurt_final_text.replace({'Number': contacts_basic}, inplace=True)
dad_final_talk.replace({'Number': contacts_basic}, inplace=True)
dad_final_text.replace({'Number': contacts_basic}, inplace=True)
mom_final_talk.replace({'Number': contacts_basic}, inplace=True)
mom_final_text.replace({'Number': contacts_basic}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [generic.py:3855]


#### Graph Details

- Nodes = 'Number': Talk, Text
- Relationships = 'Call' and 'Text' ('Direction' denotes direction in the graph)
- Properties = 
    - We'll add a 'Count' property to both relationships to capture the frequency of a relationship between nodes
    - We'll add a list of call durations for 'Call' relationship between nodes
    - For the more common call recipients, give them proper names rather than numbers and add the appropriate relationship between them (Melissa -> girlfriend -> Kurt)



In [26]:
def unique_and_concat(talk, text):
    concated = np.concatenate([talk, text])
    concated = np.unique(concated)
    return concated      

In [27]:
kurt_talk_array = kurt_final_talk['Number'].values
kurt_text_array = kurt_final_text['Number'].values
mom_talk_array = mom_final_talk['Number'].values
mom_text_array = mom_final_text['Number'].values
dad_talk_array = dad_final_talk['Number'].values
dad_text_array = dad_final_text['Number'].values

kurt_unique_numbers = unique_and_concat(kurt_talk_array, kurt_text_array)
mom_unique_numbers = unique_and_concat(mom_talk_array, mom_text_array)
dad_unique_numbers = unique_and_concat(dad_talk_array, dad_text_array)

concated_nums = np.unique(np.concatenate([kurt_unique_numbers, mom_unique_numbers, dad_unique_numbers]))

In [31]:
from src.labeled_property_graph import LabeledPropertyGraph
from collections import namedtuple

def add_nodes(array, lpg):
    """
    Adds a numpy array containing objects intended to be nodes in the graph.
    """
    for something in array:
        lpg.add_node(something)
    
    # Need to add functionality to add labels to each node.


def text_direction_generator(array):
    """
    Makes a generator that yields namedtuples for adding nodes to the graph.
    """
    Texts = namedtuple('Texts', ('number', 'direction'))
    for node, direction in array:
        yield Texts(node, direction)


def talk_direction_generator(array):
    """
    Makes a generator that yields namedtuples for adding nodes to the lpg.
    """
    Talk = namedtuple('Talk', ('number', 'direction', 'duration'))
    for node, direction, duration in array:
        yield Talk(node, direction, duration)
        

def add_text_relationships(text_gen, lpg, name):
    """
    Checks if a text relationship exists between Kurt and the number,
    if it does, increments the count property on the relationship. 
    If it doesn't, adds the relationship in the given direction and
    creates the count property on the relationship.
    
    The way I'm having to do this suggests refactoring the lpg class.
    """
    for relationship in text_gen:
        if relationship.direction == 'Incoming':
            arguments = ['Text', relationship.number, name]
        else:
            arguments = ['Text', name, relationship.number]
        try:
            lpg.add_relationship(*arguments)
        except ValueError:
            try:
                count = lpg.get_relationship_properties(*arguments)
                count['Count'] += 1
            except KeyError:
                lpg.add_rel_props(*arguments, Count=1)

def add_talk_relationships(talk_gen, lpg, name):
    """
    Adds a 'Talk' relationship between nodes.
    If the relationships already exists, increments the count property on the relationship.
    If the count property doesn't exist, creates it.
    Adds the duration property to each relationship, or adds the duration to the list
    on each property.
    """
    for relationship in talk_gen:
        if relationship.direction == 'Incoming':
            arguments = ['Talk', relationship.number, name]
        else:
            arguments = ['Talk', name, relationship.number]
        try:
            lpg.add_relationship(*arguments)
        except ValueError:
            try:
                count = lpg.get_relationship_properties(*arguments)
                count['Count'] += 1
                count['Duration'].append(relationship.duration)
            except KeyError:
                lpg.add_rel_props(*arguments, Count=1, Duration=[relationship.duration])

In [34]:
phone_lpg = LabeledPropertyGraph()

In [35]:
add_nodes(concated_nums, phone_lpg)

In [36]:
kurt_text_array_two = kurt_final_text[['Number', 'Direction']].values
kurt_talk_array_two = kurt_final_talk[['Number', 'Description', 'Min']].values
dad_text_array_two = dad_final_text[['Number', 'Direction']].values
dad_talk_array_two = dad_final_talk[['Number', 'Description', 'Min']].values
mom_text_array_two = mom_final_text[['Number', 'Direction']].values
mom_talk_array_two = mom_final_talk[['Number', 'Description', 'Min']].values

In [37]:
kurt_text_gen = text_direction_generator(kurt_text_array_two)
kurt_talk_gen = talk_direction_generator(kurt_talk_array_two)
dad_text_gen = text_direction_generator(dad_text_array_two)
dad_talk_gen = talk_direction_generator(dad_talk_array_two)
mom_text_gen = text_direction_generator(mom_text_array_two)
mom_talk_gen = talk_direction_generator(mom_talk_array_two)

In [38]:
add_text_relationships(kurt_text_gen, phone_lpg, 'Kurt')
add_text_relationships(dad_text_gen, phone_lpg, 'Dad')
add_text_relationships(mom_text_gen, phone_lpg, 'Mom')

In [39]:
add_talk_relationships(kurt_talk_gen, phone_lpg, 'Kurt')
add_talk_relationships(dad_talk_gen, phone_lpg, 'Dad')
add_talk_relationships(mom_talk_gen, phone_lpg, 'Mom')

Note: the file was improperly formatted so I was unable to get past an error using two different vcf readers. I finally settled on parsing them manually.

Things to do:

- Generate colors for each node depending on its value (if it's not a name, make it grey)
- Generate a size for each node depending on the number of neighbors it has (in this case, my node will be super huge)
- Determine how to add multiple and directed links between nodes in D3.js

In [40]:
def get_nodes_and_rels():
    for node in phone_lpg.nodes():
        #get_neighbors
        #get_relationships
        #get_node_properties
        #get_relationship_properties
        neighbors = phone_lpg.get_neighbors(node)
        neighb_rels = []
        for neighbor in neighbors:
            relationships = phone_lpg.get_relationships(node, neighbor)
            neighb_rels.append(relationships)
        node_props = phone_lpg.get_node_properties(node)
        relationship_props = {}
        neighb_tot = {}
        neighbor_and_rels = zip(neighbors, neighb_rels)
        for neighb, rels in neighbor_and_rels:
            neighb_tot[neighb] = {}
            for rel in rels:
                neighb_tot[neighb][rel] = phone_lpg.get_relationship_properties(rel, node, neighb)
        yield node, neighb_tot

In [54]:
def _node_color(node):
    if node.startswith('('):
        return 'grey'
    elif node == 'Kurt':
        return 'blue'
    else:
        return 'green'

def _node_distance(node, other):
    if node.startswith('(') or other.startswith('('):
        return 20
    else:
        return 20

def create_json_ready_thing(gen):
    genny = gen()
    ultimate_dict = {"nodes": [], "links": {"Text": [], "Talk": []}}
    for node, dicty in genny:
        num_neighbs = len(dicty)
        num_neighbs = num_neighbs if num_neighbs > 0 else 1
        radius = np.log2(num_neighbs) * 2
        radius = radius if radius > 2 else 2
        color = _node_color(node)
        node_dict = {"id": node, "color": color, "radius": radius}
        ultimate_dict["nodes"].append(node_dict)
        
        for key, value in dicty.items():
            target = key
            attr = _node_distance(node, key)
            if 'Talk' in value and 'Text' in value:
                name_text = 'Text'
                name_talk = 'Talk'
                text_prop = value['Text']
                talk_prop = value['Talk']
                text_dict = {"name": name_text, "source": node, "target": target, "value": attr, "properties": text_prop}
                ultimate_dict["links"]["Text"].append(text_dict)
                talk_dict = {"name": name_talk, "source": node, "target": target, "value": attr, "properties": talk_prop}
                ultimate_dict["links"]["Talk"].append(talk_dict)
            else:
                try:
                    text_prop = value['Text']
                    name_text = 'Text'
                    text_dict = {"name": name_text, "source": node, "target": target, "value": attr, "properties": text_prop}
                    ultimate_dict["links"]["Text"].append(text_dict)
                except KeyError:
                    talk_prop = value['Talk']
                    name_talk = 'Talk'
                    talk_dict = {"name": name_talk, "source": node, "target": target, "value": attr, "properties": talk_prop}
                    ultimate_dict["links"]["Talk"].append(talk_dict)
            
    return ultimate_dict

In [55]:
ready = create_json_ready_thing(get_nodes_and_rels)

In [56]:
# graph_stuff = {"nodes": [
#     {"id": "Kurt", "color": "blue", "radius": 15},
#     {"id": "Melissa", "color": "purple", "radius": 5},
#     {"id": "Megan", "color": "green", "radius": 5},
#     {"id": "Suman", "color": "orange", "radius": 5},
#     {"id": "Nina", "color": "pink", "radius": 5}
# ],
# "links": {
#     "Talk": [
#  {"Name": "Talk, "source": "Kurt", "target": "Melissa", "value": "black", "properties": [{"Count": 123415, "Duration": [15, 32]]},
#  {"Name": "Talk, "source": "Melissa", "target": "Megan", "value": "black"},
#  {"Name": "Talk, "source": "Megan", "target": "Suman", "value": "black"},
#  {"Name": "Talk, "source": "Melissa", "target": "Suman", "value": "black"},
#  {"Name": "Talk, "source": "Melissa", "target": "Nina", "value": "black"}
# ],
#    "Text": [
#  {"source": "Kurt", "target": "Melissa", "value": "black", "properties": {"Count": 15321}},
#  {"source": "Melissa", "target": "Megan", "value": "black"},
#  {"source": "Megan", "target": "Suman", "value": "black"},
#  {"source": "Melissa", "target": "Suman", "value": "black"},
#  {"source": "Melissa", "target": "Nina", "value": "black"}
# ]}

In [57]:
import json
with open('../js/mega_phone_graph.json', 'w') as f:
    json.dump(ready, f)