Chapter 1 - Notes

In [11]:
# The ten users in the network; each record holds a numeric id and a name.
users = [
    dict(id=0, name="Hero"),
    dict(id=1, name="Dunn"),
    dict(id=2, name="Sue"),
    dict(id=3, name="Chi"),
    dict(id=4, name="Thor"),
    dict(id=5, name="Clive"),
    dict(id=6, name="Hicks"),
    dict(id=7, name="Devin"),
    dict(id=8, name="Kate"),
    dict(id=9, name="Klein"),
]

# Friendships as pairs of user ids.
friendships = [
    (0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4),
    (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9),
]

To give each user a list of friends, we first add a 'friends' key to every user record, initialised to an empty list.

In [12]:
# Attach a fresh, empty 'friends' list to every user record
# (a new list object per user, so the lists are not shared).
for user in users:
    user.update(friends=[])

users

[{'friends': [], 'id': 0, 'name': 'Hero'},
 {'friends': [], 'id': 1, 'name': 'Dunn'},
 {'friends': [], 'id': 2, 'name': 'Sue'},
 {'friends': [], 'id': 3, 'name': 'Chi'},
 {'friends': [], 'id': 4, 'name': 'Thor'},
 {'friends': [], 'id': 5, 'name': 'Clive'},
 {'friends': [], 'id': 6, 'name': 'Hicks'},
 {'friends': [], 'id': 7, 'name': 'Devin'},
 {'friends': [], 'id': 8, 'name': 'Kate'},
 {'friends': [], 'id': 9, 'name': 'Klein'}]

In [13]:
users[0]  # inspect the first record in the users list

{'friends': [], 'id': 0, 'name': 'Hero'}

In [14]:
users[7]  # inspect the record at index 7

{'friends': [], 'id': 7, 'name': 'Devin'}

In [15]:
users[7]['friends']  # the friends list stored on the record at index 7

[]

To populate each user's 'friends' list, we walk through the friendships pairs and record every friendship on both of the users involved.

In [16]:
# Record each friendship pair (i, j) in both directions.
#
# The guards make this cell safe to re-run: without them a second execution
# appends every friend again and doubles all the friend counts computed later.
# The check uses identity (`is`) rather than `in`, because `in` compares the
# user dicts with ==, and those dicts reference each other through 'friends'.
for i, j in friendships:
    if not any(friend is users[j] for friend in users[i]['friends']):
        users[i]['friends'].append(users[j])  # add j to i's friends list
    if not any(friend is users[i] for friend in users[j]['friends']):
        users[j]['friends'].append(users[i])  # add i to j's friends list

len(users[0]['friends'])  # number of friends of the user at index 0

2

In [17]:
len(users[8]['friends'])  # number of friends Kate (index 8) has

3

To calculate the total number of connections, we sum the length of the 'friends' list over all users.

In [18]:
def number_of_friends(user):
    """Return the number of entries in the given user's 'friends' list."""
    friend_list = user['friends']
    return len(friend_list)

# Add up the friend count of every user. Because each friendship is stored on
# both users involved, every pair contributes 2 to this total.
total_connections = sum(map(number_of_friends, users))
total_connections

24

In [19]:
from __future__ import division  # Python 2: make `/` on two ints do true division
avg_connections = total_connections/len(users)  # mean number of friends per user
avg_connections

2.4

To find the most connected users, we can simply sort the users by number of friends, since the data set is small.

In [20]:
# (user_id, number_of_friends) pairs, in the same order as `users`.
num_friends_by_id = [(user['id'], number_of_friends(user)) for user in users]

# Most connected first. sorted() is stable, so users with equal friend counts
# keep their original relative order.
# The key indexes into the pair instead of unpacking it in the lambda's
# parameter list: tuple parameters are Python 2 only (removed by PEP 3113),
# whereas this form runs unchanged on Python 2 and Python 3.
sorted(num_friends_by_id, key=lambda pair: pair[1], reverse=True)

[(1, 3),
 (2, 3),
 (3, 3),
 (5, 3),
 (8, 3),
 (0, 2),
 (4, 2),
 (6, 2),
 (7, 2),
 (9, 1)]

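To find friends of friends, we count, for every user who is neither the given user nor already one of their friends, how many mutual friends the two share.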

In [21]:
from collections import Counter

def not_the_same(user, other_user):
    """Return True when the two user records carry different ids."""
    same_id = user['id'] == other_user['id']
    return not same_id

def not_friends(user, other_user):
    """Return True when other_user does not appear in user's 'friends' list.

    Friends are matched by id via not_the_same. A user with an empty
    friends list is "not friends" with everyone.
    """
    for friend in user['friends']:
        if not not_the_same(friend, other_user):
            # Found a friend with other_user's id: they are friends.
            return False
    return True

def friends_of_friend_ids(user):
    """Count mutual friends between user and each friend-of-a-friend.

    Walks the friends of every friend of `user`, skipping `user` and anyone
    already in `user`'s own friends list.

    Returns a Counter mapping user id -> number of `user`'s friends who are
    also friends with that id.
    """
    mutual_counts = Counter()
    for friend in user['friends']:          # each of my friends
        for foaf in friend['friends']:      # each of their friends
            # keep only people who are not me and not already my friends
            if not_the_same(user, foaf) and not_friends(user, foaf):
                mutual_counts[foaf['id']] += 1
    return mutual_counts


friends_of_friend_ids(users[3])

Counter({0: 2, 5: 1})

This shows that Chi (id = 3) has two mutual friends with Hero (id = 0) and only one mutual friend with Clive (id = 5).

In [22]:
# (user_id, interest) pairs, one row per user, in the original order.
interests = [
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"), (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"), (1, "Postgres"),
    (2, "Python"), (2, "scikit-learn"), (2, "scipy"), (2, "numpy"), (2, "statsmodels"), (2, "pandas"),
    (3, "R"), (3, "Python"), (3, "statistics"), (3, "regression"), (3, "probability"),
    (4, "machine learning"), (4, "regression"), (4, "decision trees"), (4, "libsvm"),
    (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"), (5, "Haskell"), (5, "programming languages"),
    (6, "statistics"), (6, "probability"), (6, "mathematics"), (6, "theory"),
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"), (7, "neural networks"),
    (8, "neural networks"), (8, "deep learning"), (8, "Big Data"), (8, "artificial intelligence"),
    (9, "Hadoop"), (9, "Java"), (9, "MapReduce"), (9, "Big Data"),
]

To find users with similar interests, we build two indexes: one from each interest to the users who have it, and one from each user to their interests.

In [23]:
from collections import defaultdict

# Two lookup tables built from the (user_id, interest) pairs:
#   user_ids_by_interest : interest -> list of user_ids with that interest
#   interests_by_user_id : user_id  -> list of that user's interests
user_ids_by_interest = defaultdict(list)
interests_by_user_id = defaultdict(list)

# A single pass fills both tables; values are appended in the order the
# pairs appear in `interests`.
for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)
    interests_by_user_id[user_id].append(interest)


In [24]:
user_ids_by_interest.items()  # every (interest, [user_ids]) entry of the index

[('Java', [0, 5, 9]),
 ('neural networks', [7, 8]),
 ('NoSQL', [1]),
 ('Hadoop', [0, 9]),
 ('Mahout', [7]),
 ('Storm', [0]),
 ('regression', [3, 4]),
 ('statistics', [3, 6]),
 ('probability', [3, 6]),
 ('programming languages', [5]),
 ('Python', [2, 3, 5]),
 ('deep learning', [8]),
 ('Haskell', [5]),
 ('mathematics', [6]),
 ('Spark', [0]),
 ('numpy', [2]),
 ('pandas', [2]),
 ('artificial intelligence', [8]),
 ('theory', [6]),
 ('libsvm', [4]),
 ('C++', [5]),
 ('R', [3, 5]),
 ('HBase', [0, 1]),
 ('Postgres', [1]),
 ('decision trees', [4]),
 ('Big Data', [0, 8, 9]),
 ('MongoDB', [1]),
 ('scikit-learn', [2, 7]),
 ('MapReduce', [9]),
 ('machine learning', [4, 7]),
 ('scipy', [2]),
 ('statsmodels', [2]),
 ('Cassandra', [0, 1])]

In [25]:
interests_by_user_id.items()  # every (user_id, [interests]) entry of the index

[(0, ['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']),
 (1, ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']),
 (2, ['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas']),
 (3, ['R', 'Python', 'statistics', 'regression', 'probability']),
 (4, ['machine learning', 'regression', 'decision trees', 'libsvm']),
 (5, ['Python', 'R', 'Java', 'C++', 'Haskell', 'programming languages']),
 (6, ['statistics', 'probability', 'mathematics', 'theory']),
 (7, ['machine learning', 'scikit-learn', 'Mahout', 'neural networks']),
 (8,
  ['neural networks', 'deep learning', 'Big Data', 'artificial intelligence']),
 (9, ['Hadoop', 'Java', 'MapReduce', 'Big Data'])]

In [26]:
# Scratch check: a freshly created defaultdict(list) starts with no keys.
# NOTE(review): `df` is not used by any other cell shown here; presumably safe to remove, confirm.
df = defaultdict(list)
df

defaultdict(list, {})

To find who has the most interests in common with a given user, we count, for every other user, how many interests they share.

In [29]:
def most_common_interests_with(user):
    """Count how many interests every other user shares with `user`.

    Looks up `user`'s interests in interests_by_user_id and, for each one,
    tallies the other users listed under it in user_ids_by_interest.

    Returns a Counter mapping other user id -> number of shared interests;
    `user`'s own id is never included.
    """
    own_id = user['id']
    shared_counts = Counter()
    for interest in interests_by_user_id[own_id]:           # each of my interests
        for other_user_id in user_ids_by_interest[interest]:  # everyone with that interest
            if other_user_id != own_id:                      # skip myself
                shared_counts[other_user_id] += 1
    return shared_counts

most_common_interests_with(users[0])

Counter({1: 2, 5: 1, 8: 1, 9: 3})

In [30]:
most_common_interests_with(users[9])  # shared-interest counts for the user at index 9

Counter({0: 3, 5: 1, 8: 1})

In [31]:
most_common_interests_with(users[8])  # shared-interest counts for the user at index 8

Counter({0: 1, 7: 1, 9: 1})

In [32]:
most_common_interests_with(users[7])  # shared-interest counts for the user at index 7

Counter({2: 1, 4: 1, 8: 1})

In [33]:
# (salary, tenure) pairs, one per row.
# NOTE(review): presumably annual salary and tenure in years; confirm the units.
salaries_and_tenures = [
    (83000, 8.7),
    (88000, 8.1),
    (48000, 0.7),
    (76000, 6),
    (69000, 6.5),
    (76000, 7.5),
    (60000, 2.5),
    (83000, 10),
    (48000, 1.9),
    (63000, 4.2),
]