The input below initializes the `users` and `friendships` variables, then processes them so that each user's friendships are stored in a `friends` list on that user's dictionary.

In [11]:
# Roster of data scientists; each user's "id" matches its position in the list.
users = [
    {"id": 0, "name": "Hero"},
    {"id": 1, "name": "Dunn"},
    {"id": 2, "name": "Sue"},
    {"id": 3, "name": "Chi"},
    {"id": 4, "name": "Thor"},
    {"id": 5, "name": "Clive"},
    {"id": 6, "name": "Hicks"},
    {"id": 7, "name": "Devin"},
    {"id": 8, "name": "Kate"},
    {"id": 9, "name": "Klein"},
]

# Friendship edges, each given as a pair of user ids.
friendships = [
    (0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4),
    (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9),
]

# Every user starts out with an empty friend list.
for person in users:
    person["friends"] = []

# Record each friendship on both endpoints. The ids are used directly as
# indexes into `users`, and the friend lists hold the user dicts themselves.
for left_id, right_id in friendships:
    left, right = users[left_id], users[right_id]
    left["friends"].append(right)
    right["friends"].append(left)


The input below defines a function that returns the number of friends a user has, then uses it to compute the total number of connections, the average number of connections per user, and the number of friends for each user ID, as shown in the output.

In [60]:
def number_of_friends(user):
    """Return the length of the user's "friends" list."""
    friend_list = user["friends"]
    return len(friend_list)

# Python 2 needs this for `/` to mean true division; it is a no-op on Python 3.
# It is placed ahead of the statements below so the division never runs
# without it.
from __future__ import division

# Sum of every user's friend count.
total_connections = sum(number_of_friends(user) for user in users)

num_users = len(users)
avg_connections = total_connections / num_users
avg_connections_str = "average connections = " + str(avg_connections)

# (user id, friend count) pairs, ordered by ascending user id.
num_friends_by_id = [(user["id"], number_of_friends(user)) for user in users]
output = sorted(num_friends_by_id, key=lambda id_and_count: id_and_count[0])

result = [avg_connections_str,
          "number of friends by id = " + str(output)]
result

average connections = 2.4


['average connections = 2.4',
 'number of friends by id = [(0, 2), (1, 3), (2, 3), (3, 3), (4, 2), (5, 3), (6, 2), (7, 2), (8, 3), (9, 1)]']

The input below defines a function that finds the friends of a user's friends. Here we run it on `users[0]`.

In [22]:
def friends_of_friends_ids_bad(user):
    """Collect the ids of everyone who is a friend of one of user's friends.

    Nothing is filtered out, so the result can repeat ids and can include the
    user's own id as well as ids of people who are already direct friends.
    """
    foaf_ids = []
    for friend in user["friends"]:
        for foaf in friend["friends"]:
            foaf_ids.append(foaf["id"])
    return foaf_ids

# Demo on user 0: the raw result repeats ids and includes user 0 itself.
friends_of_friends_ids_bad(users[0])

[0, 2, 3, 0, 1, 3]

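The input below lists the friends of users 0, 1, and 2. It shows why the output above is flawed: user 0 appears in its own result because it is a friend of both of its friends, user 3 is listed twice because it is reachable through two different friends, and users 1 and 2 are included even though they are already direct friends. We'll have to refactor the function.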

In [65]:
# Friend ids of users 0, 1 and 2, one inner list per user.
result = [[friend["id"] for friend in users[index]["friends"]]
          for index in range(3)]
result

[[1, 2], [0, 2, 3], [0, 1, 3]]

The input below replaces `friends_of_friends_ids_bad()` with `friends_of_friend_ids()`, which counts mutual friends while excluding the user and the user's existing friends.

In [66]:
from collections import Counter

def not_the_same(user, other_user):
    """Return True when the two users carry different "id" values."""
    first_id, second_id = user["id"], other_user["id"]
    return first_id != second_id

def not_friends(user, other_user):
    """Return True when other_user is not one of user's friends.

    other_user is not a friend if he's not in user["friends"]; that is, if
    he's not_the_same as every person in user["friends"]. A user with no
    friends therefore yields True for anyone.
    """
    for friend in user["friends"]:
        if not not_the_same(friend, other_user):
            # Found a friend with the same id: they are friends.
            return False
    return True

def friends_of_friend_ids(user):
    """Count friends-of-friends by id, skipping user and user's own friends.

    Returns a Counter mapping each remaining friend-of-a-friend id to the
    number of times it was reached, i.e. once per friend of `user` who has
    that person in their own friend list.
    """
    mutual_counts = Counter()
    for friend in user["friends"]:
        for foaf in friend["friends"]:
            if not_the_same(user, foaf) and not_friends(user, foaf):
                mutual_counts[foaf["id"]] += 1
    return mutual_counts

# Demo on user 3: ids of non-friends mapped to the number of shared friends.
friends_of_friend_ids(users[3])

Counter({0: 2, 5: 1})

Effectively this means that `users[3]` has two mutual friends with `users[0]` and one mutual friend with `users[5]`.

Below, we'll expand by initializing the interests var.

In [37]:
# (user_id, interest) pairs, grouped here one user per row.
interests = [
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"), (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"), (1, "Postgres"),
    (2, "Python"), (2, "scikit-learn"), (2, "scipy"), (2, "numpy"), (2, "statsmodels"), (2, "pandas"),
    (3, "R"), (3, "Python"), (3, "statistics"), (3, "regression"), (3, "probability"),
    (4, "machine learning"), (4, "regression"), (4, "decision tree"), (4, "libsvm"),
    (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"), (5, "Haskell"), (5, "programming languages"),
    (6, "statistics"), (6, "probability"), (6, "mathematics"), (6, "theory"),
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"), (7, "neural networks"),
    (8, "neural networks"), (8, "deep learning"), (8, "Big Data"), (8, "artificial intelligence"),
    (9, "Hadoop"), (9, "Java"), (9, "Mapreduce"), (9, "Big Data"),
]


And then we'll create the `data_scientists_who_like()` function.

In [38]:
def data_scientists_who_like(target_interest, user_interests=None):
    """Return the ids of users who list target_interest, in data order.

    target_interest -- interest string to look for; the comparison is an
                       exact, case-sensitive equality test.
    user_interests  -- optional iterable of (user_id, interest) pairs to
                       search. When omitted, the module-level ``interests``
                       list is used, which is the original behavior.
    """
    if user_interests is None:
        user_interests = interests
    return [user_id for user_id, user_interest in user_interests
            if user_interest == target_interest]

# Demo: ids of the users whose interests include "Big Data".
data_scientists_who_like("Big Data")

[0, 8, 9]

This scans the whole `interests` list on every call, so to get more meaningful data we build a couple of new dicts: `user_ids_by_interest` and `interests_by_user_id`.

In [42]:
from collections import defaultdict

# Two lookup tables built from the (user_id, interest) pairs:
#   user_ids_by_interest: interest -> ids of the users who list it
#   interests_by_user_id: user id  -> that user's interests
user_ids_by_interest = defaultdict(list)
interests_by_user_id = defaultdict(list)

for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)
    interests_by_user_id[user_id].append(interest)
    
def most_common_interests_with(user):
    """Count how many interests every other user shares with `user`.

    Looks up the user's interests in interests_by_user_id, then for each one
    walks user_ids_by_interest and tallies every id other than the user's
    own. Returns a Counter mapping other user ids to shared-interest counts.
    """
    own_id = user["id"]
    shared = Counter()
    for interest in interests_by_user_id[own_id]:
        for interested_user_id in user_ids_by_interest[interest]:
            if interested_user_id != own_id:
                shared[interested_user_id] += 1
    return shared

# Demo on user 4: other user ids mapped to the number of interests in common.
most_common_interests_with(users[4])

Counter({3: 1, 7: 1})

Next, we'll add information on salaries and experience.

In [44]:
# (salary, tenure) pairs, one per row.
salaries_and_tenures = [
    (83000, 8.7),
    (88000, 8.1),
    (48000, 0.7),
    (76000, 6),
    (69000, 6.5),
    (76000, 7.5),
    (60000, 2.5),
    (83000, 10),
    (48000, 1.9),
    (63000, 4.2),
]

The input below builds `salary_by_tenure`, a dictionary that maps each tenure to the list of salaries recorded for it. A dict comprehension then computes the average salary for each tenure.

In [67]:
# Collect the salaries recorded for each exact tenure value.
salary_by_tenure = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    recorded = salary_by_tenure[tenure]
    recorded.append(salary)

# Mean salary for each tenure value.
average_salary_by_tenure = dict(
    (years, sum(pay_list) / len(pay_list))
    for years, pay_list in salary_by_tenure.items()
)

average_salary_by_tenure

{0.7: 48000.0,
 1.9: 48000.0,
 2.5: 60000.0,
 4.2: 63000.0,
 6: 76000.0,
 6.5: 69000.0,
 7.5: 76000.0,
 8.1: 88000.0,
 8.7: 83000.0,
 10: 83000.0}

Obviously, this isn't that meaningful, so we'll need to massage the tenure values into buckets.

In [68]:
def tenure_bucket(tenure):
    """Map a tenure to one of three coarse bucket labels.

    Values below 2 give "less than two", values from 2 up to (but not
    including) 5 give "between two and five", and everything else, including
    exactly 5, gives "more than five".
    """
    upper_bounds = ((2, "less than two"), (5, "between two and five"))
    for upper_bound, label in upper_bounds:
        if tenure < upper_bound:
            return label
    return "more than five"
    
# Group salaries by coarse tenure bucket instead of by exact tenure value.
salary_by_tenure_bucket = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    bucket = tenure_bucket(tenure)
    salary_by_tenure_bucket[bucket].append(salary)

# Mean salary per bucket. The comprehension variable is named `bucket_label`
# so that it does not shadow the tenure_bucket() function used above.
average_salary_by_bucket = {
    bucket_label: sum(salaries) / len(salaries)
    for bucket_label, salaries in salary_by_tenure_bucket.items()
}

average_salary_by_bucket

{'between two and five': 61500.0,
 'less than two': 48000.0,
 'more than five': 79166.66666666667}

The input below creates a function that predicts, from years of experience, whether or not a user has a paid account. This is a very rudimentary device, and I think it was designed only to give a high-level overview of what the book will teach you.

In [69]:
def predict_paid_or_unpaid(years_experience):
    """Return "paid" or "unpaid" based only on years of experience.

    Values from 3.0 up to (but not including) 8.5 give "unpaid"; anything
    below 3.0 or at/above 8.5 gives "paid".
    """
    in_middle_band = 3.0 <= years_experience < 8.5
    return "unpaid" if in_middle_band else "paid"

# Each entry is (salary, tenure). The predictor expects years of experience,
# so pass the tenure (index 1) rather than the salary (index 0).
predict_paid_or_unpaid(salaries_and_tenures[0][1])

'paid'

Next, we'll count how often each word appears across all interests and list the words that occur more than once.

In [76]:
# Tally every lower-cased, whitespace-separated word across all interests.
words_and_counts = Counter(
    word
    for _user_id, interest in interests
    for word in interest.lower().split()
)

# Keep only the words seen more than once, in most_common() order.
result = [[word, count]
          for word, count in words_and_counts.most_common()
          if count > 1]

result
        


[['big', 3],
 ['data', 3],
 ['java', 3],
 ['python', 3],
 ['learning', 3],
 ['hadoop', 2],
 ['hbase', 2],
 ['cassandra', 2],
 ['scikit-learn', 2],
 ['r', 2],
 ['statistics', 2],
 ['regression', 2],
 ['probability', 2],
 ['machine', 2],
 ['neural', 2],
 ['networks', 2]]