In [0]:
# Chapter 1 notes and code
# Basic definition of data scientist - someone who extracts insights from messy data.
# We'll be building all the tools from scratch in this book in order to better understand the foundations
# of data science

# Example: finding the "key connectors" in a small network of data scientists.
# Each user is a dict holding a numeric id (its position in the list) and a name.
users = [
    {"id": user_id, "name": name}
    for user_id, name in enumerate(
        ("Hero", "Dunn", "Sue", "Chi", "Thor",
         "Clive", "Hicks", "Devin", "Kate", "Klein")
    )
]

In [0]:
# Who knows whom: each friendship is listed once as a pair of user ids.
friendship_pairs = [
    (0, 1), (0, 2), (1, 2), (1, 3),
    (2, 3), (3, 4), (4, 5), (5, 6),
    (5, 7), (6, 8), (7, 8), (8, 9),
]

In [0]:
# Map every user id to its own (initially empty) list of friend ids.
friendships = dict((user["id"], []) for user in users)

In [0]:
# Populate the friend lists from the friendship pairs.
# The appends below are cumulative, so empty every list first: without this
# reset, re-running the cell would record each friendship again and inflate
# every friend count computed later in the notebook.
for friend_ids in friendships.values():
  friend_ids.clear()

for i, j in friendship_pairs:
  friendships[i].append(j) # Add j as a friend of user i
  friendships[j].append(i) # Add i as a friend of user j

In [0]:
# Let's find the total number of connections
def number_of_friends(user):
  """Return the number of friends recorded for _user_ in `friendships`."""
  # `friendships` is keyed by user id and holds one list of friend ids per user.
  return len(friendships[user["id"]])

# Each friendship is counted once per endpoint: 12 pairs -> 24 connections.
total_connections = sum(map(number_of_friends, users))

In [0]:
# Average number of connections per user
num_users = len(users) # how many users are in the network
avg_connections = total_connections / num_users # 24 / 10 == 2.4

In [0]:
# Rank users from most friends to fewest as (user_id, number_of_friends) pairs.
num_friends_by_id = sorted(
    ((user["id"], number_of_friends(user)) for user in users),
    key=lambda pair: pair[1],  # compare on the friend count only
    reverse=True)              # the sort is stable, so ties stay in id order

In [0]:
# Friends of friends scenario
# Here's a bad way to do that
def foaf_ids_bad(user):
  """Return the ids of the friends of _user_'s friends ("foaf" = friend of a friend).

  Deliberately naive: nothing is filtered or de-duplicated, so the result can
  contain the user's own id, ids of direct friends, and repeated ids.
  """
  foaf_ids = []
  for friend_id in friendships[user["id"]]:
    # Every friend of this friend is added as-is, in the order stored.
    foaf_ids.extend(friendships[friend_id])
  return foaf_ids

In [11]:
# For user 0 the naive result lists user 0 itself twice, user 3 twice, and
# users 1 and 2, who are already direct friends of user 0.
foaf_ids_bad(users[0])

[0, 2, 3, 0, 1, 3]

In [16]:
# Here's a more accurate way to do it
from collections import Counter # not loaded by default

def friends_of_friends(user):
  """Count the mutual friends _user_ shares with each friend of a friend.

  Returns a Counter mapping a user id to the number of _user_'s friends who
  are also friends with that id. The user's own id and the ids of their
  direct friends are left out.
  """
  user_id = user["id"]
  direct_friend_ids = friendships[user_id]
  # Build the exclusion set once instead of re-scanning the friend list for
  # every candidate; membership tests on a set do not depend on its size.
  excluded_ids = set(direct_friend_ids) | {user_id}
  return Counter(
      foaf_id
      for friend_id in direct_friend_ids      # For each of my friends,
      for foaf_id in friendships[friend_id]   # find their friends
      if foaf_id not in excluded_ids          # who aren't me or my friends
  )

# User 3 (Chi) shares two mutual friends with user 0 and one with user 5.
print(friends_of_friends(users[3]))

Counter({0: 2, 5: 1})


In [0]:
# Let's start to connect people with similar interests
# (user_id, interest) pairs, grouped one user per line.
interests = [
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"), (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"), (1, "Postgres"),
    (2, "Python"), (2, "scikit-learn"), (2, "scipy"), (2, "numpy"), (2, "statsmodels"), (2, "pandas"),
    (3, "R"), (3, "Python"), (3, "statistics"), (3, "regression"), (3, "probability"),
    (4, "machine learning"), (4, "regression"), (4, "decision trees"), (4, "libsvm"),
    (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"), (5, "Haskell"), (5, "programming languages"),
    (6, "statistics"), (6, "probability"), (6, "mathematics"), (6, "theory"),
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"), (7, "neural networks"),
    (8, "neural networks"), (8, "deep learning"), (8, "Big Data"), (8, "artificial intelligence"),
    (9, "Hadoop"), (9, "Java"), (9, "MapReduce"), (9, "Big Data"),
]

In [0]:
def data_scientists_who_like(target_interest):
  """Return the ids of all users who like the target interest."""
  matching_ids = []
  # Walk the full list of (user_id, interest) pairs, keeping exact matches.
  for user_id, user_interest in interests:
    if user_interest == target_interest:
      matching_ids.append(user_id)
  return matching_ids

In [0]:
# The previous function scans the whole interests list on every call, so let's build an index from interests to users
from collections import defaultdict

# Two lookup tables, filled in a single pass over `interests`:
#   user_ids_by_interest: interest -> list of user_ids with that interest
#   interests_by_user_id: user_id  -> list of interests for that user_id
user_ids_by_interest = defaultdict(list)
interests_by_user_id = defaultdict(list)

for user_id, interest in interests:
  user_ids_by_interest[interest].append(user_id)
  interests_by_user_id[user_id].append(interest)