In [30]:
# Each user is a dict holding that user's "id" and "name"; the id is the
# user's position in this list.
users = [
    {"id": user_id, "name": name}
    for user_id, name in enumerate([
        "Hero", "Dunn", "Sue", "Chi", "Thor",
        "Clive", "Hicks", "Devin", "Kate", "Klein",
    ])
]

# Friendships are stored as pairs of user ids.
friendships = [
    (0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4),
    (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9),
]

# Give every user an (initially empty) "friends" list.
for user in users:
    user["friends"] = []

# Friendship is mutual, so each pair adds an entry to both users' lists.
# The lists hold references to the user dicts themselves (not copies),
# which makes the structure cyclic.
for i, j in friendships:
    user_i, user_j = users[i], users[j]
    user_i["friends"].append(user_j)
    user_j["friends"].append(user_i)

users

[{'id': 0,
  'name': 'Hero',
  'friends': [{'id': 1,
    'name': 'Dunn',
    'friends': [{...},
     {'id': 2,
      'name': 'Sue',
      'friends': [{...},
       {...},
       {'id': 3,
        'name': 'Chi',
        'friends': [{...},
         {...},
         {'id': 4,
          'name': 'Thor',
          'friends': [{...},
           {'id': 5,
            'name': 'Clive',
            'friends': [{...},
             {'id': 6,
              'name': 'Hicks',
              'friends': [{...},
               {'id': 8,
                'name': 'Kate',
                'friends': [{...},
                 {'id': 7, 'name': 'Devin', 'friends': [{...}, {...}]},
                 {'id': 9, 'name': 'Klein', 'friends': [{...}]}]}]},
             {'id': 7,
              'name': 'Devin',
              'friends': [{...},
               {'id': 8,
                'name': 'Kate',
                'friends': [{'id': 6,
                  'name': 'Hicks',
                  'friends': [{...}, {...}]},
        

In [31]:

def number_of_friends(user):
    """Return how many entries are in the user's "friends" list."""
    friends = user["friends"]
    return len(friends)

# Sum of every user's friend count; each friendship pair is counted twice
# because it appears in both users' lists.
total_connections = sum(map(number_of_friends, users))
total_connections  # should be 24

24

In [32]:
from __future__ import division
# number of users in the network
num_users = len(users)

# average friend count per user; true division (ensured by the __future__ import above) gives a float
avg_connections = total_connections / num_users
avg_connections

2.4

In [33]:
# One (user id, friend count) pair per user, in the order of `users`.
num_friends_by_id = [
    (person["id"], number_of_friends(person))
    for person in users
]

# Most-connected users first; sorted() is stable, so ties keep their original order.
sorted(num_friends_by_id, key=lambda pair: pair[1], reverse=True)

[(1, 3),
 (2, 3),
 (3, 3),
 (5, 3),
 (8, 3),
 (0, 2),
 (4, 2),
 (6, 2),
 (7, 2),
 (9, 1)]

In [34]:
# Naive friend-of-a-friend lookup.
def friends_of_friend_ids_bad(user):
    """Return the ids of every friend of each of the user's friends.

    No filtering is done: the result can contain the user's own id, ids of
    people who are already direct friends, and repeated ids.
    """
    foaf_ids = []
    for friend in user["friends"]:
        for foaf in friend["friends"]:
            foaf_ids.append(foaf["id"])
    return foaf_ids
        
# The naive result includes user 0 itself, direct friends, and duplicates.
user_0_foaf = friends_of_friend_ids_bad(users[0])
print(user_0_foaf) # user 0's friends are 1 and 2; 1's friends are 0, 2, 3; 2's friends are 0, 1, 3

[0, 2, 3, 0, 1, 3]


In [35]:
from collections import Counter

# Two user dicts count as different users when their ids differ.
def not_the_same(user, other_user):
    """Return True when the two users have different ids."""
    user_id, other_id = user["id"], other_user["id"]
    return user_id != other_id

# other_user is "not a friend" when none of user's friends has other_user's id.
def not_friends(user, other_user):
    """Return True when other_user does not appear in user's "friends" list.

    Returns True for a user with no friends at all.
    """
    for friend in user["friends"]:
        if not not_the_same(friend, other_user):
            # found other_user among the friends
            return False
    return True

In [36]:
# Mutual friends: walk through each friend's friends and tally the ones
# who are neither the user nor already one of the user's friends.
def friends_of_friend_ids(user):
    """Return a Counter mapping friend-of-a-friend id -> number of mutual friends.

    The user and the user's direct friends are excluded from the counts.
    """
    mutual_counts = Counter()
    for friend in user["friends"]:
        for foaf in friend["friends"]:
            if not_the_same(user, foaf) and not_friends(user, foaf):
                mutual_counts[foaf["id"]] += 1
    return mutual_counts

# Print the mutual-friend counts for every user, in list order.
# Iterating over `users` directly (instead of a hard-coded range(10)) keeps
# this loop correct if users are added or removed.
# e.g. user 3 has 2 mutual friends with user 0, and 1 with user 5
for user in users:
    print(friends_of_friend_ids(user))

Counter({3: 2})
Counter({4: 1})
Counter({4: 1})
Counter({0: 2, 5: 1})
Counter({1: 1, 2: 1, 6: 1, 7: 1})
Counter({8: 2, 3: 1})
Counter({7: 2, 4: 1, 9: 1})
Counter({6: 2, 4: 1, 9: 1})
Counter({5: 2})
Counter({6: 1, 7: 1})


In [37]:
# (user_id, interest) pairs, grouped by user id.
interests = [
    # user 0
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
    (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    # user 1
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"), (1, "Postgres"),
    # user 2
    (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
    (2, "numpy"), (2, "statsmodels"), (2, "pandas"),
    # user 3
    (3, "R"), (3, "Python"), (3, "statistics"), (3, "regression"), (3, "probability"),
    # user 4
    (4, "machine learning"), (4, "regression"), (4, "decision trees"), (4, "libsvm"),
    # user 5
    (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
    (5, "Haskell"), (5, "programming languages"),
    # user 6
    (6, "statistics"), (6, "probability"), (6, "mathematics"), (6, "theory"),
    # user 7
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"), (7, "neural networks"),
    # user 8
    (8, "neural networks"), (8, "deep learning"), (8, "Big Data"), (8, "artificial intelligence"),
    # user 9
    (9, "Hadoop"), (9, "Java"), (9, "MapReduce"), (9, "Big Data"),
]

In [60]:
def data_scientists_who_like(target_interest, interest_pairs=None):
    """Return the ids of all users who have the given interest.

    Args:
        target_interest: the interest to look for (compared with ==).
        interest_pairs: optional iterable of (user_id, interest) pairs to
            search; defaults to the notebook-level `interests` list.

    Returns:
        A list of user ids, in the order they appear in the pairs; an empty
        list when nobody has the interest.
    """
    if interest_pairs is None:
        interest_pairs = interests
    # Collect every match; returning inside the loop would stop at the first one.
    return [
        user_id
        for user_id, user_interest in interest_pairs
        if user_interest == target_interest
    ]
        
from collections import defaultdict

# Two lookup tables built from the (user_id, interest) pairs:
#   user_ids_by_interest: interest -> list of user ids with that interest
#   interests_by_user_id: user id  -> list of that user's interests
user_ids_by_interest = defaultdict(list)
interests_by_user_id = defaultdict(list)

# A single pass over the pairs fills both indexes.
for user_id, user_interest in interests:
    user_ids_by_interest[user_interest].append(user_id)
    interests_by_user_id[user_id].append(user_interest)

user_ids_by_interest


defaultdict(list,
            {'Hadoop': [0, 9],
             'Big Data': [0, 8, 9],
             'HBase': [0, 1],
             'Java': [0, 5, 9],
             'Spark': [0],
             'Storm': [0],
             'Cassandra': [0, 1],
             'NoSQL': [1],
             'MongoDB': [1],
             'Postgres': [1],
             'Python': [2, 3, 5],
             'scikit-learn': [2, 7],
             'scipy': [2],
             'numpy': [2],
             'statsmodels': [2],
             'pandas': [2],
             'R': [3, 5],
             'statistics': [3, 6],
             'regression': [3, 4],
             'probability': [3, 6],
             'machine learning': [4, 7],
             'decision trees': [4],
             'libsvm': [4],
             'C++': [5],
             'Haskell': [5],
             'programming languages': [5],
             'mathematics': [6],
             'theory': [6],
             'Mahout': [7],
             'neural networks': [7, 8],
             'deep learning': 

In [46]:
# who has the most interests in common?

def most_common(user, interests_by_user=None, users_by_interest=None):
    """Count how many interests every other user shares with `user`.

    Args:
        user: a user dict; only its "id" is read.
        interests_by_user: optional mapping user id -> list of interests;
            defaults to the notebook-level `interests_by_user_id`.
        users_by_interest: optional mapping interest -> list of user ids;
            defaults to the notebook-level `user_ids_by_interest`.

    Returns:
        A Counter mapping other user id -> number of shared interests.
        It is empty when the user has no interests or shares none.
    """
    if interests_by_user is None:
        interests_by_user = interests_by_user_id
    if users_by_interest is None:
        users_by_interest = user_ids_by_interest

    own_id = user["id"]
    # Feed the Counter every matching id across ALL of the user's interests.
    # Building Counter(<single int>) fails because an int is not iterable.
    # .get() avoids inserting empty entries into a defaultdict on a miss.
    return Counter(
        interested_user_id
        for interest in interests_by_user.get(own_id, ())
        for interested_user_id in users_by_interest.get(interest, ())
        if interested_user_id != own_id
    )

In [47]:
# (salary, tenure) pairs, one per data point.
salaries_and_tenures = [
    (83000, 8.7),
    (88000, 8.1),
    (48000, 0.7),
    (76000, 6),
    (69000, 6.5),
    (76000, 7.5),
    (60000, 2.5),
    (83000, 10),
    (48000, 1.9),
    (63000, 4.2),
]

In [54]:
# Group salaries under their exact tenure value: tenure -> [salary, ...]
salary_by_tenure = defaultdict(list)
for salary, tenure in salaries_and_tenures:
    salary_by_tenure[tenure].append(salary)

# Reduce each group to its mean: tenure -> average salary
average_salary_by_tenure = {
    years: sum(group) / len(group)
    for years, group in salary_by_tenure.items()
}

# Display the averages ordered by ascending tenure.
{years: average_salary_by_tenure[years] for years in sorted(average_salary_by_tenure)}

{0.7: 48000.0,
 1.9: 48000.0,
 2.5: 60000.0,
 4.2: 63000.0,
 6: 76000.0,
 6.5: 69000.0,
 7.5: 76000.0,
 8.1: 88000.0,
 8.7: 83000.0,
 10: 83000.0}

In [55]:
# bucket the tenures

def tenure_bucket(tenure):
    """Return the label of the tenure range that `tenure` falls in.

    Below 2 is "less than two"; from 2 up to (but not including) 5 is
    "between two and five"; 5 and above is "more than five".
    """
    if tenure < 2:
        return "less than two"
    if tenure < 5:
        return "between two and five"
    return "more than five"
    
# Group salaries by tenure bucket: bucket label -> [salary, ...]
salary_by_tenure_bucket = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    salary_by_tenure_bucket[tenure_bucket(tenure)].append(salary)

salary_by_tenure_bucket

defaultdict(list,
            {'more than five': [83000, 88000, 76000, 69000, 76000, 83000],
             'less than two': [48000, 48000],
             'between two and five': [60000, 63000]})

In [59]:
# Mean salary per bucket: bucket label -> average salary.
# (The loop variable is named `bucket` so it does not shadow the
# tenure_bucket function inside the comprehension.)
average_salary_by_bucket = {
    bucket: sum(bucket_salaries) / len(bucket_salaries)
    for bucket, bucket_salaries in salary_by_tenure_bucket.items()
}

# Display with the bucket labels in alphabetical order.
{bucket: average_salary_by_bucket[bucket] for bucket in sorted(average_salary_by_bucket)}

{'between two and five': 61500.0,
 'less than two': 48000.0,
 'more than five': 79166.66666666667}