In [1]:
# One record per user; ids are assigned 0..9 in the order the names are listed.
users = [
    {"id": user_id, "name": name}
    for user_id, name in enumerate([
        "Hero", "Dunn", "Sue", "Chi", "Thor",
        "Clive", "Hicks", "Devin", "Kate", "Klein",
    ])
]

In [2]:
# Each pair (i, j) records that the users with ids i and j are friends.
friendship_pairs = [
    (0, 1), (0, 2), (1, 2), (1, 3),
    (2, 3), (3, 4), (4, 5), (5, 6),
    (5, 7), (6, 8), (7, 8), (8, 9),
]

In [3]:
# Give every user id its own (initially empty) list of friend ids.
friendships = dict((user["id"], []) for user in users)
friendships

{0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}

In [4]:
# Populate the friend lists from the friendship pairs.
# The dict is rebuilt here first so that re-running this cell cannot append
# the same friend ids a second time (which would inflate every friend count).
friendships = {user["id"]: [] for user in users}
for i, j in friendship_pairs:
    friendships[i].append(j)  # Add j as a friend of user i
    friendships[j].append(i)  # Add i as a friend of user j

In [5]:
friendships

{0: [1, 2],
 1: [0, 2, 3],
 2: [0, 1, 3],
 3: [1, 2, 4],
 4: [3, 5],
 5: [4, 6, 7],
 6: [5, 8],
 7: [5, 8],
 8: [6, 7, 9],
 9: [8]}

In [6]:
def number_of_friends(user):
    """Return the length of _user_'s friend list in the friendships dict."""
    return len(friendships[user["id"]])

In [7]:
nf = number_of_friends(users[1])
nf

3

In [8]:
friendships[1]

[0, 2, 3]

In [9]:
# Print every valid index into `users`.
# A named loop variable is used because `_` is conventionally a throwaway
# name, and in IPython `_` also holds the most recent cell output.
for user_index in range(len(users)):
    print(user_index)

0
1
2
3
4
5
6
7
8
9


In [10]:
# Add up every user's friend count (expected total: 24).
total_connections = sum(map(number_of_friends, users))
total_connections

24

In [11]:
"""
The Python assert statement checks whether a condition is true. If the condition is false, 
it raises an AssertionError (optionally with a custom error message).
"""
assert total_connections == 24

In [12]:
num_users = len(users)   
num_users

10

In [13]:
avg_connections = total_connections / num_users  # 24 / 10 == 2.4
avg_connections

2.4

In [14]:
assert num_users == 10
assert avg_connections == 2.4

In [15]:
# Create a list (user_id, number_of_friends).
[(user["id"], number_of_friends(user)) for user in users]

[(0, 2),
 (1, 3),
 (2, 3),
 (3, 3),
 (4, 2),
 (5, 3),
 (6, 2),
 (7, 2),
 (8, 3),
 (9, 1)]

In [16]:
num_friends_by_id = [(user["id"], number_of_friends(user)) for user in users]
num_friends_by_id

[(0, 2),
 (1, 3),
 (2, 3),
 (3, 3),
 (4, 2),
 (5, 3),
 (6, 2),
 (7, 2),
 (8, 3),
 (9, 1)]

In [17]:
# Sort the list in place by number of friends, largest to smallest.
# list.sort is stable (also with reverse=True), so entries with the same
# friend count keep the relative order they had before the sort.
num_friends_by_id.sort(key=lambda id_and_friends: id_and_friends[1], reverse=True)
num_friends_by_id

[(1, 3),
 (2, 3),
 (3, 3),
 (5, 3),
 (8, 3),
 (0, 2),
 (4, 2),
 (6, 2),
 (7, 2),
 (9, 1)]

In [18]:
assert num_friends_by_id[0][1] == 3     # several people have 3 friends
assert num_friends_by_id[-1] == (9, 1)  # user 9 has only 1 friend

In [19]:
def foaf_ids_bad(user):
    """Collect the friend ids of each of _user_'s friends.

    "foaf" is short for "friend of a friend". Nothing is filtered out, so the
    result can contain duplicates, _user_'s own id and ids of direct friends.
    """
    collected_ids = []
    for friend_id in friendships[user["id"]]:
        collected_ids.extend(friendships[friend_id])
    return collected_ids

In [20]:
foaf_ids_bad(users[0])

[0, 2, 3, 0, 1, 3]

In [21]:
# foaf for all users
[(user["id"], foaf_ids_bad(user)) for user in users]

[(0, [0, 2, 3, 0, 1, 3]),
 (1, [1, 2, 0, 1, 3, 1, 2, 4]),
 (2, [1, 2, 0, 2, 3, 1, 2, 4]),
 (3, [0, 2, 3, 0, 1, 3, 3, 5]),
 (4, [1, 2, 4, 4, 6, 7]),
 (5, [3, 5, 5, 8, 5, 8]),
 (6, [4, 6, 7, 6, 7, 9]),
 (7, [4, 6, 7, 6, 7, 9]),
 (8, [5, 8, 5, 8, 8]),
 (9, [6, 7, 9])]

In [22]:
assert foaf_ids_bad(users[0]) == [0, 2, 3, 0, 1, 3]

In [23]:
from collections import Counter

In [24]:
def friends_of_friends(user):
    """Count the friends-of-friends of _user_.

    For each of my friends, look at their friends who aren't me and aren't
    already my friends. Returns a Counter mapping each such user id to the
    number of my friends whose friend list contains that id.
    """
    user_id = user["id"]
    my_friend_ids = friendships[user_id]
    # Build the lookup set once: the friend list does not change during the
    # scan, and set membership avoids a linear search for every candidate.
    already_friends = set(my_friend_ids)
    return Counter(
        foaf_id
        for friend_id in my_friend_ids               # For each of my friends,
        for foaf_id in friendships[friend_id]        # find their friends
        if foaf_id != user_id                        # who aren't me
        and foaf_id not in already_friends           # and aren't my friends.
    )

In [25]:
friendships[3]

[1, 2, 4]

In [26]:
friends_of_friends(users[3])

Counter({0: 2, 5: 1})

In [27]:
# (user_id, interest) pairs; a user id appears once for each interest listed
# for that user, and the same interest string can appear under several ids.
interests = [
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
    (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
    (1, "Postgres"), (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
    (2, "numpy"), (2, "statsmodels"), (2, "pandas"), (3, "R"), (3, "Python"),
    (3, "statistics"), (3, "regression"), (3, "probability"),
    (4, "machine learning"), (4, "regression"), (4, "decision trees"),
    (4, "libsvm"), (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
    (5, "Haskell"), (5, "programming languages"), (6, "statistics"),
    (6, "probability"), (6, "mathematics"), (6, "theory"),
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"),
    (7, "neural networks"), (8, "neural networks"), (8, "deep learning"),
    (8, "Big Data"), (8, "artificial intelligence"), (9, "Hadoop"),
    (9, "Java"), (9, "MapReduce"), (9, "Big Data")
]

In [28]:
def data_scientists_who_like(target_interest):
    """Return the ids of the users paired with target_interest in interests.

    Ids come back in the order their pairs appear; matching is by exact
    string equality.
    """
    matching_ids = []
    for user_id, user_interest in interests:
        if user_interest == target_interest:
            matching_ids.append(user_id)
    return matching_ids

In [29]:
data_scientists_who_like("Hadoop")

[0, 9]

In [30]:
data_scientists_who_like("Big Data")

[0, 8, 9]

In [31]:
from collections import defaultdict

A defaultdict behaves like a regular dictionary, but you construct it with a default factory (e.g., int, list, set). When you look up a missing key with `d[key]`, it calls the factory, stores the result under that key, and returns it instead of raising a KeyError. Note that this means merely reading a missing key adds it to the dictionary.

In [32]:
# Keys are interests, values are lists of user_ids with that interest
user_ids_by_interest = defaultdict(list)
user_ids_by_interest

defaultdict(list, {})

In [33]:
# Build the interest -> user ids index.
# It is recreated here first so that re-running this cell does not append
# every user id a second time.
user_ids_by_interest = defaultdict(list)
for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)

In [34]:
user_ids_by_interest

defaultdict(list,
            {'Hadoop': [0, 9],
             'Big Data': [0, 8, 9],
             'HBase': [0, 1],
             'Java': [0, 5, 9],
             'Spark': [0],
             'Storm': [0],
             'Cassandra': [0, 1],
             'NoSQL': [1],
             'MongoDB': [1],
             'Postgres': [1],
             'Python': [2, 3, 5],
             'scikit-learn': [2, 7],
             'scipy': [2],
             'numpy': [2],
             'statsmodels': [2],
             'pandas': [2],
             'R': [3, 5],
             'statistics': [3, 6],
             'regression': [3, 4],
             'probability': [3, 6],
             'machine learning': [4, 7],
             'decision trees': [4],
             'libsvm': [4],
             'C++': [5],
             'Haskell': [5],
             'programming languages': [5],
             'mathematics': [6],
             'theory': [6],
             'Mahout': [7],
             'neural networks': [7, 8],
             'deep learning': 

In [35]:
# Keys are user_ids, values are lists of interests for that user_id.
interests_by_user_id = defaultdict(list)
interests_by_user_id

defaultdict(list, {})

In [36]:
# Build the user id -> interests index.
# It is recreated here first so that re-running this cell does not append
# every interest a second time.
interests_by_user_id = defaultdict(list)
for user_id, interest in interests:
    interests_by_user_id[user_id].append(interest)

In [37]:
interests_by_user_id

defaultdict(list,
            {0: ['Hadoop',
              'Big Data',
              'HBase',
              'Java',
              'Spark',
              'Storm',
              'Cassandra'],
             1: ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres'],
             2: ['Python',
              'scikit-learn',
              'scipy',
              'numpy',
              'statsmodels',
              'pandas'],
             3: ['R', 'Python', 'statistics', 'regression', 'probability'],
             4: ['machine learning', 'regression', 'decision trees', 'libsvm'],
             5: ['Python',
              'R',
              'Java',
              'C++',
              'Haskell',
              'programming languages'],
             6: ['statistics', 'probability', 'mathematics', 'theory'],
             7: ['machine learning',
              'scikit-learn',
              'Mahout',
              'neural networks'],
             8: ['neural networks',
              'deep learning',
       

In [38]:
def most_common_interests_with(user):
    """Count the interests _user_ shares with each other user.

    Returns a Counter mapping every other user's id to the number of
    _user_'s interests that also list that id. A user with no recorded
    interests yields an empty Counter.
    """
    user_id = user["id"]
    # Use .get() rather than indexing: both indexes are defaultdicts, and
    # indexing a missing key would insert an empty list as a side effect of
    # what should be a read-only query.
    return Counter(
        interested_user_id
        for interest in interests_by_user_id.get(user_id, [])
        for interested_user_id in user_ids_by_interest.get(interest, [])
        if interested_user_id != user_id
    )

In [39]:
most_common_interests_with(users[0])

Counter({9: 3, 1: 2, 8: 1, 5: 1})

In [40]:
# lets verify the result
interests_by_user_id[0]

['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']

In [41]:
user_ids_by_interest["Hadoop"]

[0, 9]

In [42]:
user_ids_by_interest["Big Data"]

[0, 8, 9]

In [43]:
user_ids_by_interest["HBase"]

[0, 1]

In [44]:
user_ids_by_interest["Java"]

[0, 5, 9]

In [45]:
user_ids_by_interest["Spark"]

[0]

In [46]:
user_ids_by_interest["Storm"]

[0]

In [47]:
user_ids_by_interest["Cassandra"]

[0, 1]

In [48]:
# (salary, tenure) pairs, one per row.
salaries_and_tenures = [
    (83000, 8.7),
    (88000, 8.1),
    (48000, 0.7),
    (76000, 6),
    (69000, 6.5),
    (76000, 7.5),
    (60000, 2.5),
    (83000, 10),
    (48000, 1.9),
    (63000, 4.2),
]

In [49]:
# Keys are years, values are lists of the salaries for each tenure.
salary_by_tenure = defaultdict(list)
salary_by_tenure

defaultdict(list, {})

In [50]:
# Group the salaries by exact tenure value.
# The mapping is recreated here first so that re-running this cell does not
# append every salary a second time.
salary_by_tenure = defaultdict(list)
for salary, tenure in salaries_and_tenures:
    salary_by_tenure[tenure].append(salary)

In [51]:
salary_by_tenure

defaultdict(list,
            {8.7: [83000],
             8.1: [88000],
             0.7: [48000],
             6: [76000],
             6.5: [69000],
             7.5: [76000],
             2.5: [60000],
             10: [83000],
             1.9: [48000],
             4.2: [63000]})

In [52]:
# Map each tenure (in years) to the mean of the salaries recorded for it.
average_salary_by_tenure = dict(
    (tenure, sum(salaries) / len(salaries))
    for tenure, salaries in salary_by_tenure.items()
)

In [53]:
average_salary_by_tenure

{8.7: 83000.0,
 8.1: 88000.0,
 0.7: 48000.0,
 6: 76000.0,
 6.5: 69000.0,
 7.5: 76000.0,
 2.5: 60000.0,
 10: 83000.0,
 1.9: 48000.0,
 4.2: 63000.0}

In [54]:
def tenure_bucket(tenure):
    """Return the bucket label for a tenure.

    Tenures below 2 are "less than two", tenures below 5 are
    "between two and five", and everything else is "more than five".
    """
    # Exclusive upper bounds, checked in ascending order.
    for upper_bound, label in ((2, "less than two"), (5, "between two and five")):
        if tenure < upper_bound:
            return label
    return "more than five"

In [55]:
# Keys are tenure buckets, values are lists of salaries for that bucket.
salary_by_tenure_bucket = defaultdict(list)

In [56]:
# Group the salaries by tenure bucket.
# The mapping is recreated here first so that re-running this cell does not
# append every salary a second time.
salary_by_tenure_bucket = defaultdict(list)
for salary, tenure in salaries_and_tenures:
    bucket = tenure_bucket(tenure)
    salary_by_tenure_bucket[bucket].append(salary)

In [57]:
salary_by_tenure_bucket

defaultdict(list,
            {'more than five': [83000, 88000, 76000, 69000, 76000, 83000],
             'less than two': [48000, 48000],
             'between two and five': [60000, 63000]})

In [58]:
def predict_paid_or_unpaid(years_experience):
    """Return "unpaid" for 3.0 <= years_experience < 8.5, otherwise "paid"."""
    # Same comparison order as an if/elif chain: the 8.5 check only runs
    # when the value is not below 3.0.
    in_unpaid_band = not years_experience < 3.0 and years_experience < 8.5
    return "unpaid" if in_unpaid_band else "paid"

In [59]:
predict_paid_or_unpaid(2)

'paid'

In [60]:
predict_paid_or_unpaid(6)

'unpaid'

In [61]:
predict_paid_or_unpaid(10)

'paid'

In [62]:
# Count every lowercased, whitespace-separated word across all interests.
# The user id of each pair is not needed for the count.
words_and_counts = Counter(
    word
    for _user_id, interest in interests
    for word in interest.lower().split()
)

In [63]:
words_and_counts

Counter({'big': 3,
         'data': 3,
         'java': 3,
         'python': 3,
         'learning': 3,
         'hadoop': 2,
         'hbase': 2,
         'cassandra': 2,
         'scikit-learn': 2,
         'r': 2,
         'statistics': 2,
         'regression': 2,
         'probability': 2,
         'machine': 2,
         'neural': 2,
         'networks': 2,
         'spark': 1,
         'storm': 1,
         'nosql': 1,
         'mongodb': 1,
         'postgres': 1,
         'scipy': 1,
         'numpy': 1,
         'statsmodels': 1,
         'pandas': 1,
         'decision': 1,
         'trees': 1,
         'libsvm': 1,
         'c++': 1,
         'haskell': 1,
         'programming': 1,
         'languages': 1,
         'mathematics': 1,
         'theory': 1,
         'mahout': 1,
         'deep': 1,
         'artificial': 1,
         'intelligence': 1,
         'mapreduce': 1})

In [64]:
# Print the words that occur more than once, most frequent first.
for word, count in words_and_counts.most_common():
    if count <= 1:
        continue
    print(word, count)

big 3
data 3
java 3
python 3
learning 3
hadoop 2
hbase 2
cassandra 2
scikit-learn 2
r 2
statistics 2
regression 2
probability 2
machine 2
neural 2
networks 2
