# Chapter 1

## Finding key connectors

In [1]:
# One dict per user; each user's 'id' is also its position in this list.
users = [{'id': user_id, 'name': name}
         for user_id, name in enumerate(
             ['Hero', 'Dunn', 'Sue', 'Chi', 'Thor',
              'Clive', 'Hicks', 'Devin', 'Kate', 'Klein'])]

In [2]:
# Each (i, j) pair is a friendship between the users with ids i and j.
friendships = [
    (0, 1), (0, 2),
    (1, 2), (1, 3),
    (2, 3),
    (3, 4),
    (4, 5),
    (5, 6), (5, 7),
    (6, 8),
    (7, 8),
    (8, 9),
]

In [3]:
# Give every user a "friends" key holding its own fresh, empty list
# (a new list object is created on each iteration, so nothing is shared).
for user in users:
    user["friends"] = []

In [4]:
# Record each friendship on both users involved.
# NOTE: this cell only appends -- re-running it without first re-running the
# cell that resets user["friends"] adds every friend a second time.
for i, j in friendships:
    # indexing by id works because users[i] is the user whose id is i
    first, second = users[i], users[j]
    first["friends"].append(second)
    second["friends"].append(first)

# {'id': num, 'name': something, 'friends': [{'id': num, 'name': something, 'friends': [...]}, {'id': num, 'name': something, 'friends': [...]}, ...]}

In [5]:
def number_of_friends(user):
    """Return the length of the list stored under user["friends"]."""
    friends = user["friends"]
    return len(friends)

In [6]:

# Add up the friend count of every user.
total_connections = sum(map(number_of_friends, users))

total_connections

24

In [7]:
# number of entries in the users list
num_users = len(users)

num_users

10

In [8]:
# mean number of friends per user ("/" is true division, so the result is a float)
avg_connections = total_connections/num_users
avg_connections

2.4

In [9]:
# each pair is (user_id, num_friends)
num_friends_by_id = [(person['id'], number_of_friends(person))
                     for person in users]

# Largest friend count first. Sorting ascending on the negated count keeps
# tied users in their original order, just as reverse=True does.
sorted(num_friends_by_id, key=lambda pair: -pair[1])

[(1, 3),
 (2, 3),
 (3, 3),
 (5, 3),
 (8, 3),
 (0, 2),
 (4, 2),
 (6, 2),
 (7, 2),
 (9, 1)]

## Data scientists you may know

In [10]:
def friends_of_friends_ids_bad(user):
    """Collect the id of every friend of each of user's friends.

    "foaf" is short for "friend of a friend". Nothing is filtered out, so the
    result keeps duplicates and can contain the user's own id as well as the
    ids of the user's direct friends.
    """
    foaf_ids = []
    for friend in user['friends']:        # for each of user's friends
        for foaf in friend['friends']:    # get each of their friends
            foaf_ids.append(foaf['id'])
    return foaf_ids

In [11]:
# We call this version "bad" because the result includes the
# user itself and some of the foafs are repeated

friends_of_friends_ids_bad(users[0])

[0, 2, 3, 0, 1, 3]

In [12]:
# Print the friend ids of users 0, 1 and 2, one list per line.
for user in users[:3]:
    print([friend['id'] for friend in user['friends']])

[1, 2]
[0, 2, 3]
[0, 1, 3]


In [13]:
from collections import Counter

def not_the_same(user, other_user):
    """Two users are not the same if they have different ids."""
    user_id, other_id = user['id'], other_user['id']
    return user_id != other_id

def not_friends(user, other_user):
    """Return True when other_user does not appear in user['friends'].

    That is, when other_user is not_the_same as every person in
    user['friends'].
    """
    for friend in user['friends']:
        if not not_the_same(friend, other_user):
            return False  # found a friend with the same id
    return True

def friends_of_friend_ids(user):
    """Count friends-of-friends of user, excluding user and user's own friends.

    Returns a Counter mapping each remaining foaf id to the number of times
    that id is reached through one of user's friends.
    """
    # Build the ids to exclude once, so each candidate costs a single set
    # lookup instead of a scan over all of user's friends.
    excluded_ids = {user['id']}
    excluded_ids.update(friend['id'] for friend in user['friends'])

    return Counter(foaf['id']
                   for friend in user['friends']
                   for foaf in friend['friends']
                   if foaf['id'] not in excluded_ids)  # Counter gives {value: count}

In [14]:
# Print the friend-of-friend counts for user 3, then for user 0.
for user_index in (3, 0):
    print(friends_of_friend_ids(users[user_index]))

Counter({0: 2, 5: 1})
Counter({3: 2})


In [15]:
# (user_id, interest)
# Flat list of (user_id, interest) pairs, grouped here by user for readability.
interests = [
    # user 0
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
    (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    # user 1
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
    (1, "Postgres"),
    # user 2
    (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
    (2, "numpy"), (2, "statsmodels"), (2, "pandas"),
    # user 3
    (3, "R"), (3, "Python"), (3, "statistics"),
    (3, "regression"), (3, "probability"),
    # user 4
    (4, "machine learning"), (4, "regression"),
    (4, "decision trees"), (4, "libsvm"),
    # user 5
    (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
    (5, "Haskell"), (5, "programming languages"),
    # user 6
    (6, "statistics"), (6, "probability"),
    (6, "mathematics"), (6, "theory"),
    # user 7
    (7, "machine learning"), (7, "scikit-learn"),
    (7, "Mahout"), (7, "neural networks"),
    # user 8
    (8, "neural networks"), (8, "deep learning"),
    (8, "Big Data"), (8, "artificial intelligence"),
    # user 9
    (9, "Hadoop"), (9, "Java"), (9, "MapReduce"), (9, "Big Data"),
]

In [16]:
# This is not efficient: it scans the entire interests list on every call
def data_scientists_who_like(target_interest):
    """Return the user_ids paired with target_interest in `interests`.

    Ids come back in the order they appear in `interests`; the match on the
    interest string is exact.
    """
    matching_ids = []
    for user_id, user_interest in interests:
        if user_interest == target_interest:
            matching_ids.append(user_id)
    return matching_ids

In [17]:
from collections import defaultdict

# Index from interest -> list of the user_ids that have that interest.
user_ids_by_interest = defaultdict(list)

for member_id, topic in interests:
    members = user_ids_by_interest[topic]   # defaultdict creates the list on first access
    members.append(member_id)

user_ids_by_interest

defaultdict(list,
            {'Hadoop': [0, 9],
             'Big Data': [0, 8, 9],
             'HBase': [0, 1],
             'Java': [0, 5, 9],
             'Spark': [0],
             'Storm': [0],
             'Cassandra': [0, 1],
             'NoSQL': [1],
             'MongoDB': [1],
             'Postgres': [1],
             'Python': [2, 3, 5],
             'scikit-learn': [2, 7],
             'scipy': [2],
             'numpy': [2],
             'statsmodels': [2],
             'pandas': [2],
             'R': [3, 5],
             'statistics': [3, 6],
             'regression': [3, 4],
             'probability': [3, 6],
             'machine learning': [4, 7],
             'decision trees': [4],
             'libsvm': [4],
             'C++': [5],
             'Haskell': [5],
             'programming languages': [5],
             'mathematics': [6],
             'theory': [6],
             'Mahout': [7],
             'neural networks': [7, 8],
             'deep learning': 

In [18]:
# Index from user_id -> list of that user's interests.
interests_by_user_id = defaultdict(list)

for member_id, topic in interests:
    topics = interests_by_user_id[member_id]   # defaultdict creates the list on first access
    topics.append(topic)

interests_by_user_id

defaultdict(list,
            {0: ['Hadoop',
              'Big Data',
              'HBase',
              'Java',
              'Spark',
              'Storm',
              'Cassandra'],
             1: ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres'],
             2: ['Python',
              'scikit-learn',
              'scipy',
              'numpy',
              'statsmodels',
              'pandas'],
             3: ['R', 'Python', 'statistics', 'regression', 'probability'],
             4: ['machine learning', 'regression', 'decision trees', 'libsvm'],
             5: ['Python',
              'R',
              'Java',
              'C++',
              'Haskell',
              'programming languages'],
             6: ['statistics', 'probability', 'mathematics', 'theory'],
             7: ['machine learning',
              'scikit-learn',
              'Mahout',
              'neural networks'],
             8: ['neural networks',
              'deep learning',
       

In [19]:
def most_common_interests_with(user):
    """Count, for every other user id, how many of user's interests it shares.

    Walks user's interests via interests_by_user_id, then looks up who else
    has each one via user_ids_by_interest. The user's own id is skipped.
    """
    own_id = user['id']
    shared = Counter()
    for interest in interests_by_user_id[own_id]:
        for interested_user_id in user_ids_by_interest[interest]:
            if interested_user_id != own_id:
                shared[interested_user_id] += 1
    return shared

In [20]:
most_common_interests_with(users[0]) # Counter of {other user_id: number of shared interests}

Counter({9: 3, 1: 2, 8: 1, 5: 1})

## Salaries and experience

In [None]:
# (salary, years_of_experience)
salaries_and_tenures = [
    (83000, 8.7),
    (88000, 8.1),
    (48000, 0.7),
    (76000, 6),
    (69000, 6.5),
    (76000, 7.5),
    (60000, 2.5),
    (83000, 10),
    (48000, 1.9),
    (63000, 4.2),
]

In [None]:
# Group salaries by exact tenure: keys are years, values are lists of salaries.
salary_by_tenure = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    salary_by_tenure[tenure].append(salary)

# Average each group: keys are years, values are the mean salary for that tenure.
average_salary_by_tenure = {}
for tenure, salaries in salary_by_tenure.items():
    average_salary_by_tenure[tenure] = sum(salaries) / len(salaries)

# This is not very useful here: no two users have the same tenure, so each
# "average" is just one user's salary.

average_salary_by_tenure

{8.7: 83000.0,
 8.1: 88000.0,
 0.7: 48000.0,
 6: 76000.0,
 6.5: 69000.0,
 7.5: 76000.0,
 2.5: 60000.0,
 10: 83000.0,
 1.9: 48000.0,
 4.2: 63000.0}

In [31]:
def tenure_bucket(tenure):
    """Map a tenure in years to one of three coarse bucket labels.

    Below 2 gives "less than two"; from 2 up to (but not including) 5 gives
    "between two and five"; everything else gives "more than five".
    """
    if tenure < 2:
        return "less than two"
    if tenure < 5:
        return "between two and five"
    return "more than five"

In [34]:
# Group salaries by bucket: keys are tenure-bucket labels, values are lists of salaries.
salary_by_tenure_bucket = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    salary_by_tenure_bucket[tenure_bucket(tenure)].append(salary)

# Average each group: keys are bucket labels, values are the mean salary.
# (The loop variable is called `label` so it does not shadow the
# tenure_bucket function inside the comprehension.)
average_salary_by_bucket = {
    label: sum(bucket_salaries) / len(bucket_salaries)
    for label, bucket_salaries in salary_by_tenure_bucket.items()
}

In [35]:
average_salary_by_bucket

{'more than five': 79166.66666666667,
 'less than two': 48000.0,
 'between two and five': 61500.0}

In [37]:
# Count every lowercased, whitespace-separated word across all interests.
# The user id in each pair is not needed here.
words_and_counts = Counter()
for _user_id, interest in interests:
    words_and_counts.update(interest.lower().split())

In [None]:
# most_common() is a Counter method; it yields (word, count) pairs, highest count first
for word, count in words_and_counts.most_common():
    if count <= 1:
        continue  # only report words that occur more than once
    print(word, count)

big 3
data 3
java 3
python 3
learning 3
hadoop 2
hbase 2
cassandra 2
scikit-learn 2
r 2
statistics 2
regression 2
probability 2
machine 2
neural 2
networks 2
