In [1]:
users = [
    { "id": 0, "name": "Hero" },
    { "id": 1, "name": "Dunn" },
    { "id": 2, "name": "Sue" },
    { "id": 3, "name": "Chi" },
    { "id": 4, "name": "Thor" },
    { "id": 5, "name": "Clive" },
    { "id": 6, "name": "Hicks" },
    { "id": 7, "name": "Devin" },
    { "id": 8, "name": "Kate" },
    { "id": 9, "name": "Klein" }
]

print(users)

[{'id': 0, 'name': 'Hero'}, {'id': 1, 'name': 'Dunn'}, {'id': 2, 'name': 'Sue'}, {'id': 3, 'name': 'Chi'}, {'id': 4, 'name': 'Thor'}, {'id': 5, 'name': 'Clive'}, {'id': 6, 'name': 'Hicks'}, {'id': 7, 'name': 'Devin'}, {'id': 8, 'name': 'Kate'}, {'id': 9, 'name': 'Klein'}]


In [2]:
friendship_pairs = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4), (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9)]
print(friendship_pairs)

[(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4), (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9)]


In [3]:
friendships = {user["id"]: [] for user in users}
print(friendships)

{0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: []}


In [4]:
for i, j in friendship_pairs:
    friendships[i].append(j) # j is a friend of user i
    friendships[j].append(i) # i is a friend of user j
    
print(friendships)

{0: [1, 2], 1: [0, 2, 3], 2: [0, 1, 3], 3: [1, 2, 4], 4: [3, 5], 5: [4, 6, 7], 6: [5, 8], 7: [5, 8], 8: [6, 7, 9], 9: [8]}


In [5]:
def number_of_friends(user):
    """How many friends does _user have?"""
    user_id = user["id"]
    friend_ids = friendships[user_id]
    num = len(friend_ids)
    print(f'User {user_id} has {num} connections.')
    return num

total_connections = sum(number_of_friends(user) for user in users)

print(total_connections)

User 0 has 2 connections.
User 1 has 3 connections.
User 2 has 3 connections.
User 3 has 3 connections.
User 4 has 2 connections.
User 5 has 3 connections.
User 6 has 2 connections.
User 7 has 2 connections.
User 8 has 3 connections.
User 9 has 1 connections.
24


In [6]:
num_users = len(users)
print(num_users)

10


In [7]:
avg_connections = total_connections / num_users
print(avg_connections)

2.4


In [8]:
num_friends_by_id = [(user["id"], number_of_friends(user)) for user in users]
print(num_friends_by_id)

User 0 has 2 connections.
User 1 has 3 connections.
User 2 has 3 connections.
User 3 has 3 connections.
User 4 has 2 connections.
User 5 has 3 connections.
User 6 has 2 connections.
User 7 has 2 connections.
User 8 has 3 connections.
User 9 has 1 connections.
[(0, 2), (1, 3), (2, 3), (3, 3), (4, 2), (5, 3), (6, 2), (7, 2), (8, 3), (9, 1)]


In [9]:
num_friends_by_id.sort(key=lambda id_and_friends: id_and_friends[1], reverse=True)
print(num_friends_by_id)

[(1, 3), (2, 3), (3, 3), (5, 3), (8, 3), (0, 2), (4, 2), (6, 2), (7, 2), (9, 1)]


In [10]:
def foaf_ids_bad(user):
    """friend of a friend"""
    num = [foaf_id for friend_id in friendships[user["id"]] for foaf_id in friendships[friend_id]]
    return num

print(foaf_ids_bad(users[0]))

[0, 2, 3, 0, 1, 3]


In [11]:
from collections import Counter

def friends_of_friends(user):
    user_id = user["id"]
    num = Counter(foaf_id 
                  for friend_id in friendships[user_id] 
                  for foaf_id in friendships[friend_id]
                  if foaf_id != user_id
                  and foaf_id not in friendships[user_id]
                 )
    return num
    
print(friends_of_friends(users[3]))

Counter({0: 2, 5: 1})


In [12]:
interests = [
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
    (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
    (1, "Postgres"), (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
    (2, "numpy"), (2, "statsmodels"), (2, "pandas"), (3, "R"), (3, "Python"),
    (3, "statistics"), (3, "regression"), (3, "probability"),
    (4, "machine learning"), (4, "regression"), (4, "decision trees"),
    (4, "libsvm"), (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
    (5, "Haskell"), (5, "programming languages"), (6, "statistics"),
    (6, "probability"), (6, "mathematics"), (6, "theory"),
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"),
    (7, "neural networks"), (8, "neural networks"), (8, "deep learning"),
    (8, "Big Data"), (8, "artificial intelligence"), (9, "Hadoop"),
    (9, "Java"), (9, "MapReduce"), (9, "Big Data")
]

In [13]:
def data_scientists_who_like(target_interest):
    """Find ids of users who like target interest"""
    return [user_id for user_id, user_interest in interests
           if user_interest==target_interest]

print(data_scientists_who_like("Java"))

[0, 5, 9]


In [14]:
from collections import defaultdict

# Keys are interests, values are lists of user_ids with that interest
user_ids_by_interest = defaultdict(list)

for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)
    
print(user_ids_by_interest)

defaultdict(<class 'list'>, {'Hadoop': [0, 9], 'Big Data': [0, 8, 9], 'HBase': [0, 1], 'Java': [0, 5, 9], 'Spark': [0], 'Storm': [0], 'Cassandra': [0, 1], 'NoSQL': [1], 'MongoDB': [1], 'Postgres': [1], 'Python': [2, 3, 5], 'scikit-learn': [2, 7], 'scipy': [2], 'numpy': [2], 'statsmodels': [2], 'pandas': [2], 'R': [3, 5], 'statistics': [3, 6], 'regression': [3, 4], 'probability': [3, 6], 'machine learning': [4, 7], 'decision trees': [4], 'libsvm': [4], 'C++': [5], 'Haskell': [5], 'programming languages': [5], 'mathematics': [6], 'theory': [6], 'Mahout': [7], 'neural networks': [7, 8], 'deep learning': [8], 'artificial intelligence': [8], 'MapReduce': [9]})


In [15]:
# Keys are user_ids, values are lists of interests for that user_id
interests_by_user_id = defaultdict(list)

for user_id, interest in interests:
    interests_by_user_id[user_id].append(interest)
    
print(interests_by_user_id)

defaultdict(<class 'list'>, {0: ['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra'], 1: ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres'], 2: ['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas'], 3: ['R', 'Python', 'statistics', 'regression', 'probability'], 4: ['machine learning', 'regression', 'decision trees', 'libsvm'], 5: ['Python', 'R', 'Java', 'C++', 'Haskell', 'programming languages'], 6: ['statistics', 'probability', 'mathematics', 'theory'], 7: ['machine learning', 'scikit-learn', 'Mahout', 'neural networks'], 8: ['neural networks', 'deep learning', 'Big Data', 'artificial intelligence'], 9: ['Hadoop', 'Java', 'MapReduce', 'Big Data']})


In [16]:
def most_common_interests_with(user):
    return Counter(
        interested_user_id
        for interest in interests_by_user_id[user["id"]]
        for interested_user_id in user_ids_by_interest[interest]
        if interested_user_id != user["id"]
    )

In [17]:
salaries_and_tenures = [(83000, 8.7), (88000, 8.1),
                        (48000, 0.7), (76000, 6),
                        (69000, 6.5), (76000, 7.5),
                        (60000, 2.5), (83000, 10),
                        (48000, 1.9), (63000, 4.2)]


In [18]:
# keys are years, values are lists of the salaries for each tenure
salary_by_tenure = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    salary_by_tenure[tenure].append(salary)
    
# keys are years, each value is average salary for that tenure
average_salary_by_tenure = {
    tenure: sum(salaries) / len(salaries)
    for tenure, salaries in salary_by_tenure.items()
}

In [19]:
print(salary_by_tenure)

defaultdict(<class 'list'>, {8.7: [83000], 8.1: [88000], 0.7: [48000], 6: [76000], 6.5: [69000], 7.5: [76000], 2.5: [60000], 10: [83000], 1.9: [48000], 4.2: [63000]})


In [20]:
print(average_salary_by_tenure)

{8.7: 83000.0, 8.1: 88000.0, 0.7: 48000.0, 6: 76000.0, 6.5: 69000.0, 7.5: 76000.0, 2.5: 60000.0, 10: 83000.0, 1.9: 48000.0, 4.2: 63000.0}


In [21]:
def tenure_bucket(tenure):
    if tenure < 2:
        return "less than 2"
    elif tenure < 5:
        return "between two and five"
    else:
        return "more than five"

In [22]:
# keys are tenure buckets, values are lists of salaries for that bucket
salary_by_tenure_bucket = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    bucket = tenure_bucket(tenure)
    salary_by_tenure_bucket[bucket].append(salary)
    
print(salary_by_tenure_bucket)

defaultdict(<class 'list'>, {'more than five': [83000, 88000, 76000, 69000, 76000, 83000], 'less than 2': [48000, 48000], 'between two and five': [60000, 63000]})


In [23]:
# keys are tenure buckets, values are average salary for that bucket
average_salary_by_bucket = {
    tenure_bucket: sum(salaries) / len(salaries)
    for tenure_bucket, salaries in salary_by_tenure_bucket.items()
}

print(average_salary_by_bucket)

{'more than five': 79166.66666666667, 'less than 2': 48000.0, 'between two and five': 61500.0}


In [24]:
interests = [
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
    (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
    (1, "Postgres"), (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
    (2, "numpy"), (2, "statsmodels"), (2, "pandas"), (3, "R"), (3, "Python"),
    (3, "statistics"), (3, "regression"), (3, "probability"),
    (4, "machine learning"), (4, "regression"), (4, "decision trees"),
    (4, "libsvm"), (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
    (5, "Haskell"), (5, "programming languages"), (6, "statistics"),
    (6, "probability"), (6, "mathematics"), (6, "theory"),
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"),
    (7, "neural networks"), (8, "neural networks"), (8, "deep learning"),
    (8, "Big Data"), (8, "artificial intelligence"), (9, "Hadoop"),
    (9, "Java"), (9, "MapReduce"), (9, "Big Data")
]

In [25]:
words_and_counts = Counter(word
                          for user, interest in interests
                          for word in interest.lower().split())

print(words_and_counts)

Counter({'big': 3, 'data': 3, 'java': 3, 'python': 3, 'learning': 3, 'hadoop': 2, 'hbase': 2, 'cassandra': 2, 'scikit-learn': 2, 'r': 2, 'statistics': 2, 'regression': 2, 'probability': 2, 'machine': 2, 'neural': 2, 'networks': 2, 'spark': 1, 'storm': 1, 'nosql': 1, 'mongodb': 1, 'postgres': 1, 'scipy': 1, 'numpy': 1, 'statsmodels': 1, 'pandas': 1, 'decision': 1, 'trees': 1, 'libsvm': 1, 'c++': 1, 'haskell': 1, 'programming': 1, 'languages': 1, 'mathematics': 1, 'theory': 1, 'mahout': 1, 'deep': 1, 'artificial': 1, 'intelligence': 1, 'mapreduce': 1})


In [26]:
for word, count in words_and_counts.most_common():
    if count > 1:
        print(word, count)

big 3
data 3
java 3
python 3
learning 3
hadoop 2
hbase 2
cassandra 2
scikit-learn 2
r 2
statistics 2
regression 2
probability 2
machine 2
neural 2
networks 2


In [27]:
tweet = {
    "user": "joelgrus",
    "text": "Data Science is awesome",
    "retweet_count": 90,
    "hashtags": ["#data", "#science", "#datascience", "#awesome", "#yolo"]
}

In [28]:
print(tweet.keys())

dict_keys(['user', 'text', 'retweet_count', 'hashtags'])


In [29]:
print(tweet.values())

dict_values(['joelgrus', 'Data Science is awesome', 90, ['#data', '#science', '#datascience', '#awesome', '#yolo']])


In [30]:
print(tweet.items())

dict_items([('user', 'joelgrus'), ('text', 'Data Science is awesome'), ('retweet_count', 90), ('hashtags', ['#data', '#science', '#datascience', '#awesome', '#yolo'])])


In [31]:
print(90 in tweet)

False


In [32]:
print(90 in tweet.values())

True


In [33]:
print("text" in tweet)

True


In [37]:
document = "hey this is a test of a python dict test this should show us if the counts of words works"
print(document)

hey this is a test of a python dict test this should show us if the counts of words works


In [38]:
document = document.split(" ")
print(document)

['hey', 'this', 'is', 'a', 'test', 'of', 'a', 'python', 'dict', 'test', 'this', 'should', 'show', 'us', 'if', 'the', 'counts', 'of', 'words', 'works']


In [39]:
word_counts = {}
for word in document:
    try:
        word_counts[word] += 1
    except:
        word_counts[word] = 1
        
print(word_counts)

{'hey': 1, 'this': 2, 'is': 1, 'a': 2, 'test': 2, 'of': 2, 'python': 1, 'dict': 1, 'should': 1, 'show': 1, 'us': 1, 'if': 1, 'the': 1, 'counts': 1, 'words': 1, 'works': 1}


In [40]:
from collections import Counter
word_counts = Counter(document)
print(word_counts)

Counter({'this': 2, 'a': 2, 'test': 2, 'of': 2, 'hey': 1, 'is': 1, 'python': 1, 'dict': 1, 'should': 1, 'show': 1, 'us': 1, 'if': 1, 'the': 1, 'counts': 1, 'words': 1, 'works': 1})


In [41]:
print(word_counts.most_common(10))

[('this', 2), ('a', 2), ('test', 2), ('of', 2), ('hey', 1), ('is', 1), ('python', 1), ('dict', 1), ('should', 1), ('show', 1)]


In [42]:
for word, count in word_counts.most_common(10):
    print(word, count)

this 2
a 2
test 2
of 2
hey 1
is 1
python 1
dict 1
should 1
show 1


In [43]:
# {"": , "": } DICT
# [] LIST
# () TUPLE
# {2, 3} SET, no duplications

# 'in' is very fast on sets and dicts

def remove_duplicates_in_list(words):
    return list(set(words))

In [44]:
print(type(document))

<class 'list'>


In [45]:
document_no_duplicates = remove_duplicates_in_list(document)
print(document_no_duplicates)

['show', 'works', 'a', 'this', 'us', 'if', 'of', 'dict', 'the', 'hey', 'test', 'is', 'counts', 'should', 'python', 'words']


In [46]:
print(type(document_no_duplicates))

<class 'list'>


In [47]:
x = sorted(word_counts.most_common(10), key=lambda word_and_count: word_and_count[1], reverse=True)
print(x)

[('this', 2), ('a', 2), ('test', 2), ('of', 2), ('hey', 1), ('is', 1), ('python', 1), ('dict', 1), ('should', 1), ('show', 1)]


In [49]:
x2 = sorted(word_counts.values(), reverse=True)
print(x2)

[2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [52]:
class CountClicker:
    """Class with a docstring"""
    def __init__(self, count=0):
        self.count = count

In [53]:
clicker1 = CountClicker()
clicker2 = CountClicker(100)
clicker3 = CountClicker(count=100)