# What is data science

extract insights from messy data

set up a hypothesis and evaluate it

identify hidden patterns

predict the future with a hypothesized model

In [1]:
# define data scientist network

# One dict per data scientist; a user's "id" equals their position in the list.
users = [
    {"id": user_id, "name": name}
    for user_id, name in enumerate([
        "Hero", "Dunn", "Sue", "Chi", "Thor",
        "Clive", "Hicks", "Devin", "Kate", "Klein",
    ])
]

# Each (i, j) pair records a friendship between the users with ids i and j.
friendships = [
    (0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4),
    (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9),
]

print(users[0])

{'id': 0, 'name': 'Hero'}


![network](./images/1.png)

In [2]:
# What is the average number of connections?

# construct friend map
# Give every user an empty friend list first (re-running the cell resets it).
for user in users:
    user["friends"] = []
# Friendship is mutual, so each (i, j) pair is recorded on both users.
# Looking users up by position works because users[k]["id"] == k.
for i, j in friendships:
    user_i, user_j = users[i], users[j]
    user_i["friends"].append(user_j)  # j is now listed as a friend of i
    user_j["friends"].append(user_i)  # i is now listed as a friend of j
    
def number_of_friends(user):
    """Return the length of the given user's "friends" list."""
    friends = user["friends"]
    return len(friends)

total_connections = sum(number_of_friends(user) for user in users)

assert total_connections == 24

from __future__ import division  # integer division is lame
num_users = len(users) # length of the users list
avg_connections = total_connections / num_users 

assert avg_connections == 2.4     # <-- This is answer

In [3]:
# find most influential person ( i.e most connected person )

# create a list of (user_id, number_of_friends) pairs
num_friends_by_id = [(user["id"], number_of_friends(user)) for user in users]

# Sort by friend count, largest first. sorted() is stable, so users with the
# same count keep their original (id) order.
# The key indexes into the pair rather than using a tuple-unpacking lambda
# (`lambda (a, b): ...`), which is Python 2-only syntax and a SyntaxError on
# Python 3.
sorted_result = sorted(num_friends_by_id,
                       key=lambda pair: pair[1],  # pair = (user_id, num_friends)
                       reverse=True
                      )
print(sorted_result)
                      

[(1, 3), (2, 3), (3, 3), (5, 3), (8, 3), (0, 2), (4, 2), (6, 2), (7, 2), (9, 1)]


# as a degree of centrality
## but... is id 4 central?

![network](./images/2.png)

## Do better in Ch 21. Network analysis


In [4]:
# Data scientists you may know (friends of friends)

def friends_of_friend_ids_bad(user):
    """Return the ids of every friend of each of the user's friends.

    "foaf" is short for "friend of a friend". The list is naive on purpose:
    it keeps duplicates and can include the user's own id and the ids of
    people who are already direct friends.
    """
    foaf_ids = []
    for friend in user["friends"]:        # each of the user's friends
        for foaf in friend["friends"]:    # each of _their_ friends
            foaf_ids.append(foaf["id"])
    return foaf_ids

# users[0] is friends with 1 and 2, whose friend lists are [0, 2, 3] and
# [0, 1, 3]; the naive result therefore contains user 0 itself (and user 3) twice.
assert friends_of_friend_ids_bad(users[0]) == [0,2,3,0,1,3]    # 0 included twice

# remove duplicates
from collections import Counter
def not_the_same(user, other_user):
    """Return True when the two users have different ids."""
    first_id = user["id"]
    second_id = other_user["id"]
    return first_id != second_id
def not_friends(user, other_user):
    """Return True when other_user is not one of user's friends.

    other_user counts as a non-friend if it is not_the_same as every entry
    in user["friends"]; an empty friend list therefore yields True.
    """
    for friend in user["friends"]:
        if not not_the_same(friend, other_user):
            # Found a friend with the same id: they are friends after all.
            return False
    return True
def friends_of_friend_ids(user):
    """Count, per candidate id, how many of user's friends know that person.

    Returns a Counter mapping friend-of-a-friend id -> number of mutual
    friends. The user and the user's direct friends are left out.
    """
    mutual_counts = Counter()
    for friend in user["friends"]:          # for each of my friends
        for foaf in friend["friends"]:      # look at their friends
            # skip myself and anyone who is already my friend
            if not_the_same(user, foaf) and not_friends(user, foaf):
                mutual_counts[foaf["id"]] += 1
    return mutual_counts

# The naive list for user 3 still contains user 3 itself and its direct friends 1 and 2.
assert friends_of_friend_ids_bad(users[3]) == [0, 2, 3, 0, 1, 3, 3, 5]
# After filtering, only real friends-of-friends remain, each with its number
# of mutual friends: user 0 is reached through two friends, user 5 through one.
print(friends_of_friend_ids(users[3]))

Counter({0: 2, 5: 1})


In [5]:
# find users with a common interest

# (user_id, interest) pairs, laid out one user per row group.
interests = [
    # user 0
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
    (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    # user 1
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
    (1, "Postgres"),
    # user 2
    (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
    (2, "numpy"), (2, "statsmodels"), (2, "pandas"),
    # user 3
    (3, "R"), (3, "Python"), (3, "statistics"), (3, "regression"),
    (3, "probability"),
    # user 4
    (4, "machine learning"), (4, "regression"), (4, "decision trees"),
    (4, "libsvm"),
    # user 5
    (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
    (5, "Haskell"), (5, "programming languages"),
    # user 6
    (6, "statistics"), (6, "probability"), (6, "mathematics"), (6, "theory"),
    # user 7
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"),
    (7, "neural networks"),
    # user 8
    (8, "neural networks"), (8, "deep learning"), (8, "Big Data"),
    (8, "artificial intelligence"),
    # user 9
    (9, "Hadoop"), (9, "Java"), (9, "MapReduce"), (9, "Big Data"),
]

from collections import defaultdict

# Build both lookup tables in a single pass over the (user_id, interest) pairs.
user_ids_by_interest = defaultdict(list)   # interest -> list of user ids
interests_by_user_id = defaultdict(list)   # user id -> list of interests
for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)
    interests_by_user_id[user_id].append(interest)

# find who has the most interests in common with a given user
def most_common_interests_with(user):
    """Count how many interests every other user shares with the given user.

    Returns a Counter mapping other user id -> number of shared interests,
    using the module-level interests_by_user_id and user_ids_by_interest
    lookup tables. The given user is excluded from the result.
    """
    own_id = user["id"]
    shared_counts = Counter()
    for interest in interests_by_user_id[own_id]:
        for interested_user_id in user_ids_by_interest[interest]:
            if interested_user_id != own_id:
                shared_counts[interested_user_id] += 1
    return shared_counts

# User 0 shares the most interests with user 9 (Hadoop, Java and Big Data).
print(most_common_interests_with(users[0]))  # expect user 9 to have the highest count


# Do better in Chapter 22. Recommendation System
    

Counter({9: 3, 1: 2, 8: 1, 5: 1})


In [6]:
# relationship between salaries and experience

# Each entry is a (salary, tenure) pair.
salaries_and_tenures = [
    (83000, 8.7),
    (88000, 8.1),
    (48000, 0.7),
    (76000, 6),
    (69000, 6.5),
    (76000, 7.5),
    (60000, 2.5),
    (83000, 10),
    (48000, 1.9),
    (63000, 4.2),
]

# is the relationship linear?


![network](./images/3.png)

In [7]:
# look for a relationship between salary and experience using tenure buckets

def tenure_bucket(tenure):
    """Return the label of the tenure bucket a value falls into.

    tenure < 2       -> "less than two"
    2 <= tenure < 5  -> "between two and five"
    anything else    -> "more than five" (this includes exactly 5)
    """
    if tenure < 2:
        return "less than two"
    if tenure < 5:
        return "between two and five"
    return "more than five"
    
# Group the salaries by the tenure bucket each (salary, tenure) pair falls into.
salary_by_tenure_bucket = defaultdict(list)
for salary, tenure in salaries_and_tenures:
    bucket = tenure_bucket(tenure)
    salary_by_tenure_bucket[bucket].append(salary)

# Average salary per bucket.
# - .items() works on both Python 2 and Python 3 (.iteritems() is Python 2 only).
# - The loop variable is not called `tenure_bucket`, so it no longer shadows
#   the tenure_bucket() function inside the comprehension.
# - float(...) makes the true division explicit instead of relying on a
#   `from __future__ import division` executed in a different cell.
average_salary_by_bucket = {
    bucket_name: float(sum(salaries)) / len(salaries)
    for bucket_name, salaries in salary_by_tenure_bucket.items()
}

print(average_salary_by_bucket)


# Do better in Chapter 14. Simple Linear Regression


{'more than five': 79166.66666666667, 'between two and five': 61500.0, 'less than two': 48000.0}


In [9]:
# word count on interests

# Lowercase every interest, split it on whitespace, and count each word.
# Only the interest text is needed; the user id in each pair is ignored.
words_and_counts = Counter(word
                           for _user_id, interest in interests
                           for word in interest.lower().split())

# most_common() yields (word, count) pairs from most to least frequent;
# only words that occur more than once are printed.
for word, count in words_and_counts.most_common():
    if count > 1:
        # Formatting the line first and passing a single string to print()
        # produces the same "word count" output whether print is a statement
        # (Python 2) or a function (Python 3); the bare `print word, count`
        # statement is a SyntaxError on Python 3.
        print("%s %d" % (word, count))

# Do better in Ch 20 Natural Language Processing


learning 3
java 3
python 3
big 3
data 3
hbase 2
regression 2
cassandra 2
statistics 2
probability 2
hadoop 2
networks 2
machine 2
neural 2
scikit-learn 2
r 2
