In [54]:
import pandas as pd

'''
The following code is to help you play with the concept of Series in Pandas.

You can think of Series as an one-dimensional object that is similar to
an array, list, or column in a database. By default, it will assign an
index label to each item in the Series ranging from 0 to N, where N is
the number of items in the Series minus one.

Please feel free to play around with the concept of Series and see what it does

*This playground is inspired by Greg Reda's post on Intro to Pandas Data Structures:
http://www.gregreda.com/2013/10/26/intro-to-pandas-data-structures/
'''
#  create a Series object

series = pd.Series(['Dave', 'Cheng-Han', 'Udacity', 42, -1789710578])
series


0           Dave
1      Cheng-Han
2        Udacity
3             42
4    -1789710578
dtype: object

In [55]:

'''
You can also manually assign indices to the items in the Series when
creating the series
'''

# see custom index in action

series = pd.Series(['Dave', 'Cheng-Han', 359, 9001],
                       index=['Instructor', 'Curriculum Manager',
                              'Course Number', 'Power Level'])
series


Instructor                 Dave
Curriculum Manager    Cheng-Han
Course Number               359
Power Level                9001
dtype: object

In [56]:

'''
You can use index to select specific items from the Series
'''
# see Series indexing in action

series = pd.Series(['Dave', 'Cheng-Han', 359, 9001],
                       index=['Instructor', 'Curriculum Manager',
                              'Course Number', 'Power Level'])
series['Instructor']

'Dave'

In [57]:
series[['Instructor', 'Curriculum Manager', 'Course Number']]

Instructor                 Dave
Curriculum Manager    Cheng-Han
Course Number               359
dtype: object

In [58]:
'''
You can also use boolean operators to select specific items from the Series
'''
# see boolean indexing in action

cuteness = pd.Series([1, 2, 3, 4, 5], index=['Cockroach', 'Fish', 'Mini Pig',
                                                 'Puppy', 'Kitten'])
cuteness > 3

Cockroach    False
Fish         False
Mini Pig     False
Puppy         True
Kitten        True
dtype: bool

In [59]:
cuteness[cuteness > 3]

Puppy     4
Kitten    5
dtype: int64

In [60]:
from pandas import DataFrame, Series

#################
# Syntax Reminder:
#
# The following code would create a two-column pandas DataFrame
# named df with columns labeled 'name' and 'age':
#
# people = ['Sarah', 'Mike', 'Chrisna']
# ages  =  [28, 32, 25]
# df = DataFrame({'name' : Series(people),
#                 'age'  : Series(ages)})

def create_dataframe():
    '''
    Create a pandas dataframe called 'olympic_medal_counts_df' containing
    the data from the table of 2014 Sochi winter olympics medal counts.  

    The columns for this dataframe should be called 
    'country_name', 'gold', 'silver', and 'bronze'.  

    There is no need to  specify row indexes for this dataframe 
    (in this case, the rows will automatically be assigned numbered indexes).
    
    You do not need to call the function in your code when running it in the
    browser - the grader will do that automatically when you submit or test it.
    '''

    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea', 
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]

    olympic_medal_counts_df = DataFrame({'country_name': Series(countries),
                   'gold': Series(gold),
                   'silver':Series(silver),
                   'bronze':Series(bronze)})

    return olympic_medal_counts_df

olympic_medal_counts_df = create_dataframe

olympic_medal_counts_df


<function __main__.create_dataframe()>

In [61]:
    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea', 
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]

    df = DataFrame({'country_name': Series(countries),
                   'gold': Series(gold),
                   'silver':Series(silver),
                   'bronze':Series(bronze)})

df

Unnamed: 0,country_name,gold,silver,bronze
0,Russian Fed.,13,11,9
1,Norway,11,5,10
2,Canada,10,10,5
3,United States,9,7,12
4,Netherlands,8,7,9
5,Germany,8,6,5
6,Switzerland,6,3,2
7,Belarus,5,0,1
8,Austria,4,8,5
9,France,4,4,7


In [62]:
# just one column: returns a series
df['country_name']

0       Russian Fed.
1             Norway
2             Canada
3      United States
4        Netherlands
5            Germany
6        Switzerland
7            Belarus
8            Austria
9             France
10            Poland
11             China
12             Korea
13            Sweden
14    Czech Republic
15          Slovenia
16             Japan
17           Finland
18     Great Britain
19           Ukraine
20          Slovakia
21             Italy
22            Latvia
23         Australia
24           Croatia
25        Kazakhstan
Name: country_name, dtype: object

In [63]:
# more than one: returns a dataframe
df[['country_name', 'gold']]

Unnamed: 0,country_name,gold
0,Russian Fed.,13
1,Norway,11
2,Canada,10
3,United States,9
4,Netherlands,8
5,Germany,8
6,Switzerland,6
7,Belarus,5
8,Austria,4
9,France,4


In [64]:
# rownumber
df.loc[22] 

country_name    Latvia
gold                 0
silver               2
bronze               2
Name: 22, dtype: object

In [65]:
df[4:7]

Unnamed: 0,country_name,gold,silver,bronze
4,Netherlands,8,7,9
5,Germany,8,6,5
6,Switzerland,6,3,2


In [66]:
# boolean on row 
df[(df.gold >=5) & (df.silver>9)]

Unnamed: 0,country_name,gold,silver,bronze
0,Russian Fed.,13,11,9
2,Canada,10,10,5


In [67]:
#  condition
df[df['gold']>=10]

Unnamed: 0,country_name,gold,silver,bronze
0,Russian Fed.,13,11,9
1,Norway,11,5,10
2,Canada,10,10,5


In [68]:
# same
df[df.gold >= 10]

Unnamed: 0,country_name,gold,silver,bronze
0,Russian Fed.,13,11,9
1,Norway,11,5,10
2,Canada,10,10,5


In [69]:
# Pandas vectorized methods
'''
lambda functions are small inline functions that are defined on-the fly
in Python. lambda x: x>= 1 will take an input x and return x>=1, or a
boolean that equals True or False.

In this example, ,map() and applymap() create a new series or dataframe
by applying the lambda function to each element.
Note that map() can only be used on a series to return a new series and 
applymap() can only be used on a dataframe to return a new dataframe.
'''

d = {'one': Series([1, 2, 3], index = ['a', 'b', 'c']),
    'two': Series([1, 2, 3, 4], index = ['a', 'b', 'c', 'd'])}

df2 = DataFrame(d)
df2

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [70]:
import numpy as np
df2.apply(np.mean)

one    2.0
two    2.5
dtype: float64

In [71]:
# one column
df2['one'].map(lambda x: x>=1)

a     True
b     True
c     True
d    False
Name: one, dtype: bool

In [72]:
# evaluates for whole dataframe
df2.applymap(lambda x: x>=1)

Unnamed: 0,one,two
a,True,True
b,True,True
c,True,True
d,False,True


In [73]:
import numpy as np
# numpy.dot multiplication
# calculates the sum of the products 
a = [1,2,3,4,5]
b = [2,3,4,5,6]
np.dot(a,b)

70

In [83]:
'''
    Imagine a point system in which each country is awarded 4 points for each
    gold medal,  2 points for each silver medal, and one point for each 
    bronze medal.  

    Using the numpy.dot function, create a new dataframe called 
    'olympic_points_df' that includes:
        a) a column called 'country_name' with the country name
        b) a column called 'points' with the total number of points the country
           earned at the Sochi olympics.
'''

olympic_medal_counts = {'country_name': Series(countries),
                       'gold': Series(gold),
                       'silver': Series(silver),
                       'bronze': Series(bronze)}
olympic_medal_counts_df = DataFrame(olympic_medal_counts)

medal_counts = olympic_medal_counts_df[['gold', 'silver', 'bronze']]
points = np.dot(medal_counts, [4,2,1])

# python dictonairy
olympic_points = {'country_name': Series(countries), 'points': Series(points)}
# array of points 
olympic_points_df = DataFrame(olympic_points)

olympic_points_df


Unnamed: 0,country_name,points
0,Russian Fed.,83
1,Norway,64
2,Canada,65
3,United States,62
4,Netherlands,55
5,Germany,49
6,Switzerland,32
7,Belarus,21
8,Austria,37
9,France,31


In [85]:
# alternate solution using only pandas
df['points'] = df[['gold','silver','bronze']].dot([4, 2, 1]) 

olympic_points_df = df[['country_name','points']]

olympic_points_df

Unnamed: 0,country_name,points
0,Russian Fed.,83
1,Norway,64
2,Canada,65
3,United States,62
4,Netherlands,55
5,Germany,49
6,Switzerland,32
7,Belarus,21
8,Austria,37
9,France,31
