# Numpy and Pandas!

In [11]:
import numpy as np
import pandas as pd

## Fun with Numpy!

### Numpy Arrays

In [11]:
# Build a one-dimensional array of floats from a plain Python list.
array = np.array([1, 4, 5, 8], dtype=float)
print(array)

[ 1.  4.  5.  8.]


In [12]:
# A list of equally long lists becomes a 2-D array (matrix): 2 rows x 3 columns.
array = np.array(
    [[1, 2, 3],
     [4, 5, 6]],
    dtype=float,
)
print(array)

[[ 1.  2.  3.]
 [ 4.  5.  6.]]


### Indexing, slicing and modifying a NumPy array (like a Python list!)

In [7]:
# Recreate the 1-D float array used by the indexing examples below.
array = np.array([1, 4, 5, 8], dtype=float)
print(array)


[ 1.  4.  5.  8.]


In [8]:
# Indexing is zero-based, so position 1 is the second element.
print (array[1])

4.0


In [9]:
# The slice [:2] selects positions 0 and 1; the stop position is excluded.
print (array[:2])

[ 1.  4.]


In [10]:
# Assigning to a position overwrites that element of the existing array.
array[1] = 5.0
print (array[1])

5.0


### Matrix indexing and slicing

In [13]:
# 2 x 3 float matrix used by the indexing and slicing examples that follow.
two_D_array = np.array(
    [[1, 2, 3],
     [4, 5, 6]],
    dtype=float,
)
print(two_D_array)

[[ 1.  2.  3.]
 [ 4.  5.  6.]]


In [14]:
# Select one element with a single [row, column] index (row 1, column 1).
# This avoids the intermediate row array that chained [1][1] indexing builds
# and matches the [row, column] form used by the slicing cells below.
print(two_D_array[1, 1])

5.0


In [15]:
# Row 1, every column: ':' means "take everything along this axis".
print (two_D_array[1, :])

[ 4.  5.  6.]


In [16]:
# Every row, column 2: the result is the third column as a 1-D array.
print (two_D_array[:, 2])

[ 3.  6.]


### Array arithmetic (element-wise)

In [17]:
# Two float vectors of equal length; '+' adds them element by element.
array_1 = np.array([1, 2, 3], dtype=float)
array_2 = np.array([5, 2, 6], dtype=float)
print(array_1 + array_2)

[ 6.  4.  9.]


In [18]:
# Element-wise difference of the two vectors defined above.
print (array_1 - array_2)

[-4.  0. -3.]


In [19]:
# '*' multiplies element by element (1*5, 2*2, 3*6); it is not a dot product.
print (array_1 * array_2)

[  5.   4.  18.]


### Matrix arithmetic (element-wise)

In [25]:
# Two 2 x 2 float matrices; '+' adds the entries position by position.
array_1 = np.array([[1, 2],
                    [3, 4]], dtype=float)
array_2 = np.array([[5, 6],
                    [7, 8]], dtype=float)

print(array_1 + array_2)

[[  6.   8.]
 [ 10.  12.]]


In [21]:
# Element-wise difference of the two 2 x 2 matrices.
print (array_1 - array_2)

[[-4. -4.]
 [-4. -4.]]


In [22]:
# On 2-D arrays '*' is still element-wise (1*5, 2*6, 3*7, 4*8),
# not a matrix product.
print (array_1 * array_2)

[[  5.  12.]
 [ 21.  32.]]


### Numpy has a range of other mathematical operations! (mean, dot product...)

In [27]:
# array_1 is a 1-D vector of length 3; array_2 is a 3 x 1 column.
array_1 = np.array([1, 2, 3], dtype=float)
array_2 = np.array([[6],
                    [7],
                    [8]], dtype=float)

print(np.mean(array_1))

2.0


In [28]:
# Mean over all elements of the 3 x 1 array: (6 + 7 + 8) / 3.
print (np.mean(array_2))

7.0


In [29]:
# Dot product of the length-3 vector with the 3 x 1 column:
# 1*6 + 2*7 + 3*8 = 44, returned as a one-element array.
print (np.dot(array_1, array_2)) # a1b1 + a2b2 + ... + anbn

[ 44.]


## Fun with Pandas!

### Pandas DataFrame (default index)

DataFrame: a group of Series that share an index

1) Selecting a single column from DataFrame will return a Series

2) Selecting multiple columns from a DataFrame will return a DataFrame

In [30]:
# Dictionary mapping each column name to a Series with the default index.
# 'fare' holds only three values, one fewer than the other columns.
d = {
    'name': pd.Series(['Braund', 'Cummings', 'Heikkinen', 'Allen']),
    'age': pd.Series([22, 38, 26, 35]),
    'fare': pd.Series([7.25, 71.83, 8.05]),
    'survived?': pd.Series([False, True, True, False]),
}

In [31]:
# The Series are aligned on their shared index. 'fare' has no entry for
# index 3, so that cell is filled with NaN.
df = pd.DataFrame(d)
print (df)

   age   fare       name  survived?
0   22   7.25     Braund      False
1   38  71.83   Cummings       True
2   26   8.05  Heikkinen       True
3   35    NaN      Allen      False


### Pandas DataFrame (custom index)

In [32]:
# Same columns as before, now with explicit string labels as the index.
# 'fare' is labelled 'a', 'b', 'd', so it has no value for label 'c'.
d = {
    'name': pd.Series(['Braund', 'Cummings', 'Heikkinen', 'Allen'],
                      index=['a', 'b', 'c', 'd']),
    'age': pd.Series([22, 38, 26, 35],
                     index=['a', 'b', 'c', 'd']),
    'fare': pd.Series([7.25, 71.83, 8.05],
                      index=['a', 'b', 'd']),
    'survived?': pd.Series([False, True, True, False],
                           index=['a', 'b', 'c', 'd']),
}

In [33]:
# Values are matched by index label, not by position: 'fare' has no label
# 'c', so row c gets NaN and 8.05 lands on row d.
df = pd.DataFrame(d)
print (df)

   age   fare       name  survived?
a   22   7.25     Braund      False
b   38  71.83   Cummings       True
c   26    NaN  Heikkinen       True
d   35   8.05      Allen      False


### Series Object (default index)

In [34]:
# A Series may hold mixed value types; without an explicit index the
# labels are the integers 0..n-1.
series = pd.Series(
    ['Dave', 'Cheng-Han', 'Udacity', 42, -1789710578]
)
print(series)

0           Dave
1      Cheng-Han
2        Udacity
3             42
4    -1789710578
dtype: object


### Series Object (custom index)

In [35]:
# Supplying `index` attaches a custom label to each value, in order.
series = pd.Series(
    ['Dave', 'Cheng-Han', 359, 9001],
    index=['Instructor', 'Curriculum Manager', 'Course Number', 'Power Level'],
)
print(series)

Instructor                 Dave
Curriculum Manager    Cheng-Han
Course Number               359
Power Level                9001
dtype: object


### Series Indexing: select specific items from Series Object

In [37]:
# Rebuild the labelled Series, then look up a single value by its label.
series = pd.Series(
    ['Dave', 'Cheng-Han', 359, 9001],
    index=['Instructor', 'Curriculum Manager', 'Course Number', 'Power Level'],
)
print(series['Instructor'])

Dave


In [38]:
# Passing a list of labels (note the double brackets) returns a Series
# containing just those entries.
print (series[['Instructor', 'Curriculum Manager', 'Course Number']])

Instructor                 Dave
Curriculum Manager    Cheng-Han
Course Number               359
dtype: object


### Boolean Indexing: select specific items from Series Object

In [39]:
# Comparing a Series with a scalar yields a boolean Series with the same labels.
cuteness = pd.Series(
    [1, 2, 3, 4, 5],
    index=['Cockroach', 'Fish', 'Mini Pig', 'Puppy', 'Kitten'],
)
print(cuteness > 3)

Cockroach    False
Fish         False
Mini Pig     False
Puppy         True
Kitten        True
dtype: bool


In [40]:
# Indexing with the boolean Series keeps only the entries where it is True.
print (cuteness[cuteness > 3])

Puppy     4
Kitten    5
dtype: int64


# Intro to Data Science Course Quizzes: DataFrame construction!

## 1) 2014 Sochi Winter Olympics 

In [95]:
def create_dataframe():
    """Build the 2014 Sochi Winter Olympics medal table.

    Returns:
        pandas.DataFrame: one row per country (26 rows, default integer
        index) with the columns 'country_name', 'gold', 'silver' and
        'bronze'.
    """
    country_names = [
        'Russian Fed.', 'Norway', 'Canada', 'United States',
        'Netherlands', 'Germany', 'Switzerland', 'Belarus',
        'Austria', 'France', 'Poland', 'China', 'Korea',
        'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
        'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
        'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan',
    ]
    gold_counts = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3,
                   2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver_counts = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3,
                     7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze_counts = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2,
                     6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]

    # Each column is wrapped in its own Series; the DataFrame constructor
    # then lines them up on their common default integer index.
    labelled_columns = (
        ('country_name', country_names),
        ('gold', gold_counts),
        ('silver', silver_counts),
        ('bronze', bronze_counts),
    )
    return pd.DataFrame(
        {label: pd.Series(values) for label, values in labelled_columns}
    )

In [42]:
# Build the medal table and print all of it.
df = create_dataframe()
print (df)

    bronze    country_name  gold  silver
0        9    Russian Fed.    13      11
1       10          Norway    11       5
2        5          Canada    10      10
3       12   United States     9       7
4        9     Netherlands     8       7
5        5         Germany     8       6
6        2     Switzerland     6       3
7        1         Belarus     5       0
8        5         Austria     4       8
9        7          France     4       4
10       1          Poland     4       1
11       2           China     3       4
12       2           Korea     3       3
13       6          Sweden     2       7
14       2  Czech Republic     2       4
15       4        Slovenia     2       2
16       3           Japan     1       4
17       1         Finland     1       3
18       2   Great Britain     1       1
19       1         Ukraine     1       0
20       0        Slovakia     1       0
21       6           Italy     0       2
22       2          Latvia     0       2
23       1       Australia     0       2
24       0         Croatia     0       1
25       1      Kazakhstan     0       0

### Select columns

In [43]:
# A list of column names (double brackets) returns a DataFrame with just
# those columns, in the order listed.
print (df[['country_name','gold']])

      country_name  gold
0     Russian Fed.    13
1           Norway    11
2           Canada    10
3    United States     9
4      Netherlands     8
5          Germany     8
6      Switzerland     6
7          Belarus     5
8          Austria     4
9           France     4
10          Poland     4
11           China     3
12           Korea     3
13          Sweden     2
14  Czech Republic     2
15        Slovenia     2
16           Japan     1
17         Finland     1
18   Great Britain     1
19         Ukraine     1
20        Slovakia     1
21           Italy     0
22          Latvia     0
23       Australia     0
24         Croatia     0
25      Kazakhstan     0


### Select a row

In [46]:
# .loc looks up the row by its index label; a single label returns the row
# as a Series whose labels are the column names.
df.loc[16]

bronze              3
country_name    Japan
gold                1
silver              4
Name: 16, dtype: object

### Filter rows (using a boolean condition)

In [48]:
# df['gold'] <= 5 is a boolean Series; indexing with it keeps only the rows
# where the condition is True.
print (df[df['gold'] <= 5])

    bronze    country_name  gold  silver
7        1         Belarus     5       0
8        5         Austria     4       8
9        7          France     4       4
10       1          Poland     4       1
11       2           China     3       4
12       2           Korea     3       3
13       6          Sweden     2       7
14       2  Czech Republic     2       4
15       4        Slovenia     2       2
16       3           Japan     1       4
17       1         Finland     1       3
18       2   Great Britain     1       1
19       1         Ukraine     1       0
20       0        Slovakia     1       0
21       6           Italy     0       2
22       2          Latvia     0       2
23       1       Australia     0       2
24       0         Croatia     0       1
25       1      Kazakhstan     0       0


### Select only the 'country_name' column where 'gold' <= 5

In [51]:
# Pick the rows (gold <= 5) and the column ('country_name') in one .loc call
# rather than chaining two [] lookups; the result is the same Series.
print(df.loc[df['gold'] <= 5, 'country_name'])

7            Belarus
8            Austria
9             France
10            Poland
11             China
12             Korea
13            Sweden
14    Czech Republic
15          Slovenia
16             Japan
17           Finland
18     Great Britain
19           Ukraine
20          Slovakia
21             Italy
22            Latvia
23         Australia
24           Croatia
25        Kazakhstan
Name: country_name, dtype: object


## 2) NFL

In [75]:
# Column name -> list of values; every list has eight entries, one per row.
data = {
    'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
    'team': ['Seahawks', 'Seahawks', 'Seahawks',
             'Eagles', 'Eagles', 'Eagles',
             'Pats', 'Pats'],
    'wins': [16, 14, 12, 15, 11, 6, 0, 1],
    'losses': [0, 2, 4, 1, 5, 10, 16, 15],
}

# A dict of equal-length lists converts directly into a DataFrame.
football = pd.DataFrame(data)
print(football)

   losses      team  wins  year
0       0  Seahawks    16  2010
1       2  Seahawks    14  2011
2       4  Seahawks    12  2012
3       1    Eagles    15  2011
4       5    Eagles    11  2012
5      10    Eagles     6  2010
6      16      Pats     0  2011
7      15      Pats     1  2012


### Select a single column

In [66]:
# A single column name in brackets returns that column as a Series.
print (football['year'])

0    2010
1    2011
2    2012
3    2011
4    2012
5    2010
6    2011
7    2012
Name: year, dtype: int64


In [67]:
# Attribute access returns the same Series as football['year'] above.
print (football.year)  # shorthand for football['year']

0    2010
1    2011
2    2012
3    2011
4    2012
5    2010
6    2011
7    2012
Name: year, dtype: int64


### Select multiple columns

In [68]:
# A list of names returns a DataFrame whose columns follow the listed order.
print (football[['year', 'wins', 'losses']])

   year  wins  losses
0  2010    16       0
1  2011    14       2
2  2012    12       4
3  2011    15       1
4  2012    11       5
5  2010     6      10
6  2011     0      16
7  2012     1      15


### Get a Row

In [69]:
# .iloc selects by integer position; the list [0] keeps the result a
# one-row DataFrame.
print (football.iloc[[0]])

   losses      team  wins  year
0       0  Seahawks    16  2010


In [70]:
# .loc selects by index label; with the default integer index, label 0 is
# also the first row, so the output matches the .iloc cell.
print (football.loc[[0]])

   losses      team  wins  year
0       0  Seahawks    16  2010


### Get Rows

In [71]:
# Slicing the DataFrame with [3:5] returns rows 3 and 4; the stop value 5
# is excluded.
print (football[3:5])

   losses    team  wins  year
3       1  Eagles    15  2011
4       5  Eagles    11  2012


### Filter "where"

In [72]:
# Keep only the rows whose 'wins' value is greater than 10.
print (football[football.wins > 10])

   losses      team  wins  year
0       0  Seahawks    16  2010
1       2  Seahawks    14  2011
2       4  Seahawks    12  2012
3       1    Eagles    15  2011
4       5    Eagles    11  2012


In [80]:
# Combine two conditions with '&'; each comparison is wrapped in its own
# parentheses.
print (football[(football.wins > 10) & (football.team == "Seahawks")])

   losses      team  wins  year
0       0  Seahawks    16  2010
1       2  Seahawks    14  2011
2       4  Seahawks    12  2012


## 3) DataFrame cool operations

In [88]:
# 'one' has no value for label 'd', so the DataFrame holds NaN there.
d = {
    'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
    'two': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
}
df = pd.DataFrame(d)
print(df)

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


### Apply mean function: mean of each column

In [90]:
# apply calls np.mean once per column. The NaN in 'one' is left out, so its
# mean is (1 + 2 + 3) / 3 = 2.0.
df.apply(np.mean)

one    2.0
two    2.5
dtype: float64

### Lambda function: goes through every single value in the 'one' column and evaluates whether it is >= 1

In [92]:
# map calls the lambda on each value of the column. The NaN in row d fails
# the comparison, so that row is False.
df['one'].map(lambda x: x >= 1)

a     True
b     True
c     True
d    False
Name: one, dtype: bool

### Evaluate every single value in the DataFrame

In [93]:
# applymap calls the lambda on every individual cell of the DataFrame.
# NOTE(review): recent pandas releases deprecate applymap in favour of
# DataFrame.map - confirm the pandas version this notebook targets.
df.applymap(lambda x: x >= 1)

     one   two
a   True  True
b   True  True
c   True  True
d  False  True


## Using Winter Olympics DataFrame

In [97]:
# Rebind df to a fresh medal table; this replaces the small 'one'/'two'
# DataFrame from the previous section.
df = create_dataframe()
print (df)

    bronze    country_name  gold  silver
0        9    Russian Fed.    13      11
1       10          Norway    11       5
2        5          Canada    10      10
3       12   United States     9       7
4        9     Netherlands     8       7
5        5         Germany     8       6
6        2     Switzerland     6       3
7        1         Belarus     5       0
8        5         Austria     4       8
9        7          France     4       4
10       1          Poland     4       1
11       2           China     3       4
12       2           Korea     3       3
13       6          Sweden     2       7
14       2  Czech Republic     2       4
15       4        Slovenia     2       2
16       3           Japan     1       4
17       1         Finland     1       3
18       2   Great Britain     1       1
19       1         Ukraine     1       0
20       0        Slovakia     1       0
21       6           Italy     0       2
22       2          Latvia     0       2
23       1       Australia     0       2
24       0         Croatia     0       1
25       1      Kazakhstan     0       0

In [108]:
# Average medal count per column, over the three numeric medal columns.
df[['gold','silver','bronze']].apply(np.mean)

gold      3.807692
silver    3.730769
bronze    3.807692
dtype: float64

In [109]:
# Same computation restricted to the 'silver' and 'bronze' columns.
df[['silver','bronze']].apply(np.mean)

silver    3.730769
bronze    3.807692
dtype: float64

## Dot Product

In [111]:
# Two plain Python lists of equal length; np.dot accepts them directly.
a = [1, 2, 3, 4, 5]
b = [2, 3, 4, 5, 6]

In [113]:
# Sum of pairwise products: 1*2 + 2*3 + 3*4 + 4*5 + 5*6 = 70.
np.dot(a,b)

70

## Matrix Multiplication

In [116]:
a = np.array([1,2], int) # 1-D vector of length 2 (acts as a 1x2 row in the product below)
b = np.array([[2, 4, 6], [3, 5, 7]], int) # 2x3 matrix

In [117]:
np.dot(a,b) # 1-D result of length 3: [1*2 + 2*3, 1*4 + 2*5, 1*6 + 2*7]

array([ 8, 14, 20])