In [1]:
import numpy as np
import pandas as pd

In [2]:
pop2014 = pd.Series([100,99.3,95.5,93.5,92.4,84.8,84.5,78.9,74.3,72.8],
                    index=['Java','C','C++','Python','C#','PHP','JavaScript','Ruby','R','Matlab'])

In [3]:
pop2015 = pd.Series({'Java': 100,'C': 99.9,'C++': 99.4,'Python': 96.5,'C#': 91.3,
                     'R': 84.8,'PHP': 84.5,'JavaScript': 83.0,'Ruby': 76.2,'Matlab': 72.4})

In [5]:
two_years = pd.DataFrame({'2014': pop2014, '2015': pop2015})
two_years

Unnamed: 0,2014,2015
C,99.3,99.9
C#,92.4,91.3
C++,95.5,99.4
Java,100.0,100.0
JavaScript,84.5,83.0
Matlab,72.8,72.4
PHP,84.8,84.5
Python,93.5,96.5
R,74.3,84.8
Ruby,78.9,76.2


In [10]:
two_years.sort_values('2015', inplace=True)
two_years

Unnamed: 0,2014,2015
Matlab,72.8,72.4
Ruby,78.9,76.2
JavaScript,84.5,83.0
PHP,84.8,84.5
R,74.3,84.8
C#,92.4,91.3
Python,93.5,96.5
C++,95.5,99.4
C,99.3,99.9
Java,100.0,100.0


In [11]:
# there is a numpy array inside every data frame
# we can also call for the attribute (values), index and columns
two_years.values

array([[  72.8,   72.4],
       [  78.9,   76.2],
       [  84.5,   83. ],
       [  84.8,   84.5],
       [  74.3,   84.8],
       [  92.4,   91.3],
       [  93.5,   96.5],
       [  95.5,   99.4],
       [  99.3,   99.9],
       [ 100. ,  100. ]])

In [12]:
two_years.index

Index(['Matlab', 'Ruby', 'JavaScript', 'PHP', 'R', 'C#', 'Python', 'C++', 'C',
       'Java'],
      dtype='object')

In [13]:
two_years.columns

Index(['2014', '2015'], dtype='object')

In [19]:
two_years.loc['Matlab':'R']

Unnamed: 0,2014,2015
Matlab,72.8,72.4
Ruby,78.9,76.2
JavaScript,84.5,83.0
PHP,84.8,84.5
R,74.3,84.8


In [21]:
avg_two_years = 0.5*(two_years['2014'] + two_years['2015'])
avg_two_years

Matlab         72.60
Ruby           77.55
JavaScript     83.75
PHP            84.65
R              79.55
C#             91.85
Python         95.00
C++            97.45
C              99.60
Java          100.00
dtype: float64

In [23]:
two_years.append(avg_two_years, ignore_index=True)
# why is this error happening? where is index matching?

Unnamed: 0,2014,2015,C,C#,C++,Java,JavaScript,Matlab,PHP,Python,R,Ruby
0,72.8,72.4,,,,,,,,,,
1,78.9,76.2,,,,,,,,,,
2,84.5,83.0,,,,,,,,,,
3,84.8,84.5,,,,,,,,,,
4,74.3,84.8,,,,,,,,,,
5,92.4,91.3,,,,,,,,,,
6,93.5,96.5,,,,,,,,,,
7,95.5,99.4,,,,,,,,,,
8,99.3,99.9,,,,,,,,,,
9,100.0,100.0,,,,,,,,,,


In [24]:
two_years['Average'] = 0.5*(two_years['2014'] + two_years['2015'])
two_years

Unnamed: 0,2014,2015,Average
Matlab,72.8,72.4,72.6
Ruby,78.9,76.2,77.55
JavaScript,84.5,83.0,83.75
PHP,84.8,84.5,84.65
R,74.3,84.8,79.55
C#,92.4,91.3,91.85
Python,93.5,96.5,95.0
C++,95.5,99.4,97.45
C,99.3,99.9,99.6
Java,100.0,100.0,100.0


In [25]:
presidents = pd.DataFrame([{'name': 'Barack Obama','inauguration': 2009,'birthyear': 1961},
                          {'name': 'George W. Bush','inauguration': 2001,'birthyear': 1946},
                          {'name': 'Bill Clinton','birthyear': 1946,'inauguration': 1993},
                          {'name': 'George H. W. Bush','inauguration': 1989,'birthyear': 1924}])

In [26]:
presidents.index

RangeIndex(start=0, stop=4, step=1)

In [28]:
presidents_indexes = presidents.set_index('inauguration')
presidents_indexes

Unnamed: 0_level_0,birthyear,name
inauguration,Unnamed: 1_level_1,Unnamed: 2_level_1
2009,1961,Barack Obama
2001,1946,George W. Bush
1993,1946,Bill Clinton
1989,1924,George H. W. Bush


In [29]:
presidents_indexes = presidents.set_index('name')
presidents_indexes

Unnamed: 0_level_0,birthyear,inauguration
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Barack Obama,1961,2009
George W. Bush,1946,2001
Bill Clinton,1946,1993
George H. W. Bush,1924,1989


In [30]:
presidents_indexes.loc['Bill Clinton']['birthyear']

1946

In [32]:
# it works the other way around too! BUT without LOC function
presidents_indexes['birthyear']['Bill Clinton']

1946

In [34]:
presidents_fathers = pd.DataFrame([{'son': 'Barack Obama', 'father': 'Barack Obama Sr.'},
                                    {'son': 'George W. Bush','father': 'George H. W. Bush'},
                                   {'son': 'George H. W. Bush','father': 'Prescott Bush'}])

In [37]:
presidents_merge = pd.merge(presidents, presidents_fathers, left_on='name', right_on='son')
presidents_merge

Unnamed: 0,birthyear,inauguration,name,father,son
0,1961,2009,Barack Obama,Barack Obama Sr.,Barack Obama
1,1946,2001,George W. Bush,George H. W. Bush,George W. Bush
2,1924,1989,George H. W. Bush,Prescott Bush,George H. W. Bush


In [42]:
# drop son column
presidents_merge.drop('son', axis = 1)

Unnamed: 0,birthyear,inauguration,name,father
0,1961,2009,Barack Obama,Barack Obama Sr.
1,1946,2001,George W. Bush,George H. W. Bush
2,1924,1989,George H. W. Bush,Prescott Bush


In [45]:
# but where is bill clinton? 
# he didn't have a father specified, so it wasn't included in the left join
# use right join to include bill clinton in the final result
presidents_merge_bc = pd.merge(presidents, presidents_fathers, left_on='name', how = 'left', right_on='son')
presidents_merge_bc.drop('son', axis = 1)
presidents_merge_bc

Unnamed: 0,birthyear,inauguration,name,father,son
0,1961,2009,Barack Obama,Barack Obama Sr.,Barack Obama
1,1946,2001,George W. Bush,George H. W. Bush,George W. Bush
2,1946,1993,Bill Clinton,,
3,1924,1989,George H. W. Bush,Prescott Bush,George H. W. Bush
