## Python statistics essential training - 02_03_pandas

Standard imports

In [28]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

In [4]:
# Load only the first four columns of the CSV, selected by position
# (shown in the output as Planet, Mass, Diameter and DayLength).
planets = pd.read_csv(
    'Planets.csv',
    usecols=[0, 1, 2, 3],
)

In [5]:
# Show the whole table; it has only 10 rows, so no truncation is needed.
planets

Unnamed: 0,Planet,Mass,Diameter,DayLength
0,MERCURY,0.33,4879,4222.6
1,VENUS,4.87,12104,2802.0
2,EARTH,5.97,12756,24.0
3,MOON,0.073,3475,708.7
4,MARS,0.642,6792,24.7
5,JUPITER,1898.0,142984,9.9
6,SATURN,568.0,120536,10.7
7,URANUS,86.8,51118,17.2
8,NEPTUNE,102.0,49528,16.1
9,PLUTO,0.0146,2370,153.3


In [7]:
# Bracket indexing with a column name returns that column as a Series.
planets['Mass']

0       0.3300
1       4.8700
2       5.9700
3       0.0730
4       0.6420
5    1898.0000
6     568.0000
7      86.8000
8     102.0000
9       0.0146
Name: Mass, dtype: float64

In [8]:
# Attribute access returns the same Series as planets['Mass']; it is only available
# when the column name is a valid Python identifier that does not clash with a
# DataFrame attribute or method.
planets.Mass

0       0.3300
1       4.8700
2       5.9700
3       0.0730
4       0.6420
5    1898.0000
6     568.0000
7      86.8000
8     102.0000
9       0.0146
Name: Mass, dtype: float64

In [10]:
# With no index column specified, the rows are labelled by a RangeIndex (0..9 here).
planets.index

RangeIndex(start=0, stop=10, step=1)

In [9]:
# Label-based row lookup (not slicing): .loc[0] returns the row whose index label is 0
# as a Series keyed by column name.
planets.loc[0]

Planet       MERCURY
Mass            0.33
Diameter        4879
DayLength     4222.6
Name: 0, dtype: object

In [12]:
# Use the planet names as the index. This returns a new DataFrame for display;
# `planets` itself is left unchanged because the result is not assigned.
planets.set_index('Planet')

Unnamed: 0_level_0,Mass,Diameter,DayLength
Planet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MERCURY,0.33,4879,4222.6
VENUS,4.87,12104,2802.0
EARTH,5.97,12756,24.0
MOON,0.073,3475,708.7
MARS,0.642,6792,24.7
JUPITER,1898.0,142984,9.9
SATURN,568.0,120536,10.7
URANUS,86.8,51118,17.2
NEPTUNE,102.0,49528,16.1
PLUTO,0.0146,2370,153.3


In [13]:
# Most pandas operations return a modified copy and leave the original DataFrame untouched.
# To modify the DataFrame itself, call .set_index with the keyword argument inplace=True.

planets.set_index('Planet',inplace=True)

    

In [14]:
# Confirm that `planets` itself is now indexed by planet name.
planets

Unnamed: 0_level_0,Mass,Diameter,DayLength
Planet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MERCURY,0.33,4879,4222.6
VENUS,4.87,12104,2802.0
EARTH,5.97,12756,24.0
MOON,0.073,3475,708.7
MARS,0.642,6792,24.7
JUPITER,1898.0,142984,9.9
SATURN,568.0,120536,10.7
URANUS,86.8,51118,17.2
NEPTUNE,102.0,49528,16.1
PLUTO,0.0146,2370,153.3


In [17]:
# Summarize the DataFrame and count its rows.
# .info() prints its report and returns None, which is why the displayed tuple is (None, 10).
# NOTE(review): the report lists Diameter as object dtype rather than numeric -- check how
# that column is stored in the CSV before doing arithmetic on it.
planets.info(), len(planets)

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, MERCURY to PLUTO
Data columns (total 3 columns):
Mass         10 non-null float64
Diameter     10 non-null object
DayLength    10 non-null float64
dtypes: float64(2), object(1)
memory usage: 320.0+ bytes


(None, 10)

In [18]:
# Reload the file without a column filter and label the rows by planet name in one chain.
planets = pd.read_csv('Planets.csv').set_index('Planet')

In [20]:
# Two-step lookup: select the FirstVisited column, then the entry labelled 'MERCURY'.
planets.FirstVisited['MERCURY']

'1974-03-29'

In [21]:
# The same lookup in a single step: .loc[row_label, column_label].
planets.loc['MERCURY','FirstVisited']

'1974-03-29'

In [22]:
# At this point the date is still stored as a plain Python string.
type(planets.loc['MERCURY','FirstVisited'])

str

In [24]:
# Use the pandas function to_datetime() to convert the date strings into datetime64 values.
# EARTH shows up as NaT (not-a-time) in the result. The column itself is not modified here.
pd.to_datetime(planets.FirstVisited)


Planet
MERCURY   1974-03-29
VENUS     1962-08-27
EARTH            NaT
MOON      1959-09-12
MARS      1965-07-15
JUPITER   1973-12-04
SATURN    1979-09-01
URANUS    1986-01-24
NEPTUNE   1989-08-25
PLUTO     2015-07-14
Name: FirstVisited, dtype: datetime64[ns]

In [25]:
# Overwrite the string column with its parsed datetime version.
planets['FirstVisited'] = pd.to_datetime(planets['FirstVisited'])

In [26]:
# The .dt accessor exposes the datetime components of the column; .dt.year isolates
# the year so it can be used in arithmetic. In the output it is float64, with NaN for EARTH.
planets.FirstVisited.dt.year

Planet
MERCURY    1974.0
VENUS      1962.0
EARTH         NaN
MOON       1959.0
MARS       1965.0
JUPITER    1973.0
SATURN     1979.0
URANUS     1986.0
NEPTUNE    1989.0
PLUTO      2015.0
Name: FirstVisited, dtype: float64

In [30]:
# For example, the number of years between each first visit and 2019
# (2019 is a hard-coded reference year, so the result does not track the current date):

yrssince = 2019 - planets.FirstVisited.dt.year
yrssince

Planet
MERCURY    45.0
VENUS      57.0
EARTH       NaN
MOON       60.0
MARS       54.0
JUPITER    46.0
SATURN     40.0
URANUS     33.0
NEPTUNE    30.0
PLUTO       4.0
Name: FirstVisited, dtype: float64

AttributeError: module 'pandas' has no attribute 'hist'

In [32]:
# A single column is one-dimensional: one entry per row (10 here).
planets.FirstVisited.shape

(10,)

In [38]:
# After set_index the index holds the planet names and is itself named 'Planet'.
planets.index

Index(['MERCURY', 'VENUS', 'EARTH', 'MOON', 'MARS', 'JUPITER', 'SATURN',
       'URANUS', 'NEPTUNE', 'PLUTO'],
      dtype='object', name='Planet')