[< index](README.md)
# 16 - Pandas (basics)

In [1]:
# pandas is a library for analysing data sets
# Before you can use it you need to import it, and by convention you give it a shorter name, because programmers are lazy
import pandas as pd # Note 'as pd', you now use 'pd' instead of 'pandas'

# So far you've mostly worked with Python's own variable types, such as dicts and lists. pandas
# brings two data types of its own: the one-dimensional Series and the two-dimensional DataFrame.

# We start with the Series, which you can build from any list of values
data = [
    5, 1, 8, 2, 2,
    5, 2, 3, 6, 2,
    3, 9, 1, 12, 2,
]
series = pd.Series(data)

# pandas automatically adds an index that counts up from zero, just like a regular list
series

0      5
1      1
2      8
3      2
4      2
5      5
6      2
7      3
8      6
9      2
10     3
11     9
12     1
13    12
14     2
dtype: int64

In [2]:
# The 'values' attribute gives you back just the values as an array, without the index
series.values

array([ 5,  1,  8,  2,  2,  5,  2,  3,  6,  2,  3,  9,  1, 12,  2])

In [3]:
# The 'index' attribute gives you the index on its own
series.index

RangeIndex(start=0, stop=15, step=1)

In [4]:
# pandas has a lot of methods that you can call on its Series and DataFrames
print(series.max()) # Largest value in the series

12


In [5]:
print(series.min()) # Smallest value in the series

1


In [6]:
print(series.mean()) # Average: the sum of all values divided by the number of values

4.2


In [7]:
print(series.median()) # Median: the middle value once all values are sorted

3.0


In [8]:
print(series.sum()) # Total of all values added together

63


In [9]:
# Another really useful method is value_counts(), which counts how many times each
# value occurs in the series. In the result the index holds the values and the
# entries hold the counts, with the most common value listed first
print(series.value_counts())

2     5
5     2
3     2
1     2
12    1
9     1
8     1
6     1
dtype: int64


In [10]:
# The head() and tail() methods let you view the first and last entries of a series:
# by default the first five (head) or the last five (tail)
series.head()

0    5
1    1
2    8
3    2
4    2
dtype: int64

In [11]:
# If you pass a number to head() or tail(), you get that many entries instead of five
series.tail(10)

5      5
6      2
7      3
8      6
9      2
10     3
11     9
12     1
13    12
14     2
dtype: int64

In [12]:
# A DataFrame combines one or more Series into a table, a lot like an Excel sheet.
# In Python terms it looks like a list of dicts
data = [
    {"name": "Tinus", "species": "hamster", "age": 7},  # Each dict is written on a single line for brevity
    {"name": "Barrie", "species": "monkey", "age": 5},
    {"name": "Hans", "species": "badger", "age": 12},
]

df = pd.DataFrame(data)  # 'df' is another convention, an abbreviation for DataFrame
df  # Jupyter Notebooks render this as a fancy table

Unnamed: 0,age,name,species
0,7,Tinus,hamster
1,5,Barrie,monkey
2,12,Hans,badger


In [13]:
# To inspect a single column, put its name between square brackets, the same []
# notation you use with a dict or list
df["age"]

0     7
1     5
2    12
Name: age, dtype: int64

In [14]:
# Because a single column is a Series, you can use the same methods on it that we've used before
print(df["age"].mean()) # Average age
print(df["age"].sum()) # Sum of all ages

8.0
24
