## Introduce the Series and the DataFrame

In [1]:
import numpy as np
import pandas as pd

In [2]:
### check the environment
%whos

Variable   Type      Data/Info
------------------------------
np         module    <module 'numpy' from 'C:\<...>ges\\numpy\\__init__.py'>
pd         module    <module 'pandas' from 'C:<...>es\\pandas\\__init__.py'>


### The Series object

In [3]:
### a plain Python list holding six grocery items
grocery_list = [
    'milk',
    'bananas',
    'apples',
    'lunch meat',
    'soup',
    'oreos',
]

In [4]:
print( type(grocery_list) )

<class 'list'>


In [5]:
print( len(grocery_list) )

6


Convert our `list` object into a `Series` object.

In [6]:
grocery_series = pd.Series(grocery_list)

In [7]:
%whos

Variable         Type      Data/Info
------------------------------------
grocery_list     list      n=6
grocery_series   Series    0          milk\n1       <...>     oreos\ndtype: object
np               module    <module 'numpy' from 'C:\<...>ges\\numpy\\__init__.py'>
pd               module    <module 'pandas' from 'C:<...>es\\pandas\\__init__.py'>


In [8]:
print( type(grocery_series) )

<class 'pandas.core.series.Series'>


In [9]:
print( grocery_series )

0          milk
1       bananas
2        apples
3    lunch meat
4          soup
5         oreos
dtype: object


In [10]:
grocery_series

0          milk
1       bananas
2        apples
3    lunch meat
4          soup
5         oreos
dtype: object

In [11]:
### contrast to the list object
print( grocery_list)

['milk', 'bananas', 'apples', 'lunch meat', 'soup', 'oreos']


In [12]:
### we can access the values in the series through the index
print( grocery_series.index )

RangeIndex(start=0, stop=6, step=1)


In [13]:
print( type( range(6) ) )

<class 'range'>


### Change the series index

In [14]:
### string versions of the integers 0 through 5, to be used as labels
str_index = list(map(str, range(6)))

print(str_index)

['0', '1', '2', '3', '4', '5']


In [15]:
print( type(str_index[0] ) )

<class 'str'>


In [16]:
grocery_series.index = str_index

In [17]:
print( grocery_series.index )

Index(['0', '1', '2', '3', '4', '5'], dtype='object')


In [18]:
print( grocery_series )

0          milk
1       bananas
2        apples
3    lunch meat
4          soup
5         oreos
dtype: object


In [19]:
### the labels are now the STRINGS '0'..'5', so the integer 0 is not a label;
### ask for the first element by POSITION explicitly with .iloc
grocery_series.iloc[0]

'milk'

In [20]:
grocery_series['0']

'milk'

In [21]:
print( grocery_series.keys() )

Index(['0', '1', '2', '3', '4', '5'], dtype='object')


### Define the index when we create the Series

In [22]:
### supply the labels through `index=` at construction time
more_groceries = pd.Series(
    data=['apple juice', 'poptarts', 'butter', 'yogurt'],
    index=['item 1', 'item 2', 'item 3', 'item D'],
)

In [23]:
more_groceries

item 1    apple juice
item 2       poptarts
item 3         butter
item D         yogurt
dtype: object

In [24]:
### the labels are strings ('item 1', ...), so the integer 0 is not a label;
### ask for the first element by POSITION explicitly with .iloc
more_groceries.iloc[0]

'apple juice'

In [25]:
more_groceries['item 1']

'apple juice'

In [26]:
more_groceries['item D']

'yogurt'

In [27]:
more_groceries.index

Index(['item 1', 'item 2', 'item 3', 'item D'], dtype='object')

In [28]:
pd.Index(more_groceries) ### builds a NEW Index out of the Series VALUES; the 'item ...' labels are not used

Index(['apple juice', 'poptarts', 'butter', 'yogurt'], dtype='object')

In [29]:
more_groceries

item 1    apple juice
item 2       poptarts
item 3         butter
item D         yogurt
dtype: object

In [30]:
### index labels do not have to be unique: the label '2' is used twice here
another_series = pd.Series(
    data=['a', 'b', 'c'],
    index=['1', '2', '2'],
)

In [31]:
another_series

1    a
2    b
2    c
dtype: object

In [32]:
another_series['2']

2    b
2    c
dtype: object

### The DataFrame

In [33]:
### one list per column name; every list has five entries, one per team
baseball_dict = dict(
    City=['Pittsburgh', 'Cincinnati', 'Chicago', 'St. Louis', 'Milwaukee'],
    Team=['Pirates', 'Reds', 'Cubs', 'Cardinals', 'Brewers'],
    Division=['Central'] * 5,
    League=['NL'] * 5,
)

print(baseball_dict)

{'City': ['Pittsburgh', 'Cincinnati', 'Chicago', 'St. Louis', 'Milwaukee'], 'Team': ['Pirates', 'Reds', 'Cubs', 'Cardinals', 'Brewers'], 'Division': ['Central', 'Central', 'Central', 'Central', 'Central'], 'League': ['NL', 'NL', 'NL', 'NL', 'NL']}


In [34]:
### convert to a DataFrame
baseball_df = pd.DataFrame(baseball_dict)

In [35]:
print( baseball_df )

         City       Team Division League
0  Pittsburgh    Pirates  Central     NL
1  Cincinnati       Reds  Central     NL
2     Chicago       Cubs  Central     NL
3   St. Louis  Cardinals  Central     NL
4   Milwaukee    Brewers  Central     NL


### Attributes

In [36]:
baseball_df.shape

(5, 4)

In [37]:
baseball_df

Unnamed: 0,City,Team,Division,League
0,Pittsburgh,Pirates,Central,NL
1,Cincinnati,Reds,Central,NL
2,Chicago,Cubs,Central,NL
3,St. Louis,Cardinals,Central,NL
4,Milwaukee,Brewers,Central,NL


In [44]:
baseball_df.columns

Index(['City', 'Team', 'Division', 'League'], dtype='object')

In [45]:
baseball_df.index

RangeIndex(start=0, stop=5, step=1)

### Methods

In [38]:
baseball_df.head()

Unnamed: 0,City,Team,Division,League
0,Pittsburgh,Pirates,Central,NL
1,Cincinnati,Reds,Central,NL
2,Chicago,Cubs,Central,NL
3,St. Louis,Cardinals,Central,NL
4,Milwaukee,Brewers,Central,NL


In [39]:
baseball_df.tail()

Unnamed: 0,City,Team,Division,League
0,Pittsburgh,Pirates,Central,NL
1,Cincinnati,Reds,Central,NL
2,Chicago,Cubs,Central,NL
3,St. Louis,Cardinals,Central,NL
4,Milwaukee,Brewers,Central,NL


In [40]:
baseball_df.head(2)

Unnamed: 0,City,Team,Division,League
0,Pittsburgh,Pirates,Central,NL
1,Cincinnati,Reds,Central,NL


In [42]:
baseball_df.tail(2)

Unnamed: 0,City,Team,Division,League
3,St. Louis,Cardinals,Central,NL
4,Milwaukee,Brewers,Central,NL


In [43]:
baseball_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   City      5 non-null      object
 1   Team      5 non-null      object
 2   Division  5 non-null      object
 3   League    5 non-null      object
dtypes: object(4)
memory usage: 288.0+ bytes


In [46]:
baseball_df.describe()

Unnamed: 0,City,Team,Division,League
count,5,5,5,5
unique,5,5,1,1
top,Milwaukee,Brewers,Central,NL
freq,1,1,5,5


### Create a DataFrame again

In [47]:
### build the frame directly, choosing the row labels and the column order
baseball_df_b = pd.DataFrame(
    data={
        'City': ['Pittsburgh', 'Cincinnati', 'Chicago', 'St. Louis', 'Milwaukee'],
        'Team': ['Pirates', 'Reds', 'Cubs', 'Cardinals', 'Brewers'],
        'Division': ['Central'] * 5,
        'League': ['NL'] * 5,
    },
    columns=['League', 'Division', 'City', 'Team'],
    index=[9.5, 5.5, 0, 2.0, 4.5],
)

In [48]:
print( type(baseball_df_b ) )

<class 'pandas.core.frame.DataFrame'>


In [50]:
baseball_df_b

Unnamed: 0,League,Division,City,Team
9.5,NL,Central,Pittsburgh,Pirates
5.5,NL,Central,Cincinnati,Reds
0.0,NL,Central,Chicago,Cubs
2.0,NL,Central,St. Louis,Cardinals
4.5,NL,Central,Milwaukee,Brewers


In [51]:
print( baseball_df_b.index )

Float64Index([9.5, 5.5, 0.0, 2.0, 4.5], dtype='float64')


### Subset rows

In [52]:
### [] on a DataFrame looks up a COLUMN label, and no column is named 0;
### catch the KeyError and show it so the notebook keeps running
try:
    baseball_df_b[0]
except KeyError as err:
    print(f"{type(err).__name__}: {err}")

KeyError: 0

In [53]:
baseball_df_b.loc[0.0]

League           NL
Division    Central
City        Chicago
Team           Cubs
Name: 0.0, dtype: object

In [54]:
### select the FIRST row by its INTEGER POSITION, whatever its label is
baseball_df_b.iloc[0]

League              NL
Division       Central
City        Pittsburgh
Team           Pirates
Name: 9.5, dtype: object

In [55]:
baseball_df_b

Unnamed: 0,League,Division,City,Team
9.5,NL,Central,Pittsburgh,Pirates
5.5,NL,Central,Cincinnati,Reds
0.0,NL,Central,Chicago,Cubs
2.0,NL,Central,St. Louis,Cardinals
4.5,NL,Central,Milwaukee,Brewers


In [56]:
### subset the SECOND row by its INDEX LABEL (5.5)
baseball_df_b.loc[5.5]

League              NL
Division       Central
City        Cincinnati
Team              Reds
Name: 5.5, dtype: object

In [57]:
### subset the SECOND row by its INTEGER POSITION (positions start at 0)
baseball_df_b.iloc[1]

League              NL
Division       Central
City        Cincinnati
Team              Reds
Name: 5.5, dtype: object

### Extract or Select a Column

In [58]:
baseball_df_b['Team']

9.5      Pirates
5.5         Reds
0.0         Cubs
2.0    Cardinals
4.5      Brewers
Name: Team, dtype: object

In [59]:
team_object = baseball_df_b['Team']

In [60]:
print( type(team_object) )

<class 'pandas.core.series.Series'>


In [64]:
### a Series has NO columns, so this attribute lookup fails;
### catch the AttributeError and show it so the notebook keeps running
try:
    team_object.columns
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'Series' object has no attribute 'columns'