<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/dataframes_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataframes Part 1
- Series is a 1D data structure, i.e., 1 column
  - Only need 1 identifier or reference point to extract a particular value
- DataFrame is 2D data structure, i.e., rows and columns
  - I.e., a regular table or spreadsheet or SQL output
  - Don't care about number of rows and columns; instead, we care about number of points of reference; need 2 identifiers or points of reference

## Intro

In [2]:
# libraries needed
import numpy as np
import pandas as pd

In [3]:
# load data
nba = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/nba.csv'
)

nba
# note that index 457 is all missing

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [4]:
nba.shape # 458 rows

(458, 9)

In [5]:
nba['Name'].nunique # 458 unique playeers

<bound method IndexOpsMixin.nunique of 0      Avery Bradley
1        Jae Crowder
2       John Holland
3        R.J. Hunter
4      Jonas Jerebko
           ...      
453     Shelvin Mack
454        Raul Neto
455     Tibor Pleiss
456      Jeff Withey
457              NaN
Name: Name, Length: 458, dtype: object>

In [6]:
# number of missing values per column
(
    nba
      .isna()
      .sum(axis = 'rows')
)

Name         1
Team         1
Number       1
Position     1
Age          1
Height       1
Weight       1
College     85
Salary      12
dtype: int64

In [7]:
# whenever column has missing vlues and column is numeric, then datatype for that column is float

In [8]:
# number of non-missing values per column

(
    nba
      .notna()
      .sum(axis = 'rows')
)

Name        457
Team        457
Number      457
Position    457
Age         457
Height      457
Weight      457
College     373
Salary      446
dtype: int64

## Methods and Attributes between Series and DataFrames
- There are similarities (e.g., .shape) and differences (.columns)

In [9]:
# descriptive stats on numeric variables; this is a method
nba.describe()

Unnamed: 0,Number,Age,Weight,Salary
count,457.0,457.0,457.0,446.0
mean,17.678337,26.938731,221.522976,4842684.0
std,15.96609,4.404016,26.368343,5229238.0
min,0.0,19.0,161.0,30888.0
25%,5.0,24.0,200.0,1044792.0
50%,13.0,26.0,220.0,2839073.0
75%,25.0,30.0,240.0,6500000.0
max,99.0,40.0,307.0,25000000.0


In [10]:
# number of rows and columns
nba.shape
  # output is a tuple

(458, 9)

In [11]:
# create series
s = pd.Series(
    data = range(1, 6)   # from 1 thru 5, inclusive
)

s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [12]:
# view head and tail of dataframe
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [13]:
# view tail of dataframe
nba.tail()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [14]:
# view head of series
s.head()

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [15]:
# view tail of series
s.tail()

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [16]:
# view index of series
s.index

RangeIndex(start=0, stop=5, step=1)

In [17]:
# view index of dataframe
nba.index

RangeIndex(start=0, stop=458, step=1)

In [18]:
# view values in a series
s.values

array([1, 2, 3, 4, 5])

In [19]:
# view values in a dataframe
nba.values
  # list of lists

array([['Avery Bradley', 'Boston Celtics', 0.0, ..., 180.0, 'Texas',
        7730337.0],
       ['Jae Crowder', 'Boston Celtics', 99.0, ..., 235.0, 'Marquette',
        6796117.0],
       ['John Holland', 'Boston Celtics', 30.0, ..., 205.0,
        'Boston University', nan],
       ...,
       ['Tibor Pleiss', 'Utah Jazz', 21.0, ..., 256.0, nan, 2900000.0],
       ['Jeff Withey', 'Utah Jazz', 24.0, ..., 231.0, 'Kansas', 947276.0],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=object)

In [20]:
# shape of series
s.shape
  # 5 rows, 1-dimensional

(5,)

In [21]:
# shape of dataframe
nba.shape
  # 458 rows, 9 columns, 2-dimensional

(458, 9)

In [22]:
# datatype of values in series
s.dtype

dtype('int64')

In [23]:
# datatpe of vlaues in a dataframe
nba.dtypes
  # plural, dtypes, not singular
  # one for each column

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [24]:
# whether series has missing values
s.hasnans
  #.hasnans attributes not available in dataframe

False

In [25]:
# columns in dataframe
nba.columns
  # dataframe has 2 indices: rows and columns; rows are called index

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [26]:
# rows for nba
nba.index

RangeIndex(start=0, stop=458, step=1)

In [27]:
# a series is 1D, so there is no s.columns attribute

In [28]:
# for dataframe, the indexes
nba.axes

[RangeIndex(start=0, stop=458, step=1),
 Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
        'College', 'Salary'],
       dtype='object')]

In [29]:
# for series, the axes
s.axes

[RangeIndex(start=0, stop=5, step=1)]

In [30]:
# summary of data structure
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


In [31]:
# can't call s.info(); get an error; but video shows this

## Differences between shared methods

In [32]:
revenue = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/revenue.csv', 
    index_col = 'Date'
)

revenue

Unnamed: 0_level_0,New York,Los Angeles,Miami
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/16,985,122,499
1/2/16,738,788,534
1/3/16,14,20,933
1/4/16,730,904,885
1/5/16,114,71,253
1/6/16,936,502,497
1/7/16,123,996,115
1/8/16,935,492,886
1/9/16,846,954,823
1/10/16,54,285,216


In [33]:
# get sum for each city; so iterating through the rows
revenue.sum(axis = 'index')

New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

In [34]:
# above is equivalent to:
revenue.sum(axis = 0)

New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

In [35]:
# get sum for each date, so iterating through the columns
revenue.sum(axis = 'columns')

Date
1/1/16     1606
1/2/16     2060
1/3/16      967
1/4/16     2519
1/5/16      438
1/6/16     1935
1/7/16     1234
1/8/16     2313
1/9/16     2623
1/10/16     555
dtype: int64

In [36]:
# above is equivalent to:
revenue.sum(axis = 1)

Date
1/1/16     1606
1/2/16     2060
1/3/16      967
1/4/16     2519
1/5/16      438
1/6/16     1935
1/7/16     1234
1/8/16     2313
1/9/16     2623
1/10/16     555
dtype: int64

In [37]:
# best to think of index (0) and columns (1) as axes

In [38]:
# repeat above, but with .mean() method
revenue.mean(axis = 'columns') # average revenue per day

Date
1/1/16     535.333333
1/2/16     686.666667
1/3/16     322.333333
1/4/16     839.666667
1/5/16     146.000000
1/6/16     645.000000
1/7/16     411.333333
1/8/16     771.000000
1/9/16     874.333333
1/10/16    185.000000
dtype: float64

In [39]:
# average revenue per city
revenue.mean(axis = 'index')

New York       547.5
Los Angeles    513.4
Miami          564.1
dtype: float64

In [40]:
# dealing with 2D data structure, so need to specify how to sum or find mean

## Select one column from a Pandas DataFrame
- equivalient to R tidyverse dplyr::select()

In [41]:
nba.Name # this is a pandas series

0      Avery Bradley
1        Jae Crowder
2       John Holland
3        R.J. Hunter
4      Jonas Jerebko
           ...      
453     Shelvin Mack
454        Raul Neto
455     Tibor Pleiss
456      Jeff Withey
457              NaN
Name: Name, Length: 458, dtype: object

In [42]:
type(nba.Name)

pandas.core.series.Series

In [43]:
# above not recommended in case column name shares the same name as a method or attribute

In [44]:
# another example
nba.Salary

0      7730337.0
1      6796117.0
2            NaN
3      1148640.0
4      5000000.0
         ...    
453    2433333.0
454     900000.0
455    2900000.0
456     947276.0
457          NaN
Name: Salary, Length: 458, dtype: float64

In [45]:
# above not preferred if column has space in it

In [46]:
# recommended approach; can accomodate column names with spaces
nba['Name']

0      Avery Bradley
1        Jae Crowder
2       John Holland
3        R.J. Hunter
4      Jonas Jerebko
           ...      
453     Shelvin Mack
454        Raul Neto
455     Tibor Pleiss
456      Jeff Withey
457              NaN
Name: Name, Length: 458, dtype: object

In [47]:
# another example
nba['Salary']

0      7730337.0
1      6796117.0
2            NaN
3      1148640.0
4      5000000.0
         ...    
453    2433333.0
454     900000.0
455    2900000.0
456     947276.0
457          NaN
Name: Salary, Length: 458, dtype: float64

In [48]:
# above is case sensitive

In [49]:
# chaining
(
    nba['Salary']                        # this is a Series
      .head()
      .sort_values(ascending = False)
)

0    7730337.0
1    6796117.0
4    5000000.0
3    1148640.0
2          NaN
Name: Salary, dtype: float64

## Select multiple columns from Pandas DataFrame
- equivalent to R tidyverse dplyr::select()

In [50]:
# columns
nba.columns

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [51]:
nba[['Name', 'Team']]

Unnamed: 0,Name,Team
0,Avery Bradley,Boston Celtics
1,Jae Crowder,Boston Celtics
2,John Holland,Boston Celtics
3,R.J. Hunter,Boston Celtics
4,Jonas Jerebko,Boston Celtics
...,...,...
453,Shelvin Mack,Utah Jazz
454,Raul Neto,Utah Jazz
455,Tibor Pleiss,Utah Jazz
456,Jeff Withey,Utah Jazz


In [52]:
# order matters
nba[['Team', 'Name']]

Unnamed: 0,Team,Name
0,Boston Celtics,Avery Bradley
1,Boston Celtics,Jae Crowder
2,Boston Celtics,John Holland
3,Boston Celtics,R.J. Hunter
4,Boston Celtics,Jonas Jerebko
...,...,...
453,Utah Jazz,Shelvin Mack
454,Utah Jazz,Raul Neto
455,Utah Jazz,Tibor Pleiss
456,Utah Jazz,Jeff Withey


In [55]:
# Name, Team, Number
nba.loc[:, 'Name':'Number']    # all rows (:), columns Name thru Number,inclusive

Unnamed: 0,Name,Team,Number
0,Avery Bradley,Boston Celtics,0.0
1,Jae Crowder,Boston Celtics,99.0
2,John Holland,Boston Celtics,30.0
3,R.J. Hunter,Boston Celtics,28.0
4,Jonas Jerebko,Boston Celtics,8.0
...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0
454,Raul Neto,Utah Jazz,25.0
455,Tibor Pleiss,Utah Jazz,21.0
456,Jeff Withey,Utah Jazz,24.0


In [57]:
# save columns to extract as a separate variable
columns_to_select = ['Name', 'Team', 'Number']

nba[columns_to_select]

Unnamed: 0,Name,Team,Number
0,Avery Bradley,Boston Celtics,0.0
1,Jae Crowder,Boston Celtics,99.0
2,John Holland,Boston Celtics,30.0
3,R.J. Hunter,Boston Celtics,28.0
4,Jonas Jerebko,Boston Celtics,8.0
...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0
454,Raul Neto,Utah Jazz,25.0
455,Tibor Pleiss,Utah Jazz,21.0
456,Jeff Withey,Utah Jazz,24.0


In [None]:
# when selecting multiple columns, output is a DataFrame

## Add a new column to a DataFrame