<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/dataframes_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataframes Part 1
- Series is a 1D data structure, i.e., 1 column
  - Only need 1 identifier or reference point to extract a particular value
- DataFrame is 2D data structure, i.e., rows and columns
  - I.e., a regular table or spreadsheet or SQL output
  - Don't care about number of rows and columns; instead, we care about number of points of reference; need 2 identifiers or points of reference

## Intro

In [12]:
# libraries needed
import numpy as np
import pandas as pd

In [13]:
# load data
nba = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/nba.csv'
)

nba
# note that index 457 is all missing

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [14]:
nba.shape # 458 rows

(458, 9)

In [15]:
nba['Name'].nunique # 458 unique playeers

<bound method IndexOpsMixin.nunique of 0      Avery Bradley
1        Jae Crowder
2       John Holland
3        R.J. Hunter
4      Jonas Jerebko
           ...      
453     Shelvin Mack
454        Raul Neto
455     Tibor Pleiss
456      Jeff Withey
457              NaN
Name: Name, Length: 458, dtype: object>

In [16]:
# number of missing values per column
(
    nba
      .isna()
      .sum(axis = 'rows')
)

Name         1
Team         1
Number       1
Position     1
Age          1
Height       1
Weight       1
College     85
Salary      12
dtype: int64

In [17]:
# whenever column has missing vlues and column is numeric, then datatype for that column is float

In [18]:
# number of non-missing values per column

(
    nba
      .notna()
      .sum(axis = 'rows')
)

Name        457
Team        457
Number      457
Position    457
Age         457
Height      457
Weight      457
College     373
Salary      446
dtype: int64

## Methods and Attributes between Series and DataFrames
- There are similarities (e.g., .shape) and differences (.columns)

In [20]:
# descriptive stats on numeric variables; this is a method
nba.describe()

Unnamed: 0,Number,Age,Weight,Salary
count,457.0,457.0,457.0,446.0
mean,17.678337,26.938731,221.522976,4842684.0
std,15.96609,4.404016,26.368343,5229238.0
min,0.0,19.0,161.0,30888.0
25%,5.0,24.0,200.0,1044792.0
50%,13.0,26.0,220.0,2839073.0
75%,25.0,30.0,240.0,6500000.0
max,99.0,40.0,307.0,25000000.0


In [22]:
# number of rows and columns
nba.shape
  # output is a tuple

(458, 9)

In [26]:
# create series
s = pd.Series(
    data = range(1, 6)   # from 1 thru 5, inclusive
)

s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [27]:
# view head and tail of dataframe
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [29]:
# view tail of dataframe
nba.tail()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [31]:
# view head of series
s.head()

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [32]:
# view tail of series
s.tail()

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [33]:
# view index of series
s.index

RangeIndex(start=0, stop=5, step=1)

In [34]:
# view index of dataframe
nba.index

RangeIndex(start=0, stop=458, step=1)

In [36]:
# view values in a series
s.values

array([1, 2, 3, 4, 5])

In [39]:
# view values in a dataframe
nba.values
  # list of lists

array([['Avery Bradley', 'Boston Celtics', 0.0, ..., 180.0, 'Texas',
        7730337.0],
       ['Jae Crowder', 'Boston Celtics', 99.0, ..., 235.0, 'Marquette',
        6796117.0],
       ['John Holland', 'Boston Celtics', 30.0, ..., 205.0,
        'Boston University', nan],
       ...,
       ['Tibor Pleiss', 'Utah Jazz', 21.0, ..., 256.0, nan, 2900000.0],
       ['Jeff Withey', 'Utah Jazz', 24.0, ..., 231.0, 'Kansas', 947276.0],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=object)

In [41]:
# shape of series
s.shape
  # 5 rows, 1-dimensional

(5,)

In [42]:
# shape of dataframe
nba.shape
  # 458 rows, 9 columns, 2-dimensional

(458, 9)

In [43]:
# datatype of values in series
s.dtype

dtype('int64')

In [49]:
# datatpe of vlaues in a dataframe
nba.dtypes
  # plural, dtypes, not singular
  # one for each column

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [52]:
# whether series has missing values
s.hasnans
  #.hasnans attributes not available in dataframe

False

In [53]:
# columns in dataframe
nba.columns
  # dataframe has 2 indices: rows and columns; rows are called index

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [55]:
# rows for nba
nba.index

RangeIndex(start=0, stop=458, step=1)

In [56]:
# a series is 1D, so there is no s.columns attribute

In [57]:
# for dataframe, the indexes
nba.axes

[RangeIndex(start=0, stop=458, step=1),
 Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
        'College', 'Salary'],
       dtype='object')]

In [58]:
# for series, the axes
s.axes

[RangeIndex(start=0, stop=5, step=1)]

In [60]:
# summary of data structure
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


In [65]:
# can't call s.info(); get an error; but video shows this

## Differences between shared methods