<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/dataframes_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataframes Part 1
- Series is a 1D data structure, i.e., 1 column
  - Only need 1 identifier or reference point to extract a particular value
- DataFrame is 2D data structure, i.e., rows and columns
  - I.e., a regular table or spreadsheet or SQL output
  - Don't care about number of rows and columns; instead, we care about number of points of reference; need 2 identifiers or points of reference

## Intro

In [141]:
# libraries needed
import numpy as np
import pandas as pd

In [142]:
# load data
nba = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/nba.csv'
)

nba
# note that index 457 is all missing

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [143]:
nba.shape # 458 rows

(458, 9)

In [144]:
nba['Name'].nunique # 458 unique playeers

<bound method IndexOpsMixin.nunique of 0      Avery Bradley
1        Jae Crowder
2       John Holland
3        R.J. Hunter
4      Jonas Jerebko
           ...      
453     Shelvin Mack
454        Raul Neto
455     Tibor Pleiss
456      Jeff Withey
457              NaN
Name: Name, Length: 458, dtype: object>

In [145]:
# number of missing values per column
(
    nba
      .isna()
      .sum(axis = 'rows')
)

Name         1
Team         1
Number       1
Position     1
Age          1
Height       1
Weight       1
College     85
Salary      12
dtype: int64

In [146]:
# whenever column has missing vlues and column is numeric, then datatype for that column is float

In [147]:
# number of non-missing values per column

(
    nba
      .notna()
      .sum(axis = 'rows')
)

Name        457
Team        457
Number      457
Position    457
Age         457
Height      457
Weight      457
College     373
Salary      446
dtype: int64

## Methods and Attributes between Series and DataFrames
- There are similarities (e.g., .shape) and differences (.columns)

In [148]:
# descriptive stats on numeric variables; this is a method
nba.describe()

Unnamed: 0,Number,Age,Weight,Salary
count,457.0,457.0,457.0,446.0
mean,17.678337,26.938731,221.522976,4842684.0
std,15.96609,4.404016,26.368343,5229238.0
min,0.0,19.0,161.0,30888.0
25%,5.0,24.0,200.0,1044792.0
50%,13.0,26.0,220.0,2839073.0
75%,25.0,30.0,240.0,6500000.0
max,99.0,40.0,307.0,25000000.0


In [149]:
# number of rows and columns
nba.shape
  # output is a tuple

(458, 9)

In [150]:
# create series
s = pd.Series(
    data = range(1, 6)   # from 1 thru 5, inclusive
)

s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [151]:
# view head and tail of dataframe
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [152]:
# view tail of dataframe
nba.tail()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [153]:
# view head of series
s.head()

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [154]:
# view tail of series
s.tail()

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [155]:
# view index of series
s.index

RangeIndex(start=0, stop=5, step=1)

In [156]:
# view index of dataframe
nba.index

RangeIndex(start=0, stop=458, step=1)

In [157]:
# view values in a series
s.values

array([1, 2, 3, 4, 5])

In [158]:
# view values in a dataframe
nba.values
  # list of lists

array([['Avery Bradley', 'Boston Celtics', 0.0, ..., 180.0, 'Texas',
        7730337.0],
       ['Jae Crowder', 'Boston Celtics', 99.0, ..., 235.0, 'Marquette',
        6796117.0],
       ['John Holland', 'Boston Celtics', 30.0, ..., 205.0,
        'Boston University', nan],
       ...,
       ['Tibor Pleiss', 'Utah Jazz', 21.0, ..., 256.0, nan, 2900000.0],
       ['Jeff Withey', 'Utah Jazz', 24.0, ..., 231.0, 'Kansas', 947276.0],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=object)

In [159]:
# shape of series
s.shape
  # 5 rows, 1-dimensional

(5,)

In [160]:
# shape of dataframe
nba.shape
  # 458 rows, 9 columns, 2-dimensional

(458, 9)

In [161]:
# datatype of values in series
s.dtype

dtype('int64')

In [162]:
# datatpe of vlaues in a dataframe
nba.dtypes
  # plural, dtypes, not singular
  # one for each column

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [163]:
# whether series has missing values
s.hasnans
  #.hasnans attributes not available in dataframe

False

In [164]:
# columns in dataframe
nba.columns
  # dataframe has 2 indices: rows and columns; rows are called index

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [165]:
# rows for nba
nba.index

RangeIndex(start=0, stop=458, step=1)

In [166]:
# a series is 1D, so there is no s.columns attribute

In [167]:
# for dataframe, the indexes
nba.axes

[RangeIndex(start=0, stop=458, step=1),
 Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
        'College', 'Salary'],
       dtype='object')]

In [168]:
# for series, the axes
s.axes

[RangeIndex(start=0, stop=5, step=1)]

In [169]:
# summary of data structure
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


In [170]:
# can't call s.info(); get an error; but video shows this

## Differences between shared methods

In [171]:
revenue = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/revenue.csv', 
    index_col = 'Date'
)

revenue

Unnamed: 0_level_0,New York,Los Angeles,Miami
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/16,985,122,499
1/2/16,738,788,534
1/3/16,14,20,933
1/4/16,730,904,885
1/5/16,114,71,253
1/6/16,936,502,497
1/7/16,123,996,115
1/8/16,935,492,886
1/9/16,846,954,823
1/10/16,54,285,216


In [172]:
# get sum for each city; so iterating through the rows
revenue.sum(axis = 'index')

New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

In [173]:
# above is equivalent to:
revenue.sum(axis = 0)

New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

In [174]:
# get sum for each date, so iterating through the columns
revenue.sum(axis = 'columns')

Date
1/1/16     1606
1/2/16     2060
1/3/16      967
1/4/16     2519
1/5/16      438
1/6/16     1935
1/7/16     1234
1/8/16     2313
1/9/16     2623
1/10/16     555
dtype: int64

In [175]:
# above is equivalent to:
revenue.sum(axis = 1)

Date
1/1/16     1606
1/2/16     2060
1/3/16      967
1/4/16     2519
1/5/16      438
1/6/16     1935
1/7/16     1234
1/8/16     2313
1/9/16     2623
1/10/16     555
dtype: int64

In [176]:
# best to think of index (0) and columns (1) as axes

In [177]:
# repeat above, but with .mean() method
revenue.mean(axis = 'columns') # average revenue per day

Date
1/1/16     535.333333
1/2/16     686.666667
1/3/16     322.333333
1/4/16     839.666667
1/5/16     146.000000
1/6/16     645.000000
1/7/16     411.333333
1/8/16     771.000000
1/9/16     874.333333
1/10/16    185.000000
dtype: float64

In [178]:
# average revenue per city
revenue.mean(axis = 'index')

New York       547.5
Los Angeles    513.4
Miami          564.1
dtype: float64

In [179]:
# dealing with 2D data structure, so need to specify how to sum or find mean

## Select one column from a Pandas DataFrame
- equivalient to R tidyverse dplyr::select()

In [180]:
nba.Name # this is a pandas series

0      Avery Bradley
1        Jae Crowder
2       John Holland
3        R.J. Hunter
4      Jonas Jerebko
           ...      
453     Shelvin Mack
454        Raul Neto
455     Tibor Pleiss
456      Jeff Withey
457              NaN
Name: Name, Length: 458, dtype: object

In [181]:
type(nba.Name)

pandas.core.series.Series

In [182]:
# above not recommended in case column name shares the same name as a method or attribute

In [183]:
# another example
nba.Salary

0      7730337.0
1      6796117.0
2            NaN
3      1148640.0
4      5000000.0
         ...    
453    2433333.0
454     900000.0
455    2900000.0
456     947276.0
457          NaN
Name: Salary, Length: 458, dtype: float64

In [184]:
# above not preferred if column has space in it

In [185]:
# recommended approach; can accomodate column names with spaces
nba['Name']

0      Avery Bradley
1        Jae Crowder
2       John Holland
3        R.J. Hunter
4      Jonas Jerebko
           ...      
453     Shelvin Mack
454        Raul Neto
455     Tibor Pleiss
456      Jeff Withey
457              NaN
Name: Name, Length: 458, dtype: object

In [186]:
# another example
nba['Salary']

0      7730337.0
1      6796117.0
2            NaN
3      1148640.0
4      5000000.0
         ...    
453    2433333.0
454     900000.0
455    2900000.0
456     947276.0
457          NaN
Name: Salary, Length: 458, dtype: float64

In [187]:
# above is case sensitive

In [188]:
# chaining
(
    nba['Salary']                        # this is a Series
      .head()
      .sort_values(ascending = False)
)

0    7730337.0
1    6796117.0
4    5000000.0
3    1148640.0
2          NaN
Name: Salary, dtype: float64

## Select multiple columns from Pandas DataFrame
- equivalent to R tidyverse dplyr::select()

In [189]:
# columns
nba.columns

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [190]:
nba[['Name', 'Team']]

Unnamed: 0,Name,Team
0,Avery Bradley,Boston Celtics
1,Jae Crowder,Boston Celtics
2,John Holland,Boston Celtics
3,R.J. Hunter,Boston Celtics
4,Jonas Jerebko,Boston Celtics
...,...,...
453,Shelvin Mack,Utah Jazz
454,Raul Neto,Utah Jazz
455,Tibor Pleiss,Utah Jazz
456,Jeff Withey,Utah Jazz


In [191]:
# order matters
nba[['Team', 'Name']]

Unnamed: 0,Team,Name
0,Boston Celtics,Avery Bradley
1,Boston Celtics,Jae Crowder
2,Boston Celtics,John Holland
3,Boston Celtics,R.J. Hunter
4,Boston Celtics,Jonas Jerebko
...,...,...
453,Utah Jazz,Shelvin Mack
454,Utah Jazz,Raul Neto
455,Utah Jazz,Tibor Pleiss
456,Utah Jazz,Jeff Withey


In [192]:
# Name, Team, Number
nba.loc[:, 'Name':'Number']    # all rows (:), columns Name thru Number,inclusive

Unnamed: 0,Name,Team,Number
0,Avery Bradley,Boston Celtics,0.0
1,Jae Crowder,Boston Celtics,99.0
2,John Holland,Boston Celtics,30.0
3,R.J. Hunter,Boston Celtics,28.0
4,Jonas Jerebko,Boston Celtics,8.0
...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0
454,Raul Neto,Utah Jazz,25.0
455,Tibor Pleiss,Utah Jazz,21.0
456,Jeff Withey,Utah Jazz,24.0


In [193]:
# save columns to extract as a separate variable
columns_to_select = ['Name', 'Team', 'Number']

nba[columns_to_select]

Unnamed: 0,Name,Team,Number
0,Avery Bradley,Boston Celtics,0.0
1,Jae Crowder,Boston Celtics,99.0
2,John Holland,Boston Celtics,30.0
3,R.J. Hunter,Boston Celtics,28.0
4,Jonas Jerebko,Boston Celtics,8.0
...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0
454,Raul Neto,Utah Jazz,25.0
455,Tibor Pleiss,Utah Jazz,21.0
456,Jeff Withey,Utah Jazz,24.0


In [194]:
# when selecting multiple columns, output is a DataFrame

## Add a new column to a DataFrame

In [195]:
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [196]:
# add a new column indicating sport played by player
nba['Sport'] = 'Basketball'

# examine
nba 

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Sport
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,Basketball
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,Basketball
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,Basketball
...,...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0,Basketball
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0,Basketball
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0,Basketball
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0,Basketball


In [197]:
# add a new column indicating league
nba['League'] = 'NBA'

In [198]:
# examine
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Sport,League
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball,NBA
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball,NBA
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,Basketball,NBA
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,Basketball,NBA
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,Basketball,NBA
...,...,...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0,Basketball,NBA
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0,Basketball,NBA
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0,Basketball,NBA
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0,Basketball,NBA


In [199]:
# examine
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Sport,League
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball,NBA
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball,NBA
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,Basketball,NBA
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,Basketball,NBA
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,Basketball,NBA
...,...,...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0,Basketball,NBA
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0,Basketball,NBA
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0,Basketball,NBA
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0,Basketball,NBA


In [200]:
type(nba)

pandas.core.frame.DataFrame

In [201]:
# drop Sport and League
nba.drop(columns = ['Sport', 'League'], inplace = True)   # if you reassign, then it's no longer a Pandas DataFrame..hmmmm

In [202]:
# view
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [203]:
# insert Sport column after Name
nba.insert(loc = 3, column = 'Sport', value = 'Basketball')

# examine
nba

Unnamed: 0,Name,Team,Number,Sport,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,Basketball,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,Basketball,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,Basketball,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,Basketball,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,Basketball,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,Basketball,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,Basketball,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,Basketball,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,Basketball,C,26.0,7-0,231.0,Kansas,947276.0


In [204]:
nba.insert(loc = 0, column = 'League', value = 'NBA')

In [205]:
# examine
nba

Unnamed: 0,League,Name,Team,Number,Sport,Position,Age,Height,Weight,College,Salary
0,NBA,Avery Bradley,Boston Celtics,0.0,Basketball,PG,25.0,6-2,180.0,Texas,7730337.0
1,NBA,Jae Crowder,Boston Celtics,99.0,Basketball,SF,25.0,6-6,235.0,Marquette,6796117.0
2,NBA,John Holland,Boston Celtics,30.0,Basketball,SG,27.0,6-5,205.0,Boston University,
3,NBA,R.J. Hunter,Boston Celtics,28.0,Basketball,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,NBA,Jonas Jerebko,Boston Celtics,8.0,Basketball,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...,...,...
453,NBA,Shelvin Mack,Utah Jazz,8.0,Basketball,PG,26.0,6-3,203.0,Butler,2433333.0
454,NBA,Raul Neto,Utah Jazz,25.0,Basketball,PG,24.0,6-1,179.0,,900000.0
455,NBA,Tibor Pleiss,Utah Jazz,21.0,Basketball,C,26.0,7-3,256.0,,2900000.0
456,NBA,Jeff Withey,Utah Jazz,24.0,Basketball,C,26.0,7-0,231.0,Kansas,947276.0


In [206]:
# .insert() method seems to do it inplace = True, even though there is no argument

## Create a new column from exisiting data

In [207]:
# age in 10 years
nba['Age_In_10_Years'] = nba['Age'] + 10
  # note that right hand side of equal sign is evaluated first

In [208]:
# examine
nba.head()

Unnamed: 0,League,Name,Team,Number,Sport,Position,Age,Height,Weight,College,Salary,Age_In_10_Years
0,NBA,Avery Bradley,Boston Celtics,0.0,Basketball,PG,25.0,6-2,180.0,Texas,7730337.0,35.0
1,NBA,Jae Crowder,Boston Celtics,99.0,Basketball,SF,25.0,6-6,235.0,Marquette,6796117.0,35.0
2,NBA,John Holland,Boston Celtics,30.0,Basketball,SG,27.0,6-5,205.0,Boston University,,37.0
3,NBA,R.J. Hunter,Boston Celtics,28.0,Basketball,SG,22.0,6-5,185.0,Georgia State,1148640.0,32.0
4,NBA,Jonas Jerebko,Boston Celtics,8.0,Basketball,PF,29.0,6-10,231.0,,5000000.0,39.0


In [209]:
# another way
nba['Age_In_10_Years'] = nba['Age'].add(10)

# examine
nba.head()

Unnamed: 0,League,Name,Team,Number,Sport,Position,Age,Height,Weight,College,Salary,Age_In_10_Years
0,NBA,Avery Bradley,Boston Celtics,0.0,Basketball,PG,25.0,6-2,180.0,Texas,7730337.0,35.0
1,NBA,Jae Crowder,Boston Celtics,99.0,Basketball,SF,25.0,6-6,235.0,Marquette,6796117.0,35.0
2,NBA,John Holland,Boston Celtics,30.0,Basketball,SG,27.0,6-5,205.0,Boston University,,37.0
3,NBA,R.J. Hunter,Boston Celtics,28.0,Basketball,SG,22.0,6-5,185.0,Georgia State,1148640.0,32.0
4,NBA,Jonas Jerebko,Boston Celtics,8.0,Basketball,PF,29.0,6-10,231.0,,5000000.0,39.0


In [210]:
# subtract 5 million from salary
nba['Salary'].subtract(5_000_000)

0      2730337.0
1      1796117.0
2            NaN
3     -3851360.0
4            0.0
         ...    
453   -2566667.0
454   -4100000.0
455   -2100000.0
456   -4052724.0
457          NaN
Name: Salary, Length: 458, dtype: float64

In [211]:
# assign to a new variable in nba
nba['Salary_Less_5_Million'] = nba['Salary'].subtract(5_000_000)

# examine
nba.head()

Unnamed: 0,League,Name,Team,Number,Sport,Position,Age,Height,Weight,College,Salary,Age_In_10_Years,Salary_Less_5_Million
0,NBA,Avery Bradley,Boston Celtics,0.0,Basketball,PG,25.0,6-2,180.0,Texas,7730337.0,35.0,2730337.0
1,NBA,Jae Crowder,Boston Celtics,99.0,Basketball,SF,25.0,6-6,235.0,Marquette,6796117.0,35.0,1796117.0
2,NBA,John Holland,Boston Celtics,30.0,Basketball,SG,27.0,6-5,205.0,Boston University,,37.0,
3,NBA,R.J. Hunter,Boston Celtics,28.0,Basketball,SG,22.0,6-5,185.0,Georgia State,1148640.0,32.0,-3851360.0
4,NBA,Jonas Jerebko,Boston Celtics,8.0,Basketball,PF,29.0,6-10,231.0,,5000000.0,39.0,0.0


In [212]:
# new column, weight in kg; 1 lb = 0.4535 kg
nba['Weight_kg'] = nba['Weight'].mul(0.4535)

# examine
nba.head()

Unnamed: 0,League,Name,Team,Number,Sport,Position,Age,Height,Weight,College,Salary,Age_In_10_Years,Salary_Less_5_Million,Weight_kg
0,NBA,Avery Bradley,Boston Celtics,0.0,Basketball,PG,25.0,6-2,180.0,Texas,7730337.0,35.0,2730337.0,81.63
1,NBA,Jae Crowder,Boston Celtics,99.0,Basketball,SF,25.0,6-6,235.0,Marquette,6796117.0,35.0,1796117.0,106.5725
2,NBA,John Holland,Boston Celtics,30.0,Basketball,SG,27.0,6-5,205.0,Boston University,,37.0,,92.9675
3,NBA,R.J. Hunter,Boston Celtics,28.0,Basketball,SG,22.0,6-5,185.0,Georgia State,1148640.0,32.0,-3851360.0,83.8975
4,NBA,Jonas Jerebko,Boston Celtics,8.0,Basketball,PF,29.0,6-10,231.0,,5000000.0,39.0,0.0,104.7585


## Review of .value_counts() method

In [213]:
# get nba dataset
nba = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/nba.csv'
)

nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [214]:
# this isn't that informative
nba.value_counts()

Name                    Team                   Number  Position  Age   Height  Weight  College         Salary    
Aaron Brooks            Chicago Bulls          0.0     PG        31.0  6-0     161.0   Oregon          2250000.0     1
Mike Muscala            Atlanta Hawks          31.0    PF        24.0  6-11    240.0   Bucknell        947276.0      1
Mike Dunleavy           Chicago Bulls          34.0    SG        35.0  6-9     230.0   Duke            4500000.0     1
Mike Conley             Memphis Grizzlies      11.0    PG        28.0  6-1     175.0   Ohio State      9588426.0     1
Michael Kidd-Gilchrist  Charlotte Hornets      14.0    SF        22.0  6-7     232.0   Kentucky        6331404.0     1
                                                                                                                    ..
Hassan Whiteside        Miami Heat             21.0    C         26.0  7-0     265.0   Marshall        981348.0      1
Harrison Barnes         Golden State Warriors  40.0  

In [215]:
# most popular position
(
    nba['Position']
      .value_counts(normalize = True)
      .mul(100)
)

# SG = shooting guard

SG    22.319475
PF    21.881838
PG    20.131291
SF    18.599562
C     17.067834
Name: Position, dtype: float64

In [216]:
# most common salary
(
    nba['Salary']
      .value_counts()
)

947276.0      31
845059.0      18
525093.0      13
981348.0       6
16407500.0     5
              ..
2100000.0      1
1252440.0      1
2891760.0      1
3272091.0      1
900000.0       1
Name: Salary, Length: 309, dtype: int64

## .dropna() method; drop rows with missing values

In [217]:
# last row has missing values
nba.tail()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [218]:
(
    nba
      .isna()
      .sum(axis = 'columns')
      .sort_values(ascending = False)
)

457    9
397    2
353    2
350    1
358    1
      ..
165    0
164    0
163    0
162    0
229    0
Length: 458, dtype: int64

In [219]:
(
  nba.dropna()  # by default, will remove any row with missing data
)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0
...,...,...,...,...,...,...,...,...,...
449,Rodney Hood,Utah Jazz,5.0,SG,23.0,6-8,206.0,Duke,1348440.0
451,Chris Johnson,Utah Jazz,23.0,SF,26.0,6-6,206.0,Dayton,981348.0
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0


In [220]:
# only remove completely blank rows
nba.dropna(how = 'all')

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0


In [221]:
# only remove rows where College is missing
nba.dropna(subset = ['College'])

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
...,...,...,...,...,...,...,...,...,...
449,Rodney Hood,Utah Jazz,5.0,SG,23.0,6-8,206.0,Duke,1348440.0
451,Chris Johnson,Utah Jazz,23.0,SF,26.0,6-6,206.0,Dayton,981348.0
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0


In [222]:
# remove rows if missing value in College or Salary
nba.dropna(subset = ['College', 'Salary'])           # only include rows where College and Salary are non-null

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0
...,...,...,...,...,...,...,...,...,...
449,Rodney Hood,Utah Jazz,5.0,SG,23.0,6-8,206.0,Duke,1348440.0
451,Chris Johnson,Utah Jazz,23.0,SF,26.0,6-6,206.0,Dayton,981348.0
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0


## Fill in missing values with .fillna() method

In [223]:
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [224]:
# replace every missing value with 0; doesn't make sense
nba.fillna(value = 0)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,0,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,0,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,0,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [225]:
# in college column, replace missing with unknown
nba['College'] = nba['College'].fillna('Unknown')
  # instead of inplace = True, which isn't recommended and planned to be deprecated

In [226]:
# examine
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,Unknown,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,Unknown,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,Unknown,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [227]:
# replace missing salary with average salary
nba['Salary'].isna().sum()
  # 12 missing values

12

In [228]:
nba['Salary'] = nba['Salary'].fillna(value = nba['Salary'].mean())

In [229]:
nba['Salary'].isna().sum()

0

In [230]:
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7.730337e+06
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6.796117e+06
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,4.842684e+06
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1.148640e+06
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,Unknown,5.000000e+06
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2.433333e+06
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,Unknown,9.000000e+05
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,Unknown,2.900000e+06
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,9.472760e+05


## The .astype() method, part 1

In [231]:
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7.730337e+06
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6.796117e+06
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,4.842684e+06
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1.148640e+06
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,Unknown,5.000000e+06
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2.433333e+06
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,Unknown,9.000000e+05
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,Unknown,2.900000e+06
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,9.472760e+05


In [232]:
# are there missing values in Age column?
nba['Age'].hasnans
  # Yes, since output is True

True

In [233]:
# how many missing values in Age column?
nba['Age'].isna().sum()
  # just 1

1

In [234]:
# drop rows where values are all missing
nba = nba.dropna(how = 'all')
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7.730337e+06
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6.796117e+06
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,4.842684e+06
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1.148640e+06
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,Unknown,5.000000e+06
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2.433333e+06
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,Unknown,9.000000e+05
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,Unknown,2.900000e+06
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,9.472760e+05


In [235]:
# remove last row
nba = nba.iloc[:457,]
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7.730337e+06
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6.796117e+06
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,4.842684e+06
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1.148640e+06
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,Unknown,5.000000e+06
...,...,...,...,...,...,...,...,...,...
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2.239800e+06
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2.433333e+06
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,Unknown,9.000000e+05
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,Unknown,2.900000e+06


In [236]:
# Age column is a float
nba['Age'].dtype

dtype('float64')

In [237]:
# convert Age to int
nba['Age'].astype('int')

0      25
1      25
2      27
3      22
4      29
       ..
452    20
453    26
454    24
455    26
456    26
Name: Age, Length: 457, dtype: int64

In [238]:
# doesn't change in place
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   457 non-null    object 
 8   Salary    457 non-null    float64
dtypes: float64(4), object(5)
memory usage: 35.7+ KB


In [239]:
nba['Age'] = nba['Age'].astype('int')

# view
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    int64  
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   457 non-null    object 
 8   Salary    457 non-null    float64
dtypes: float64(3), int64(1), object(5)
memory usage: 35.7+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba['Age'] = nba['Age'].astype('int')


In [240]:
# if a column/series has missing values, then can't convert to int; must be a float

In [241]:
# .astype('str') to convert to string
# .astype('int') to convert to integer

## The .astype() method, Part 2
- category type, saves memory; uses less memory
- when there are relatively few unique values relative to total number of rows

In [242]:
# reload data
nba = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/nba.csv'
)

nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [243]:
# remove last row, which has all missing values
nba = nba.dropna(how = 'all')
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0


In [244]:
# when column/series has a few number of unique values, e.g.,
  # month
  # blood type
  # gender
  # Position in nba dataset

In [245]:
# data type
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 35.7+ KB


In [246]:
# unique values of Position
print(nba['Position'].unique())    # show the unique values
print(nba['Position'].nunique())   # number of unique values

['PG' 'SF' 'SG' 'PF' 'C']
5


In [247]:
nba['Position'].nunique(dropna = False) # defaults to True; this is the only parameter in .nunique() method

5

In [248]:
nba['Position'].hasnans # no missing values in Position

False

In [249]:
# if you just want to see data types of each column/series
nba.dtypes

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [250]:
# more info than just data type; also get non-null count
nba.info()
  # also get memory usage at bottom

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 35.7+ KB


In [251]:
# get null count for each column/series
nba.isnull().sum(axis = 'index')

Name         0
Team         0
Number       0
Position     0
Age          0
Height       0
Weight       0
College     84
Salary      11
dtype: int64

In [252]:
# get non null count for each column/series
nba.notnull().sum(axis = 'index')

Name        457
Team        457
Number      457
Position    457
Age         457
Height      457
Weight      457
College     373
Salary      446
dtype: int64

In [253]:
# convert Position to category
nba['Position'].astype('category')

0      PG
1      SF
2      SG
3      SG
4      PF
       ..
452    PF
453    PG
454    PG
455     C
456     C
Name: Position, Length: 457, dtype: category
Categories (5, object): ['C', 'PF', 'PG', 'SF', 'SG']

In [254]:
# above will reduce memory

In [255]:
nba['Position'] = nba['Position'].astype('category')

# view memory usage
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Name      457 non-null    object  
 1   Team      457 non-null    object  
 2   Number    457 non-null    float64 
 3   Position  457 non-null    category
 4   Age       457 non-null    float64 
 5   Height    457 non-null    object  
 6   Weight    457 non-null    float64 
 7   College   373 non-null    object  
 8   Salary    446 non-null    float64 
dtypes: category(1), float64(4), object(4)
memory usage: 32.8+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba['Position'] = nba['Position'].astype('category')


In [256]:
# convert Team to category

# how many unique teams
nba['Team'].nunique(dropna = False)   # 30 unique values

30

In [257]:
# any missing?
nba['Team'].hasnans
  # no, none missing

False

In [258]:
nba['Team'] = nba['Team'].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba['Team'] = nba['Team'].astype('category')


In [259]:
# now Team is a category
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Name      457 non-null    object  
 1   Team      457 non-null    category
 2   Number    457 non-null    float64 
 3   Position  457 non-null    category
 4   Age       457 non-null    float64 
 5   Height    457 non-null    object  
 6   Weight    457 non-null    float64 
 7   College   373 non-null    object  
 8   Salary    446 non-null    float64 
dtypes: category(2), float64(4), object(3)
memory usage: 30.9+ KB


## Sort a DataFrame with the .sort_values() method, Part 1

In [260]:
# reload data
nba = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/nba.csv'
)

nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [261]:
# sort by name
nba.sort_values(by = ['Name'], ascending = True)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
152,Aaron Brooks,Chicago Bulls,0.0,PG,31.0,6-0,161.0,Oregon,2250000.0
356,Aaron Gordon,Orlando Magic,0.0,PF,20.0,6-9,220.0,Arizona,4171680.0
328,Aaron Harrison,Charlotte Hornets,9.0,SG,21.0,6-6,210.0,Kentucky,525093.0
404,Adreian Payne,Minnesota Timberwolves,33.0,PF,25.0,6-10,237.0,Michigan State,1938840.0
312,Al Horford,Atlanta Hawks,15.0,C,30.0,6-10,245.0,Florida,12000000.0
...,...,...,...,...,...,...,...,...,...
270,Xavier Munford,Memphis Grizzlies,14.0,PG,24.0,6-3,180.0,Rhode Island,
402,Zach LaVine,Minnesota Timberwolves,8.0,PG,21.0,6-5,189.0,UCLA,2148360.0
271,Zach Randolph,Memphis Grizzlies,50.0,PF,34.0,6-9,260.0,Michigan State,9638555.0
237,Zaza Pachulia,Dallas Mavericks,27.0,C,32.0,6-11,275.0,,5200000.0


In [262]:
# sort by Age
nba.sort_values(by = 'Age', ascending = False)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
304,Andre Miller,San Antonio Spurs,24.0,PG,40.0,6-3,200.0,Utah,250750.0
400,Kevin Garnett,Minnesota Timberwolves,21.0,PF,40.0,6-11,240.0,,8500000.0
298,Tim Duncan,San Antonio Spurs,21.0,C,40.0,6-11,250.0,Wake Forest,5250000.0
261,Vince Carter,Memphis Grizzlies,15.0,SG,39.0,6-6,220.0,North Carolina,4088019.0
102,Pablo Prigioni,Los Angeles Clippers,9.0,PG,39.0,6-3,185.0,,947726.0
...,...,...,...,...,...,...,...,...,...
401,Tyus Jones,Minnesota Timberwolves,1.0,PG,20.0,6-2,195.0,Duke,1282080.0
60,Christian Wood,Philadelphia 76ers,35.0,PF,20.0,6-11,220.0,UNLV,525093.0
226,Rashad Vaughn,Milwaukee Bucks,20.0,SG,19.0,6-6,202.0,UNLV,1733040.0
122,Devin Booker,Phoenix Suns,1.0,SG,19.0,6-6,206.0,Kentucky,2127840.0


In [263]:
# sort by Salary
nba.sort_values(by = 'Salary', na_position = 'first') # by default, missing values are clustered 'last'; change to 'first' here# 

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
46,Elton Brand,Philadelphia 76ers,42.0,PF,37.0,6-9,254.0,Duke,
171,Dahntay Jones,Cleveland Cavaliers,30.0,SG,35.0,6-6,225.0,Duke,
264,Jordan Farmar,Memphis Grizzlies,4.0,PG,29.0,6-2,180.0,UCLA,
269,Ray McCallum,Memphis Grizzlies,5.0,PG,24.0,6-3,190.0,Detroit,
...,...,...,...,...,...,...,...,...,...
339,Chris Bosh,Miami Heat,1.0,PF,32.0,6-11,235.0,Georgia Tech,22192730.0
251,Dwight Howard,Houston Rockets,12.0,C,30.0,6-11,265.0,,22359364.0
33,Carmelo Anthony,New York Knicks,7.0,SF,32.0,6-8,240.0,Syracuse,22875000.0
169,LeBron James,Cleveland Cavaliers,23.0,SF,31.0,6-8,250.0,,22970500.0


In [264]:
# make nba DataFrame sorted by Age alphabtically ascending
nba = (
    nba
      .sort_values(by = 'Name')
      .reset_index(drop = True)    # reset the index to be 0, 1, 2, 3, etc.; drop = True prevents the current index from becoming a column
)

nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Aaron Brooks,Chicago Bulls,0.0,PG,31.0,6-0,161.0,Oregon,2250000.0
1,Aaron Gordon,Orlando Magic,0.0,PF,20.0,6-9,220.0,Arizona,4171680.0
2,Aaron Harrison,Charlotte Hornets,9.0,SG,21.0,6-6,210.0,Kentucky,525093.0
3,Adreian Payne,Minnesota Timberwolves,33.0,PF,25.0,6-10,237.0,Michigan State,1938840.0
4,Al Horford,Atlanta Hawks,15.0,C,30.0,6-10,245.0,Florida,12000000.0
...,...,...,...,...,...,...,...,...,...
453,Xavier Munford,Memphis Grizzlies,14.0,PG,24.0,6-3,180.0,Rhode Island,
454,Zach LaVine,Minnesota Timberwolves,8.0,PG,21.0,6-5,189.0,UCLA,2148360.0
455,Zach Randolph,Memphis Grizzlies,50.0,PF,34.0,6-9,260.0,Michigan State,9638555.0
456,Zaza Pachulia,Dallas Mavericks,27.0,C,32.0,6-11,275.0,,5200000.0


## .sort_values() method, Part 2
- Use list to sort by multiple columns
  - by and ascending parameters; list for each as the argument

In [265]:
# reload data
nba = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/nba.csv'
)

nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [266]:
# remove last row
nba = nba.dropna(how = 'all')
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0


In [267]:
# sort by Team ASC and Name ASC
# SQL ORDER BY Team ASC, Name ASC

(
    nba
      .sort_values(by = ['Team', 'Name'], ascending = [True, True])     # better for more granular control over sorting
)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
312,Al Horford,Atlanta Hawks,15.0,C,30.0,6-10,245.0,Florida,12000000.0
318,Dennis Schroder,Atlanta Hawks,17.0,PG,22.0,6-1,172.0,,1763400.0
323,Jeff Teague,Atlanta Hawks,0.0,PG,27.0,6-2,186.0,Wake Forest,8000000.0
309,Kent Bazemore,Atlanta Hawks,24.0,SF,26.0,6-5,201.0,Old Dominion,2000000.0
311,Kirk Hinrich,Atlanta Hawks,12.0,SG,35.0,6-4,190.0,Kansas,2854940.0
...,...,...,...,...,...,...,...,...,...
381,Marcus Thornton,Washington Wizards,15.0,SF,29.0,6-4,205.0,LSU,200600.0
376,Markieff Morris,Washington Wizards,5.0,PF,26.0,6-10,245.0,Kansas,8000000.0
375,Nene Hilario,Washington Wizards,42.0,C,33.0,6-11,250.0,,13000000.0
378,Otto Porter Jr.,Washington Wizards,22.0,SF,23.0,6-8,198.0,Georgetown,4662960.0


In [268]:
# sort by multiple columns, all ASC
# SQL ORDER BY 1, 2, 3, 4, etc.

(
    nba
      .sort_values(by = ['Team', 'Position', 'Salary'], ascending = True)    # sort Team, Position, and Salary all ascending
)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
322,Walter Tavares,Atlanta Hawks,22.0,C,24.0,7-3,260.0,,1000000.0
321,Tiago Splitter,Atlanta Hawks,11.0,C,31.0,6-11,245.0,,9756250.0
312,Al Horford,Atlanta Hawks,15.0,C,30.0,6-10,245.0,Florida,12000000.0
316,Mike Muscala,Atlanta Hawks,31.0,PF,24.0,6-11,240.0,Bucknell,947276.0
313,Kris Humphries,Atlanta Hawks,43.0,PF,31.0,6-9,235.0,Minnesota,1000000.0
...,...,...,...,...,...,...,...,...,...
378,Otto Porter Jr.,Washington Wizards,22.0,SF,23.0,6-8,198.0,Georgetown,4662960.0
371,Jarell Eddie,Washington Wizards,8.0,SG,24.0,6-7,218.0,Virginia Tech,561716.0
380,Garrett Temple,Washington Wizards,17.0,SG,30.0,6-6,195.0,LSU,1100602.0
368,Alan Anderson,Washington Wizards,6.0,SG,33.0,6-6,220.0,Michigan State,4000000.0


## .sort_index() method
- Basically undoes .sort_values()

In [269]:
# load data
nba = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/nba.csv'
)

# remove last row which is all null
nba = nba.dropna(how = 'all')

# examine
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0


In [270]:
# sort by Number and Age, ASC
(
    nba
      .sort_values(by = ['Number', 'Age'], ascending = True)   # SQL ORDER BY Number ASC, Age ASC
      .sort_index(ascending = True)                            # by default it's True for ascending parameter
)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0


## .rank() method

In [272]:
# reload data
nba = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/nba.csv'
)

# remove last row which is all null
nba = nba.dropna(how = 'all')

# examine
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0


In [274]:
# missing values in Salary column/Series, replace with 0
nba['Salary'] = nba['Salary'].fillna(0)
nba['Salary'].hasnans

False

In [277]:
# convert Salary from float to int
nba['Salary'] = nba['Salary'].astype('int')
nba['Salary'].dtype

dtype('int64')

In [282]:
# rank Salary from highest (1) to lowest; save as own column/Series called Salary_Rank
nba['Salary_Rank'] = (
    nba['Salary']
      .rank(ascending = False)
      .astype('int')
)

# examine
nba.sort_values(by = 'Salary', ascending = False)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Salary_Rank
109,Kobe Bryant,Los Angeles Lakers,24.0,SF,37.0,6-6,212.0,,25000000,1
169,LeBron James,Cleveland Cavaliers,23.0,SF,31.0,6-8,250.0,,22970500,2
33,Carmelo Anthony,New York Knicks,7.0,SF,32.0,6-8,240.0,Syracuse,22875000,3
251,Dwight Howard,Houston Rockets,12.0,C,30.0,6-11,265.0,,22359364,4
339,Chris Bosh,Miami Heat,1.0,PF,32.0,6-11,235.0,Georgia Tech,22192730,5
...,...,...,...,...,...,...,...,...,...,...
353,Dorell Wright,Miami Heat,11.0,SF,30.0,6-9,205.0,,0,452
264,Jordan Farmar,Memphis Grizzlies,4.0,PG,29.0,6-2,180.0,UCLA,0,452
409,Greg Smith,Minnesota Timberwolves,4.0,PF,25.0,6-10,250.0,Fresno State,0,452
273,Alex Stepheson,Memphis Grizzlies,35.0,PF,28.0,6-10,270.0,USC,0,452


In [None]:
## done with this section