In [2]:
import pandas as pd

### Broadcasting Operations

- Operations: add, multiply, subtract, divide

- Drop/Fill NA's

- Convert with .astype( )

In [3]:
# import the csv file as a data frame
nba = pd.read_csv('../datasets/nba.csv') 

nba.head(2)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0


In [4]:
# use addd( ) for values in a series
nba["Age"].add(5)

# can also use
nba["Age"] + 5

# subtract a value
nba['Age'].sub(5)

# or
nba["Age"] - 5

0      20.0
1      20.0
2      22.0
3      17.0
4      24.0
       ... 
453    21.0
454    19.0
455    21.0
456    21.0
457     NaN
Name: Age, Length: 458, dtype: float64

In [5]:
# multiply - turn weight from pounds to kilograms using mul() and make permanent new colum called kilograms
nba['Weight in Kilograms'] = nba['Weight'].mul(.45392)

In [6]:
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Weight in Kilograms
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,81.7056
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,106.6712
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,93.0536
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,83.9752
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,104.85552


In [7]:
# create a new colum for representing salary in millions using div()
nba['Salary in Millions'] = nba['Salary'].div(1000000)

nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Weight in Kilograms,Salary in Millions
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,81.7056,7.730337
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,106.6712,6.796117
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,93.0536,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,83.9752,1.14864
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,104.85552,5.0


### Review of .value_counts( ) method

In [8]:
# count players by team
nba['Team'].value_counts()

New Orleans Pelicans      19
Memphis Grizzlies         18
New York Knicks           16
Milwaukee Bucks           16
Phoenix Suns              15
Miami Heat                15
Los Angeles Clippers      15
Denver Nuggets            15
Sacramento Kings          15
Los Angeles Lakers        15
Chicago Bulls             15
Atlanta Hawks             15
Boston Celtics            15
Houston Rockets           15
Indiana Pacers            15
Charlotte Hornets         15
Philadelphia 76ers        15
Golden State Warriors     15
Cleveland Cavaliers       15
Brooklyn Nets             15
Washington Wizards        15
Toronto Raptors           15
Portland Trail Blazers    15
San Antonio Spurs         15
Detroit Pistons           15
Utah Jazz                 15
Oklahoma City Thunder     15
Dallas Mavericks          15
Minnesota Timberwolves    14
Orlando Magic             14
Name: Team, dtype: int64

In [9]:
# count positions by player
nba['Position'].value_counts()

SG    102
PF    100
PG     92
SF     85
C      78
Name: Position, dtype: int64

### Drop rows with NA's and dropna( )

In [10]:
# reimport the nba file as needed
nba = pd.read_csv('../datasets/nba.csv')

In [12]:
# preview the last row with all missing values
nba.head(3) # index 2 has a null in salary
nba.tail(1) # entire row is null

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


#### Using dropna will remove ANY rows within the dataframe that have a null value. This would mean that for example, row 2 is removed becaseu it is only missing salary but also the last row with no data at all is removed.

Parameters:

nba.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

- axis can be = 0 or 'index', 1 or 'columns'
    * 0, or 'index' : Drop rows which contain missing values.
    * 1, or 'columns' : Drop columns which contain missing value.
    
- how can be: {'any', 'all'}, default 'any'
    * 'any' : If any NA values are present, drop that row or column.
    * 'all' : If all values are NA, drop that row or column.
    
- inplace can be True to permanently delete the nulls 

- subset : array-like, optional
    * Labels along other axis to consider, e.g. if you are dropping rows
    * these would be a list of columns to include


In [14]:
# the df shows row 2 is now gone as is the last row and any other null values
nba.dropna()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0
...,...,...,...,...,...,...,...,...,...
449,Rodney Hood,Utah Jazz,5.0,SG,23.0,6-8,206.0,Duke,1348440.0
451,Chris Johnson,Utah Jazz,23.0,SF,26.0,6-6,206.0,Dayton,981348.0
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0


In [16]:
# subset to remove a row within a column that is null
nba.head(3)

# this wil remov only rows with a missing value in Salary, notice row 2 is gone now, to make perm, reinit a var or use inplace=T
nba.dropna(subset = ['Salary'])

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
5,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0
...,...,...,...,...,...,...,...,...,...
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0


### Fill in null values with the fillna( ) method.

Fill NA's with a specified value.
- Works fine on a consistent numeric dataset but using it will prodcue a 0 (if chosen) for string columns.

In [17]:
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [19]:
# example of issue using 0, college (and the last row) now has 0 for missing values which dosent make sense for string vals.
nba.fillna(0)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,0,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,0,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,0,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


#### To solve for this, call a series and apply the fillna method. Make permanent with inplace = True

In [23]:
# now only salary is filled with 0's in the data drame.

nba['Salary'].fillna(0, inplace=True)

nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [25]:
# fill the string nulls with a new string value
nba['College'].fillna("No College", inplace=True)

nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,No College,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,No College,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,No College,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [35]:
# fill multiple cols with same data type
nba[['Age', 'Height', 'Weight']] = nba[['Age', 'Height', 'Weight']].fillna(0)

In [36]:
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,No College,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,No College,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,No College,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


### The .astype( ) method, convert datatypes on a series from one to another.

- Null values must be removed from the enitre row as applicable - 

In [46]:
# import data set again for a freash start, note we are dropping the row with nulls when reading in
nba = pd.read_csv('../datasets/nba.csv').dropna(how='all') 

nba.tail(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [47]:
# fill salary with 0 and college with "no college"
nba['Salary'].fillna(0, inplace = True)
nba['College'].fillna("No College", inplace=True)
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,No College,5000000.0


In [50]:
# verify with info()
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   457 non-null    object 
 8   Salary    457 non-null    float64
dtypes: float64(4), object(5)
memory usage: 35.7+ KB


In [51]:
# check out the dtypes - only objects and floats-remember, original numerics are ints but bc of nulls, pandas converts to float.
# becasue the last row was null across the row, all numerics were imported as floats and we can convert back to int's.
nba.dtypes

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [53]:
# convert salary back to an int, NOTE: astype does not have an inplce paramter so it needs reassigned!!
nba['Salary'] = nba['Salary'].astype('int')

nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0


In [54]:
# convert age and number to int's
nba['Age'] = nba['Age'].astype('int')
nba['Number'] = nba['Number'].astype('int')

nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0,PG,25,6-2,180.0,Texas,7730337
1,Jae Crowder,Boston Celtics,99,SF,25,6-6,235.0,Marquette,6796117
2,John Holland,Boston Celtics,30,SG,27,6-5,205.0,Boston University,0


In [55]:
# to decrease memory useage, convert a col like Position to a category and run nba.info() again to see memory decrease
nba['Position'] = nba['Position'].astype('category')

In [58]:
# from 35.7 to 27.4!!!
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Name      457 non-null    object  
 1   Team      457 non-null    category
 2   Number    457 non-null    int32   
 3   Position  457 non-null    category
 4   Age       457 non-null    int32   
 5   Height    457 non-null    object  
 6   Weight    457 non-null    float64 
 7   College   457 non-null    object  
 8   Salary    457 non-null    int32   
dtypes: category(2), float64(1), int32(3), object(3)
memory usage: 25.6+ KB


In [57]:
# Convert team to a category and run info() - 25.6 from 27.4
nba['Team'] = nba['Team'].astype('category')