In [82]:
import numpy as np
import pandas as pd

# this chapter in not in JDV's book, but feels important
# for writing clean and readable code!

In [83]:
spotify = pd.read_csv('data/spotify_songs.csv')
spotify.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


In [84]:
# spotify.dropna(inplace=True)
spotify.dtypes

track_id                     object
track_name                   object
track_artist                 object
track_popularity              int64
track_album_id               object
track_album_name             object
track_album_release_date     object
playlist_name                object
playlist_id                  object
playlist_genre               object
playlist_subgenre            object
danceability                float64
energy                      float64
key                           int64
loudness                    float64
mode                          int64
speechiness                 float64
acousticness                float64
instrumentalness            float64
liveness                    float64
valence                     float64
tempo                       float64
duration_ms                   int64
dtype: object

In [85]:
# lets grab some of these columns and narrow this down

cols = ['playlist_genre', 'track_id', 'track_name', 'track_popularity', 
        'danceability', 'energy', 'duration_ms']

spotify[cols].dtypes

track_id             object
track_name           object
track_popularity      int64
danceability        float64
energy              float64
duration_ms           int64
dtype: object

In [86]:
# let's look at some memory usage

spotify[cols].memory_usage(deep=True)

Index                   132
track_id            2593807
track_name          2480602
track_popularity     262664
danceability         262664
energy               262664
duration_ms          262664
dtype: int64

In [87]:
# total mem usage

spotify[cols].memory_usage(deep=True).sum()

6125197

In [88]:
# introducing chaining!

(spotify
    [cols]
    .select_dtypes(int)
    .describe()
)

Unnamed: 0,track_popularity,duration_ms
count,32833.0,32833.0
mean,42.477081,225799.811622
std,24.984074,59834.006182
min,0.0,4000.0
25%,24.0,187819.0
50%,45.0,216000.0
75%,62.0,253585.0
max,100.0,517810.0


In [89]:
# does track_popularity need to be an int64? probably not...

# lets ask numpy what the ranges are for int8
np.iinfo(np.int8)

iinfo(min=-128, max=127, dtype=int8)

In [90]:
np.iinfo(np.int16)

iinfo(min=-32768, max=32767, dtype=int16)

In [91]:
(spotify
    [cols]                                  # select cols
    .astype({'track_popularity':'int8'})    # astype -> dict -> int8
    .select_dtypes('int')                # this is just to reselect by dtype
    .describe()                             # and to display some results here
)

Unnamed: 0,duration_ms
count,32833.0
mean,225799.811622
std,59834.006182
min,4000.0
25%,187819.0
50%,216000.0
75%,253585.0
max,517810.0


In [92]:
# i think we can convert energy and danceability to a float32
# it won't save a ton of memory but it's something!

np.finfo(np.float32)

finfo(resolution=1e-06, min=-3.4028235e+38, max=3.4028235e+38, dtype=float32)

In [93]:
# yeah, float32 should work

(spotify
    [cols]
    .select_dtypes(float)
    .describe()
    )

Unnamed: 0,danceability,energy
count,32833.0,32833.0
mean,0.65485,0.698619
std,0.145085,0.18091
min,0.0,0.000175
25%,0.563,0.581
50%,0.672,0.721
75%,0.761,0.84
max,0.983,1.0


In [94]:
(spotify
    [cols]
    .astype({'danceability':'float32', 'energy':'float32'})
    .select_dtypes(float)
    .describe()
)

# looks clean, and data looks identical

Unnamed: 0,danceability,energy
count,32833.0,32833.0
mean,0.65485,0.698619
std,0.145086,0.180909
min,0.0,0.000175
25%,0.563,0.581
50%,0.672,0.721
75%,0.761,0.84
max,0.983,1.0


In [95]:
spotify[cols].dtypes

track_id             object
track_name           object
track_popularity      int64
danceability        float64
energy              float64
duration_ms           int64
dtype: object

In [96]:
# searching for NaN
# found some

%time spotify[spotify['track_name'].isnull()]

CPU times: user 858 µs, sys: 249 µs, total: 1.11 ms
Wall time: 887 µs


Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
8151,69gRFGOWY9OMpFJgFol1u0,,,0,717UG2du6utFe7CdmpuUe3,,2012-01-05,HIP&HOP,5DyJsJZOpMJh34WvUrQzMV,rap,...,6,-7.635,1,0.176,0.041,0.0,0.116,0.649,95.999,282707
9282,5cjecvX0CmC9gK0Laf5EMQ,,,0,3luHJEPw434tvNbme3SP8M,,2017-12-01,GANGSTA Rap,5GA8GDo7RQC3JEanT81B3g,rap,...,11,-5.364,0,0.319,0.0534,0.0,0.553,0.191,146.153,202235
9283,5TTzhRSWQS4Yu8xTgAuq6D,,,0,3luHJEPw434tvNbme3SP8M,,2017-12-01,GANGSTA Rap,5GA8GDo7RQC3JEanT81B3g,rap,...,10,-5.907,0,0.307,0.0963,0.0,0.0888,0.505,86.839,206465
19568,3VKFip3OdAvv4OfNTgFWeQ,,,0,717UG2du6utFe7CdmpuUe3,,2012-01-05,Reggaeton viejito🔥,0si5tw70PIgPkY1Eva6V8f,latin,...,11,-6.075,0,0.0366,0.0606,0.00653,0.103,0.726,97.017,252773
19811,69gRFGOWY9OMpFJgFol1u0,,,0,717UG2du6utFe7CdmpuUe3,,2012-01-05,latin hip hop,3nH8aytdqNeRbcRCg3dw9q,latin,...,6,-7.635,1,0.176,0.041,0.0,0.116,0.649,95.999,282707


In [97]:
# same deal, just curious which one is faster

%time (spotify[cols].query('track_name.isna()'))

CPU times: user 2.17 ms, sys: 924 µs, total: 3.1 ms
Wall time: 2.29 ms


Unnamed: 0,track_id,track_name,track_popularity,danceability,energy,duration_ms
8151,69gRFGOWY9OMpFJgFol1u0,,0,0.714,0.821,282707
9282,5cjecvX0CmC9gK0Laf5EMQ,,0,0.678,0.659,202235
9283,5TTzhRSWQS4Yu8xTgAuq6D,,0,0.465,0.82,206465
19568,3VKFip3OdAvv4OfNTgFWeQ,,0,0.675,0.919,252773
19811,69gRFGOWY9OMpFJgFol1u0,,0,0.714,0.821,282707


In [98]:
(spotify
    [cols]
    .assign(track_name=spotify.track_name.fillna('hello').astype('object'))
)

Unnamed: 0,track_id,track_name,track_popularity,danceability,energy,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,66,0.748,0.916,194754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,67,0.726,0.815,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,70,0.675,0.931,176616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,60,0.718,0.930,169093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,69,0.650,0.833,189052
...,...,...,...,...,...,...
32828,7bxnKAamR3snQ1VGLuVfC1,City Of Lights - Official Radio Edit,42,0.428,0.922,204375
32829,5Aevni09Em4575077nkWHz,Closer - Sultan & Ned Shepard Remix,20,0.522,0.786,353120
32830,7ImMqPP3Q1yfUHvsdn7wEo,Sweet Surrender - Radio Edit,14,0.529,0.821,210112
32831,2m69mhnfQ1Oq6lGtXuYhgX,Only For You - Maor Levi Remix,15,0.626,0.888,367432


In [99]:
# this is really useful for finding if something is categorical or more "cardinal"
# in this case, it's obvs pretty categorical

(spotify
    ['playlist_genre']
    .value_counts(dropna=False)
    )

playlist_genre
edm      6043
rap      5746
pop      5507
r&b      5431
latin    5155
rock     4951
Name: count, dtype: int64

In [100]:
spotify[cols].memory_usage(deep=True).sum()

6125197

In [101]:
# we can convert this to a categorical type

(spotify
    [cols]
    .astype({'playlist_genre':'category'})
    # .memory_usage(deep=True)
    # .sum()
    )

KeyError: "Only a column name can be used for the key in a dtype mappings argument. 'playlist_genre' not found in columns."