# Real World Data Cleaning in Python, Pandas
Dataset used is `youtube-top-100-songs-2025.csv`

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [76]:
df= pd.read_csv('data/youtube-top-100-songs-2025.csv')
print(df.shape)
print(len(df))
print(df.columns.tolist())


(100, 13)
100
['title', 'fulltitle', 'description', 'view_count', 'categories', 'tags', 'duration', 'duration_string', 'live_status', 'thumbnail', 'channel', 'channel_url', 'channel_follower_count']


In [77]:
# rename some columns
df= df.rename(columns={'channel':'channel_name','fulltitle':'full_title'})
df.columns.tolist()

['title',
 'full_title',
 'description',
 'view_count',
 'categories',
 'tags',
 'duration',
 'duration_string',
 'live_status',
 'thumbnail',
 'channel_name',
 'channel_url',
 'channel_follower_count']

In [78]:
# check null values in any columns
df.isnull().any()


title                     False
full_title                False
description               False
view_count                False
categories                False
tags                       True
duration                  False
duration_string           False
live_status               False
thumbnail                 False
channel_name              False
channel_url               False
channel_follower_count    False
dtype: bool

In [79]:
df[df['tags'].isna()==1] # df filtered to show which entries contain null values under the tag column

Unnamed: 0,title,full_title,description,view_count,categories,tags,duration,duration_string,live_status,thumbnail,channel_name,channel_url,channel_follower_count
10,Zillionaire Doe - How It's Going (Official Video),Zillionaire Doe - How It's Going (Official Video),"Stream ""How It's Going"": https://zillionairedo...",1198540,People & Blogs,,152,2:32,False,https://i.ytimg.com/vi/PDcDcSTiL4g/maxresdefau...,Zillionaire Doe,https://www.youtube.com/channel/UCbVdo2e1hDYR9...,142000
11,Alex Warren - Ordinary (Official Video),Alex Warren - Ordinary (Official Video),"'You'll Be Alright, Kid' out now: https://alex...",147871443,People & Blogs,,187,3:07,False,https://i.ytimg.com/vi_webp/u2ah9tWTkmk/maxres...,Alex Warren,https://www.youtube.com/channel/UCX2Pm1JoWF3ch...,4050000
12,Kendrick Lamar - luther (Official Audio),Kendrick Lamar - luther (Official Audio),Kendrick Lamar “GNX” is available now: https:/...,142084039,People & Blogs,,178,2:58,False,https://i.ytimg.com/vi/HfWLgELllZs/maxresdefau...,Kendrick Lamar,https://www.youtube.com/channel/UC3lBXcrKFnFAF...,19700000
22,Tommy Richman - MILLION DOLLAR BABY (Official ...,Tommy Richman - MILLION DOLLAR BABY (Official ...,"Stream ""MILLION DOLLAR BABY"" OUT NOW: https://...",45822601,Music,,156,2:36,False,https://i.ytimg.com/vi_webp/bUX8MDNQda4/maxres...,Tommy Richman,https://www.youtube.com/channel/UC7eumlwt8zpmN...,655000
25,"Adam Port, Stryv - Move feat. Malachiii (Exten...","Adam Port, Stryv - Move feat. Malachiii (Exten...","Adam Port, Stryv - Move (Extended Version) fea...",87005996,Music,,353,5:53,False,https://i.ytimg.com/vi/95dB-ObZ7Ho/maxresdefau...,keinemusik,https://www.youtube.com/channel/UCYzx8QoAiRb69...,255000
26,Tutipsy & Niickii - Afro Disco (Visualizer),Tutipsy & Niickii - Afro Disco (Visualizer),New Single: “Afro Disco\nListen Now: https://s...,17322,Music,,154,2:34,False,https://i.ytimg.com/vi_webp/dtJg6bjrUuo/maxres...,NIICKII,https://www.youtube.com/channel/UChcyluLsfj0ge...,47
31,NIGHT SKIES (Official Music Video),NIGHT SKIES (Official Music Video),NIGHT SKIES (Official Music Video)\n\n\nArtist...,6605,Music,,137,2:17,False,https://i.ytimg.com/vi/c4Zxc2gqfo8/sddefault.jpg,Chxnny,https://www.youtube.com/channel/UCcA274LONuELr...,3630
32,SEVDALIZA - ALIBI FT. PABLLO VITTAR & YSEULT (...,SEVDALIZA - ALIBI FT. PABLLO VITTAR & YSEULT (...,Stream Alibi: https://lnkfi.re/ALIBI_\n\nWritt...,236500879,Music,,193,3:13,False,https://i.ytimg.com/vi_webp/qVqFuokjRMc/maxres...,Sevdaliza,https://www.youtube.com/channel/UCONnf7g7QfOqz...,1230000
35,squabble up,squabble up,Kendrick Lamar “GNX” is available now: https:/...,70965585,People & Blogs,,166,2:46,False,https://i.ytimg.com/vi_webp/fuV4yQWdn_4/maxres...,Kendrick Lamar,https://www.youtube.com/channel/UC3lBXcrKFnFAF...,19700000
44,David Guetta & Sia - Beautiful People (Officia...,David Guetta & Sia - Beautiful People (Officia...,"Add ""Beautiful People” by David Guetta & Sia t...",41075369,Music,,204,3:24,False,https://i.ytimg.com/vi/S2fSojJqyNY/maxresdefau...,David Guetta,https://www.youtube.com/channel/UC1l7wYrva1qCH...,27600000


In [80]:
# ways to fix missing values
df['tags']=df['tags'].fillna(0)

df[df['title']=="Zillionaire Doe - How It's Going (Official Video)"] # tags assigned to 0

Unnamed: 0,title,full_title,description,view_count,categories,tags,duration,duration_string,live_status,thumbnail,channel_name,channel_url,channel_follower_count
10,Zillionaire Doe - How It's Going (Official Video),Zillionaire Doe - How It's Going (Official Video),"Stream ""How It's Going"": https://zillionairedo...",1198540,People & Blogs,0,152,2:32,False,https://i.ytimg.com/vi/PDcDcSTiL4g/maxresdefau...,Zillionaire Doe,https://www.youtube.com/channel/UCbVdo2e1hDYR9...,142000


In [81]:
# check duplicates
df.duplicated() # it does not really show the full dataset so use below

df[df.duplicated()==1] # fitler for only duplicates

Unnamed: 0,title,full_title,description,view_count,categories,tags,duration,duration_string,live_status,thumbnail,channel_name,channel_url,channel_follower_count


In [82]:
# get rid of official video part in  title
df['title'] = df['title'].str.replace('(Official Video)', '')

df.head()

Unnamed: 0,title,full_title,description,view_count,categories,tags,duration,duration_string,live_status,thumbnail,channel_name,channel_url,channel_follower_count
0,ROSÉ & Bruno Mars - APT. (Official Music Video),ROSÉ & Bruno Mars - APT. (Official Music Video),ROSÉ & Bruno Mars - APT.\nDownload/stream: ht...,2009014557,Music,YG Entertainment;YG;와이지;K-pop;BLACKPINK;블랙핑크;블...,173,2:53,False,https://i.ytimg.com/vi_webp/ekr2nIex040/maxres...,ROSÉ,https://www.youtube.com/channel/UCBo1hnzxV9rz3...,19200000
1,"Lady Gaga, Bruno Mars - Die With A Smile (Offi...","Lady Gaga, Bruno Mars - Die With A Smile (Offi...",MAYHEM OUT NOW\nhttp://ladygaga.com \n \nListe...,1324833300,Music,Lady Gaga;Bruno Mars;Interscope;Pop,252,4:12,False,https://i.ytimg.com/vi/kPa7bsKwL-c/maxresdefau...,Lady Gaga,https://www.youtube.com/channel/UC07Kxew-cMIay...,29600000
2,Reneé Rapp - Leave Me Alone (Official Music Vi...,Reneé Rapp - Leave Me Alone (Official Music Vi...,"Listen to “BITE ME”, the new album from Reneé ...",2536628,Music,Reneé Rapp;Interscope Records;Pop,160,2:40,False,https://i.ytimg.com/vi/tiPWzFLiz4A/maxresdefau...,Reneé Rapp,https://www.youtube.com/channel/UCZy4ki_L4bzw9...,408000
3,Billie Eilish - BIRDS OF A FEATHER (Official M...,Billie Eilish - BIRDS OF A FEATHER (Official M...,Listen to HIT ME HARD AND SOFT: https://billie...,558329099,Music,Billie Eilish;Darkroom/Interscope Records;Alte...,231,3:51,False,https://i.ytimg.com/vi/V9PVRfjEBTI/maxresdefau...,Billie Eilish,https://www.youtube.com/channel/UCDGmojLIoWpXo...,56800000
4,Reneé Rapp - Mad (Official Music Video),Reneé Rapp - Mad (Official Music Video),"Listen to “BITE ME”, the new album from Reneé ...",2113548,Music,Reneé Rapp;Interscope Records;Pop,180,3:00,False,https://i.ytimg.com/vi/xkWQM3flsiY/maxresdefau...,Reneé Rapp,https://www.youtube.com/channel/UCZy4ki_L4bzw9...,408000


In [83]:
# get rid of all strings that we don't want
bad_strings = ['()','(Official Video)', 'Official Music Video', '(Official Audio)']

for b in bad_strings:
    df['title'] = df['title'].str.replace(b, '', regex=False)
df['title']

0                          ROSÉ & Bruno Mars - APT. ()
1           Lady Gaga, Bruno Mars - Die With A Smile ()
2                        Reneé Rapp - Leave Me Alone ()
3                 Billie Eilish - BIRDS OF A FEATHER ()
4                                   Reneé Rapp - Mad ()
                            ...                        
95    Ariana Grande - twilight zone (Official Lyric ...
96             Gracie Abrams - I Love You, I’m Sorry ()
97              HoodTrophy Bino ft. FCG Heem - Miami ()
98                 Sean Paul x INNA - Let It Talk To Me
99    Selena Gomez, benny blanco - How Does It Feel ...
Name: title, Length: 100, dtype: object

In [84]:
# drop thumbnail col
df=df.drop(['thumbnail','channel_url'], axis=1)

In [85]:
# split up the full title into music name and (official.... ) part then create a new column for the second part
df['full_title'].str.split(pat='(')
df['Extra']=df['full_title'].str.split(pat='(').str[1]
df['Extra']

0     Official Music Video)
1     Official Music Video)
2     Official Music Video)
3     Official Music Video)
4     Official Music Video)
              ...          
95    Official Lyric Video)
96    Official Music Video)
97    Official Music Video)
98                      NaN
99    Official Lyric Video)
Name: Extra, Length: 100, dtype: object

In [86]:
df['Extra']=df['Extra'].str.split(pat=')').str[0]
df['Extra']

0     Official Music Video
1     Official Music Video
2     Official Music Video
3     Official Music Video
4     Official Music Video
              ...         
95    Official Lyric Video
96    Official Music Video
97    Official Music Video
98                     NaN
99    Official Lyric Video
Name: Extra, Length: 100, dtype: object

In [87]:
df['full_title']=df['full_title'].str.split(pat='(').str[0]
df['full_title']

0                            ROSÉ & Bruno Mars - APT. 
1             Lady Gaga, Bruno Mars - Die With A Smile 
2                          Reneé Rapp - Leave Me Alone 
3                   Billie Eilish - BIRDS OF A FEATHER 
4                                     Reneé Rapp - Mad 
                            ...                        
95                       Ariana Grande - twilight zone 
96               Gracie Abrams - I Love You, I’m Sorry 
97                HoodTrophy Bino ft. FCG Heem - Miami 
98                 Sean Paul x INNA - Let It Talk To Me
99    Selena Gomez, benny blanco - How Does It Feel ...
Name: full_title, Length: 100, dtype: object

In [88]:
df.isnull().any() # columns that have null values
df[df['Extra'].isnull()==1]
# handle missing values
df['Extra']=df['Extra'].fillna(0)

In [89]:
# check datatypes
df.dtypes

title                     object
full_title                object
description               object
view_count                 int64
categories                object
tags                      object
duration                   int64
duration_string           object
live_status                 bool
channel_name              object
channel_follower_count     int64
Extra                     object
dtype: object

In [90]:
# chaning datatypes
# df=df.astype({'column_name':'data_type'})