## Data Cleaning Notebook

##### Import Libraries

In [60]:
import pandas as pd

##### Import Dataset (Scraped Data)

In [61]:
# Location of the scraped playlist CSV, relative to the notebook's working directory.
# Forward slashes are used on purpose: they resolve on Windows as well as on
# Linux/macOS, whereas a backslash-separated path only works on Windows.
data_directory = "../data/scraper_data/playlist_output.csv"
df = pd.read_csv(data_directory)
# Print the column summary (dtypes and non-null counts), then preview the first rows.
df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2246 entries, 0 to 2245
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   season          2246 non-null   int64  
 1   title           2246 non-null   object 
 2   episode_number  0 non-null      float64
 3   episode_name    0 non-null      float64
 4   date            0 non-null      float64
 5   length          2246 non-null   object 
 6   url             2246 non-null   object 
dtypes: float64(3), int64(1), object(3)
memory usage: 123.0+ KB


Unnamed: 0,season,title,episode_number,episode_name,date,length,url
0,1,DRŽAVNI POSAO [HQ] - Ep.1: Rakija (24.09.2012.),,,,5:50,https://www.youtube.com/watch?v=McwPB-eQ2BY&li...
1,1,DRŽAVNI POSAO [HQ] - Ep.2: Pilići (25.09.2012.),,,,5:35,https://www.youtube.com/watch?v=hI2t9a4ED04&li...
2,1,DRŽAVNI POSAO [HQ] - Ep.3: Biti slobodan (26.0...,,,,4:35,https://www.youtube.com/watch?v=0YlkzhkLXPg&li...


##### Extract Episode Number

In [62]:
# Capture whatever sits between the "Ep." marker and the following colon,
# ignoring surrounding whitespace. Titles that do not match yield NaN.
episode_number_pattern = r'Ep\.\s*(.*?)\s*:'
# expand=False returns the single capture group as a Series rather than a
# one-column DataFrame; the values written to the column are the same.
df['episode_number'] = df['title'].str.extract(episode_number_pattern, expand=False)
df.head(3)

Unnamed: 0,season,title,episode_number,episode_name,date,length,url
0,1,DRŽAVNI POSAO [HQ] - Ep.1: Rakija (24.09.2012.),1,,,5:50,https://www.youtube.com/watch?v=McwPB-eQ2BY&li...
1,1,DRŽAVNI POSAO [HQ] - Ep.2: Pilići (25.09.2012.),2,,,5:35,https://www.youtube.com/watch?v=hI2t9a4ED04&li...
2,1,DRŽAVNI POSAO [HQ] - Ep.3: Biti slobodan (26.0...,3,,,4:35,https://www.youtube.com/watch?v=0YlkzhkLXPg&li...


##### Extract Episode Name

In [63]:
# Capture the text between the first colon and the next opening parenthesis,
# with surrounding whitespace trimmed. Titles that do not match yield NaN.
episode_name_pattern = r':\s*(.*?)\s*\('
# expand=False returns the single capture group as a Series rather than a
# one-column DataFrame; the values written to the column are the same.
df['episode_name'] = df['title'].str.extract(episode_name_pattern, expand=False)
df.head(3)

Unnamed: 0,season,title,episode_number,episode_name,date,length,url
0,1,DRŽAVNI POSAO [HQ] - Ep.1: Rakija (24.09.2012.),1,Rakija,,5:50,https://www.youtube.com/watch?v=McwPB-eQ2BY&li...
1,1,DRŽAVNI POSAO [HQ] - Ep.2: Pilići (25.09.2012.),2,Pilići,,5:35,https://www.youtube.com/watch?v=hI2t9a4ED04&li...
2,1,DRŽAVNI POSAO [HQ] - Ep.3: Biti slobodan (26.0...,3,Biti slobodan,,4:35,https://www.youtube.com/watch?v=0YlkzhkLXPg&li...


##### Extract Episode Release Date

In [64]:
# Capture the contents of the first parenthesised group in the title, with
# surrounding whitespace trimmed. The value is kept as text exactly as it
# appears in the title (it is not parsed into a datetime here).
# NOTE(review): assumes the first "(...)" in a title is always the release date - confirm.
release_date_pattern = r'\(\s*(.*?)\s*\)'
# expand=False returns the single capture group as a Series rather than a
# one-column DataFrame; the values written to the column are the same.
df['date'] = df['title'].str.extract(release_date_pattern, expand=False)
df.head(3)

Unnamed: 0,season,title,episode_number,episode_name,date,length,url
0,1,DRŽAVNI POSAO [HQ] - Ep.1: Rakija (24.09.2012.),1,Rakija,24.09.2012.,5:50,https://www.youtube.com/watch?v=McwPB-eQ2BY&li...
1,1,DRŽAVNI POSAO [HQ] - Ep.2: Pilići (25.09.2012.),2,Pilići,25.09.2012.,5:35,https://www.youtube.com/watch?v=hI2t9a4ED04&li...
2,1,DRŽAVNI POSAO [HQ] - Ep.3: Biti slobodan (26.0...,3,Biti slobodan,26.09.2012.,4:35,https://www.youtube.com/watch?v=0YlkzhkLXPg&li...


##### Make Episodes Dataframe

In [65]:
# Literal marker that identifies a regular episode in the video title.
episode_keys = 'Ep.'
# regex=False matches the marker literally. By default str.contains treats the
# pattern as a regular expression, where "." matches ANY character, so a title
# merely containing e.g. "Epi" would be misclassified as an episode.
# na=False makes a missing title count as "not an episode" instead of putting
# NaN into the boolean mask.
is_episode = df['title'].str.contains(episode_keys, regex=False, na=False)
# .copy() yields an independent frame, so later edits to episodes_df neither
# touch df nor raise SettingWithCopyWarning.
episodes_df = df[is_episode].copy()
# Show the row count explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print(episodes_df.shape[0])
episodes_df.head(3)

Unnamed: 0,season,title,episode_number,episode_name,date,length,url
0,1,DRŽAVNI POSAO [HQ] - Ep.1: Rakija (24.09.2012.),1,Rakija,24.09.2012.,5:50,https://www.youtube.com/watch?v=McwPB-eQ2BY&li...
1,1,DRŽAVNI POSAO [HQ] - Ep.2: Pilići (25.09.2012.),2,Pilići,25.09.2012.,5:35,https://www.youtube.com/watch?v=hI2t9a4ED04&li...
2,1,DRŽAVNI POSAO [HQ] - Ep.3: Biti slobodan (26.0...,3,Biti slobodan,26.09.2012.,4:35,https://www.youtube.com/watch?v=0YlkzhkLXPg&li...


##### Make Bonus Content Dataframe

In [68]:
# Bonus content is every row whose title does NOT carry the episode marker.
# regex=False matches the marker literally; by default str.contains treats it
# as a regular expression, where "." matches any character, so titles merely
# containing e.g. "Epi" would be wrongly excluded from the bonus content.
# na=False keeps the mask strictly boolean, which makes the "~" negation safe
# and sends rows with a missing title to the bonus-content side.
is_episode = df["title"].str.contains(episode_keys, regex=False, na=False)
# .copy() yields an independent frame, so later edits to bonus_content_df
# neither touch df nor raise SettingWithCopyWarning.
bonus_content_df = df[~is_episode].copy()
# Show the row count explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print(bonus_content_df.shape[0])
bonus_content_df.head(3)


Unnamed: 0,season,title,episode_number,episode_name,date,length,url
70,1,"DRŽAVNI POSAO [HQ] - Novogodišnji specijal, 1....",,,31.12.2012.,7:06,https://www.youtube.com/watch?v=8WHXAN-KXwo&li...
71,1,"DRŽAVNI POSAO [HQ] - Novogodišnji specijal, 2....",,,31.12.2012.,6:45,https://www.youtube.com/watch?v=35uRGPEoPf8&li...
72,1,"DRŽAVNI POSAO [HQ] - Novogodišnji specijal, 3....",,,31.12.2012.,5:38,https://www.youtube.com/watch?v=_GPrsTkpuaw&li...
