In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the Netflix titles CSV from disk and preview the first rows
csv_path = r"C:\Users\jaroo\OneDrive\Documents\netflix_titles.csv"  # <change the file path>
df = pd.read_csv(csv_path)
df.head()

### Checking data

In [None]:
# Check the row count
# (`shape` is a (rows, columns) tuple, so the column count is displayed too)
df.shape

In [None]:
# To see high level data details
# (per-column non-null counts and dtypes, plus memory usage)
df.info()

In [None]:
# Count the missing values in every column, largest count first
null_counts = df.isna().sum()
null_counts.sort_values(ascending=False)

### Handling manual input errors

In [None]:
# Show the rows whose duration is missing (candidates for manual input errors)
duration_missing = df['duration'].isna()
df.loc[duration_missing]

In [None]:
# Correcting manual input: for these rows the duration value was entered in
# the rating column, so move it across and mark the rating as unknown.
# Only rows whose duration is still missing are touched; without that guard,
# re-running this cell would copy the 'Unknown' rating into duration.
misplaced = (df['director'] == 'Louis C.K.') & df['duration'].isna()
df.loc[misplaced, 'duration'] = df.loc[misplaced, 'rating']
df.loc[misplaced, 'rating'] = 'Unknown'
df[df['director'] == 'Louis C.K.']

### Handling null values

In [None]:
# Percentage of missing values per column, highest first
null_pct = (df.isna().sum() / len(df) * 100).round(2)
null_pct.sort_values(ascending=False)

In [None]:
# Drop the rows that have no rating or no date_added
# (columns where only a small percentage of values is null)
df.dropna(axis='index', subset=['rating', 'date_added'], inplace=True)
df.shape

In [None]:
# Percentage of missing values per column, highest first
(df.isna().sum() / len(df) * 100).round(2).sort_values(ascending=False)

In [None]:
# Replace NaN values with default values.
# Assign the filled column back instead of calling an in-place method on
# `df[col]`: the in-place form acts on a temporary selection, which recent
# pandas versions warn about and which changes nothing under Copy-on-Write.
df['country'] = df['country'].fillna('United States')
df['director'] = df['director'].fillna('Unknown')
df['cast'] = df['cast'].fillna('Unknown')
df.isnull().sum().sort_values(ascending=False)

### Handling duplicate data

In [None]:
# Dropping duplicates
# (no subset is given, so rows are compared on every column; with the default
# `keep='first'` the first occurrence of each duplicate is retained)
df.drop_duplicates(inplace=True)
df.shape

### Changing column type by cleaning column data

In [None]:
# Remove the unit suffix (" min", " Season" or " Seasons") from duration so
# that only the number is left; the values are still strings at this point.
# One anchored, vectorised regex replaces three Python-level `apply` passes,
# leaves missing values untouched instead of raising, and no longer deletes
# every "s" character in the value.
df['duration'] = df['duration'].str.replace(r'\s*(?:min|Seasons?)\s*$', '', regex=True)
df.head()

### Adding columns

In [None]:
# Add new columns for time analysis.
# Parse date_added once and reuse it for both columns. Text values are
# stripped first because pandas >= 2.0 infers a single format from the first
# value and raises on entries that deviate from it (e.g. stray whitespace).
# The dtype check keeps the cell working when date_added is already datetime.
added_on = df['date_added']
if not pd.api.types.is_datetime64_any_dtype(added_on):
    added_on = pd.to_datetime(added_on.str.strip())
df['date_added_year'] = added_on.dt.year
df['date_added_month'] = added_on.dt.month

### Changing data types

In [None]:
# Correcting data types.
# Assign the converted column by name: `df.loc[:, ['duration']] = ...` writes
# the integers into the existing column in place on pandas >= 2.0, so the
# column would keep its original object dtype.
df['duration'] = df['duration'].astype('int64')

# Strip whitespace before parsing (pandas >= 2.0 rejects values that do not
# match the format inferred from the first entry) and skip the conversion
# when the column is already datetime so the cell can be re-run safely.
if not pd.api.types.is_datetime64_any_dtype(df['date_added']):
    df['date_added'] = pd.to_datetime(df['date_added'].str.strip())

In [None]:
# Re-inspect the non-null counts and dtypes of every column
df.info()

In [None]:
# Summary statistics; with its default arguments `describe` leaves out the
# object (text) columns
df.describe()

### Creating subset data

In [None]:
# Build a show_id -> director mapping table: split the comma-separated
# director string and emit one row per individual director
director_s = (
    df[['show_id', 'director']]
    .assign(director=lambda frame: frame['director'].str.split(', '))
    .explode('director')
)
print(director_s)

In [None]:
# Build a show_id -> cast member mapping table: split the comma-separated
# cast string and emit one row per individual cast member
cast_s = (
    df[['show_id', 'cast']]
    .assign(cast=lambda frame: frame['cast'].str.split(', '))
    .explode('cast')
)
print(cast_s)

In [None]:
# Build a show_id -> genre mapping table: split the comma-separated
# listed_in string and emit one row per individual genre
genre_s = (
    df[['show_id', 'listed_in']]
    .assign(listed_in=lambda frame: frame['listed_in'].str.split(', '))
    .explode('listed_in')
)
print(genre_s)

In [None]:
# Build a show_id -> country mapping table with one row per country.
# Some country values carry a stray leading or trailing ", " (for example
# ", South Korea" or "United Kingdom,"). Stripping commas and spaces from both
# ends covers every such value instead of a hard-coded list, and avoids
# in-place `replace` calls on a column of a selection, which change nothing
# under pandas Copy-on-Write.
country_s = (
    df[['show_id', 'country']]
    .assign(country=lambda frame: frame['country'].str.strip(', ').str.split(', '))
    .explode('country')
)
print(country_s)

### Python script to load data into PBI

In [None]:
import pandas as pd
import numpy as np


def split_to_rows(frame, column, strip_chars=None):
    """Return a show_id -> value mapping table for a comma-separated column.

    Parameters
    ----------
    frame : DataFrame holding a ``show_id`` column and ``column``.
    column : name of the column whose values are ", "-separated strings.
    strip_chars : optional characters to strip from both ends of every value
        before splitting (used to drop stray leading/trailing separators).

    Returns
    -------
    DataFrame with the columns ``show_id`` and ``column`` and one row per
    individual value; the original row index is repeated for each value.
    """
    values = frame[column]
    if strip_chars is not None:
        values = values.str.strip(strip_chars)
    return frame[['show_id']].assign(**{column: values.str.split(', ')}).explode(column)


# Read from saved file
df = pd.read_csv(r"C:\Users\jaroo\OneDrive\Documents\netflix_titles.csv") # <change the file path>

# Correcting manual input: the duration value was entered in the rating column
# for these rows. Only rows whose duration is still missing are touched.
misplaced = (df['director'] == 'Louis C.K.') & df['duration'].isna()
df.loc[misplaced, 'duration'] = df.loc[misplaced, 'rating']
df.loc[misplaced, 'rating'] = 'Unknown'

# Drop rows with a missing rating/date_added, fill the remaining gaps with
# default values and remove exact duplicate rows. Each step returns a new
# frame, so no chained in-place call on a column selection is involved.
df = (
    df.dropna(subset=['rating', 'date_added'])
      .fillna({'country': 'United States', 'director': 'Unknown', 'cast': 'Unknown'})
      .drop_duplicates()
)

# Removing the unit suffix from duration and correcting data types.
# The converted columns are assigned by name so the new dtypes take effect;
# date strings are stripped first because pandas >= 2.0 rejects values that do
# not match the format inferred from the first entry.
df = df.assign(
    duration=df['duration'].str.replace(r'\s*(?:min|Seasons?)\s*$', '', regex=True).astype('int64'),
    date_added=pd.to_datetime(df['date_added'].str.strip()),
)

# Add new columns for time analysis
df = df.assign(
    date_added_year=df['date_added'].dt.year,
    date_added_month=df['date_added'].dt.month,
)


# Mapping tables

# Creating a new list of directors with showid
director_s = split_to_rows(df, 'director')

# Creating a new list of cast with showid
cast_s = split_to_rows(df, 'cast')

# Creating a new list of genre with showid
genre_s = split_to_rows(df, 'listed_in')

# Creating a new list of country with showid
# (stray leading/trailing ", " separators are stripped before splitting)
country_s = split_to_rows(df, 'country', strip_chars=', ')


### Validate Data

In [None]:
# Number of titles per release year (2007 onwards), most recent year first
df1 = (
    df[df['release_year'] >= 2007]
    .groupby('release_year')['show_id']
    .count()
    .reset_index()
    .sort_values(by='release_year', ascending=False)
)
print(df1)

In [None]:
# Content Released By Year/Type
df2 = df.query('release_year >=2007')
df2 = df2.groupby (['type','release_year']) ['show_id'].count().reset_index()
df2 = df2.sort_values(by =['release_year'], ascending =False)
print(df2)

In [None]:
# Contents By Rating
df3 = df.rating.value_counts()
print(df3)

In [None]:
# Contents By Rating/Type
df4=df
df4 = df4.groupby (['type','rating']) ['show_id'].count().reset_index()
df4 = df4.sort_values(by =['show_id'], ascending =False)
print(df4)

In [None]:
# Top 5 directors by number of titles, ignoring the 'Unknown' placeholder.
# NOTE(review): titles are counted per (type, director) pair, so a director
# with both movies and TV shows is ranked separately for each type - confirm
# that this is the intended "overall" ranking.

# One row per (show, director): split the comma-separated director string
ddr = (
    df[['show_id', 'director', 'type']]
    .assign(director=lambda frame: frame['director'].str.split(', '))
    .explode('director')
    [['show_id', 'type', 'director']]
)
df22 = (
    ddr.groupby(['type', 'director'])['show_id']
    .count()
    .reset_index()
    .sort_values('show_id')
    .loc[lambda counts: counts['director'] != 'Unknown']
    .sort_values(by='show_id', ascending=False)
    .head(5)
)
print(df22)

In [None]:
# Top 5 TV show directors by number of titles, ignoring the 'Unknown' placeholder

# One row per (show, director): split the comma-separated director string
ddr = (
    df[['show_id', 'director', 'type']]
    .assign(director=lambda frame: frame['director'].str.split(', '))
    .explode('director')
    [['show_id', 'type', 'director']]
)
df22 = (
    ddr.groupby(['type', 'director'])['show_id']
    .count()
    .reset_index()
    .sort_values('show_id')
    .loc[lambda counts: counts['director'] != 'Unknown']
    .loc[lambda counts: counts['type'] == 'TV Show']
    .sort_values(by='show_id', ascending=False)
    .head(5)
)
print(df22)