In [1]:
import pandas as pd

## Criar DataFrame com base no dataset

In [7]:
# Load the Netflix daily top-10 dataset from its CSV file into a DataFrame.
netflix_csv_path = 'dataset/netflix_daily_top_10.csv'
df_netflix = pd.read_csv(netflix_csv_path)
# Bare last expression so the notebook renders the (truncated) frame.
df_netflix

Unnamed: 0,As of,Rank,Year to Date Rank,Last Week Rank,Title,Type,Netflix Exclusive,Netflix Release Date,Days In Top 10,Viewership Score
0,2020-04-01,1,1,1,"Tiger King: Murder, Mayhem …",TV Show,Yes,"Mar 20, 2020",9,90
1,2020-04-01,2,2,-,Ozark,TV Show,Yes,"Jul 21, 2017",5,45
2,2020-04-01,3,3,2,All American,TV Show,,"Mar 28, 2019",9,76
3,2020-04-01,4,4,-,Blood Father,Movie,,"Mar 26, 2020",5,30
4,2020-04-01,5,5,4,The Platform,Movie,Yes,"Mar 20, 2020",9,55
...,...,...,...,...,...,...,...,...,...,...
7095,2022-03-11,6,5,1,Worst Roommate Ever,TV Show,Yes,"Mar 1, 2022",10,81
7096,2022-03-11,7,7,2,Vikings: Valhalla,TV Show,Yes,"Feb 25, 2022",14,100
7097,2022-03-11,8,8,-,Shooter,Movie,,"Aug 1, 2014",3,7
7098,2022-03-11,9,9,7,Shrek 2,Movie,,"Mar 1, 2022",10,33


In [None]:
# Show a summary of the DataFrame: index range, column names,
# non-null counts, dtypes and memory usage.
df_netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7100 entries, 0 to 7099
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   As of                 7100 non-null   object
 1   Rank                  7100 non-null   int64 
 2   Year to Date Rank     7100 non-null   object
 3   Last Week Rank        7100 non-null   object
 4   Title                 7100 non-null   object
 5   Type                  7100 non-null   object
 6   Netflix Exclusive     4599 non-null   object
 7   Netflix Release Date  7100 non-null   object
 8   Days In Top 10        7100 non-null   int64 
 9   Viewership Score      7100 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 554.8+ KB


In [8]:
# Inspect the dtype of each column to see which ones still need conversion
# (dates and rank columns are loaded as generic `object`).
df_netflix.dtypes

As of                   object
Rank                     int64
Year to Date Rank       object
Last Week Rank          object
Title                   object
Type                    object
Netflix Exclusive       object
Netflix Release Date    object
Days In Top 10           int64
Viewership Score         int64
dtype: object

In [10]:
# Period covered by the analysis.
# Parse 'As of' into datetimes, store the parsed column back on the frame,
# then display the earliest and latest dates as a (min, max) tuple.
as_of_dates = pd.to_datetime(df_netflix['As of'])
df_netflix['As of'] = as_of_dates
as_of_dates.min(), as_of_dates.max()

(Timestamp('2020-04-01 00:00:00'), Timestamp('2022-03-11 00:00:00'))

In [11]:
# Count missing values per column (`isna` is the same check as `isnull`).
df_netflix.isna().sum()

As of                      0
Rank                       0
Year to Date Rank          0
Last Week Rank             0
Title                      0
Type                       0
Netflix Exclusive       2501
Netflix Release Date       0
Days In Top 10             0
Viewership Score           0
dtype: int64

In [13]:
# Handle missing values in 'Netflix Exclusive': every null becomes the
# string 'No', so the column ends up with no missing entries.
exclusive_flag = df_netflix['Netflix Exclusive']
df_netflix['Netflix Exclusive'] = exclusive_flag.fillna('No')
# Re-check the per-column null counts to confirm nothing is missing anymore.
df_netflix.isna().sum()

As of                   0
Rank                    0
Year to Date Rank       0
Last Week Rank          0
Title                   0
Type                    0
Netflix Exclusive       0
Netflix Release Date    0
Days In Top 10          0
Viewership Score        0
dtype: int64

In [None]:
# Descriptive statistics (count, mean, min, quartiles, max, std) for the
# datetime and numeric columns; used here as a first look for outliers.
df_netflix.describe()


Unnamed: 0,As of,Rank,Days In Top 10,Viewership Score
count,7100,7100.0,7100.0,7100.0
mean,2021-03-21 12:00:00.000000256,5.5,24.123662,122.790141
min,2020-04-01 00:00:00,1.0,1.0,1.0
25%,2020-09-25 00:00:00,3.0,3.0,19.0
50%,2021-03-21 12:00:00,5.5,7.0,50.0
75%,2021-09-15 00:00:00,8.0,18.0,128.0
max,2022-03-11 00:00:00,10.0,428.0,1474.0
std,,2.872484,58.473789,213.861642


In [16]:
# Turn 'Year to Date Rank' and 'Last Week Rank' into integer columns.
# Steps: swap the '-' placeholder for a missing value, parse everything as
# numeric (errors='coerce' turns any value that cannot be parsed into NaN),
# fill the missing values with 0 and cast to int.
# NOTE: after this cell a rank of 0 is a sentinel for "no rank available"
# (originally '-'), not a real ranking position.
rank_columns = ['Year to Date Rank', 'Last Week Rank']
ranks_numeric = (
    df_netflix[rank_columns]
    .replace('-', pd.NA)
    .apply(pd.to_numeric, errors='coerce')
)
df_netflix[rank_columns] = ranks_numeric.fillna(0).astype(int)
# Display the dtypes to confirm both rank columns are now integers.
df_netflix.dtypes

As of                   datetime64[ns]
Rank                             int64
Year to Date Rank                int64
Last Week Rank                   int64
Title                           object
Type                            object
Netflix Exclusive               object
Netflix Release Date            object
Days In Top 10                   int64
Viewership Score                 int64
dtype: object

In [17]:
# Preview the first rows to check the cleaned columns
# (filled 'Netflix Exclusive', integer rank columns).
df_netflix.head()

Unnamed: 0,As of,Rank,Year to Date Rank,Last Week Rank,Title,Type,Netflix Exclusive,Netflix Release Date,Days In Top 10,Viewership Score
0,2020-04-01,1,1,1,"Tiger King: Murder, Mayhem …",TV Show,Yes,"Mar 20, 2020",9,90
1,2020-04-01,2,2,0,Ozark,TV Show,Yes,"Jul 21, 2017",5,45
2,2020-04-01,3,3,2,All American,TV Show,No,"Mar 28, 2019",9,76
3,2020-04-01,4,4,0,Blood Father,Movie,No,"Mar 26, 2020",5,30
4,2020-04-01,5,5,4,The Platform,Movie,Yes,"Mar 20, 2020",9,55


In [20]:
import numpy as np

# Compatibility shim, applied before importing sweetviz: NumPy 2.0 removed
# `np.VisibleDeprecationWarning` from the top-level namespace (it now lives in
# `numpy.exceptions`). Re-expose the *real* class under the old name rather
# than the builtin DeprecationWarning: the NumPy class derives from
# UserWarning, so aliasing it to DeprecationWarning would make any warning
# filter keyed on this name match the wrong category.
if not hasattr(np, 'VisibleDeprecationWarning'):
    from numpy.exceptions import VisibleDeprecationWarning as _np_visible_deprecation_warning
    np.VisibleDeprecationWarning = _np_visible_deprecation_warning

# Imported only after the shim so the legacy attribute already exists.
import sweetviz as sv

# Build the sweetviz report object for the cleaned frame, using
# 'Days In Top 10' as the target feature.
sv_netflix = sv.analyze(df_netflix, target_feat='Days In Top 10')

                                             |          | [  0%]   00:00 -> (? left)

In [21]:
# Write the sweetviz report to 'sweetviz_netflix.html' (the library may also
# try to open it in a web browser).
sv_netflix.show_html('sweetviz_netflix.html')

Report sweetviz_netflix.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
