# Introdução 
### por Iury Rosal



In [1]:
!pip install polars



In [2]:
import polars as pl

# Estruturas de Dados

## Series

In [3]:
s = pl.Series("a", [1, 2, 3])
s

a
i64
1
2
3


In [4]:
s.dtype

Int64

In [5]:
s.shape

(3,)

In [6]:
s2 = pl.Series("a", [1, 2, 3], dtype=pl.Float32)
s2

a
f32
1.0
2.0
3.0


In [7]:
import numpy as np
array_np = np.array([1, 2, 3, 4, 5])
pl.Series("array_np", array_np)

array_np
i32
1
2
3
4
5


## Dataframe

In [8]:
data = {"a": [1, 2], "b": [3, 4]}
df = pl.DataFrame(data)
df

a,b
i64,i64
1,3
2,4


In [9]:
df2 = pl.DataFrame(data, schema={"a": pl.Float32, "b": pl.Int64})
df2

a,b
f32,i64
1.0,3
2.0,4


In [10]:
df3 = pl.DataFrame(data, schema=[("col1", pl.Float32), ("col2", pl.Int64)])
df3

col1,col2
f32,i64
1.0,3
2.0,4


In [11]:
data = [
    pl.Series("col1", [1, 2], dtype=pl.Float32),
    pl.Series("col2", [3, 4], dtype=pl.Int64),
]
df4 = pl.DataFrame(data)
df4

col1,col2
f32,i64
1.0,3
2.0,4


In [12]:
data = np.array([(1, 2), (3, 4)], dtype=np.int64)
df5 = pl.DataFrame(data, schema=["a", "b"], orient="col")
df5

a,b
i64,i64
1,3
2,4


In [13]:
data = [[1, 2, 3], [4, 5, 6]]
df6 = pl.DataFrame(data, schema=["a", "b", "c"])
df6

a,b,c
i64,i64,i64
1,2,3
4,5,6


In [14]:
df6.dtypes

[Int64, Int64, Int64]

In [15]:
df6.shape

(2, 3)

In [16]:
print(df6)

shape: (2, 3)
┌─────┬─────┬─────┐
│ a   ┆ b   ┆ c   │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ 1   ┆ 2   ┆ 3   │
│ 4   ┆ 5   ┆ 6   │
└─────┴─────┴─────┘


In [17]:
df6.schema

{'a': Int64, 'b': Int64, 'c': Int64}

# Lendo de um arquivo CSV

In [18]:
dataframe = pl.read_csv("top-300-youtube-channels.csv")
dataframe

Unnamed: 0_level_0,Rank,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,i64,str,i64,i64,i64,str,i64
0,1,"""T-Series""",237000000,216495600668,18831,"""Music""",2006
1,2,"""Cocomelon - Nu...",154000000,152638999634,861,"""Education""",2006
2,3,"""SET India""",152000000,140138068504,105649,"""Film & Animati...",2006
3,4,"""Sony SAB""",77500000,92952274861,65028,"""Film & Animati...",2007
4,5,"""✿ Kids Diana S...",108000000,88452629066,1070,"""People & Blogs...",2015
5,6,"""Like Nastya""",104000000,88060349741,762,"""People & Blogs...",2016
6,7,"""WWE""",93500000,74447865775,66901,"""Sports""",2007
7,8,"""Vlad and Niki""",93900000,73333582362,530,"""Entertainment""",2018
8,9,"""Movieclips""",58700000,58923017461,40063,"""Film & Animati...",2006
9,10,"""Colors TV""",60200000,58056997206,104523,"""Film & Animati...",2008


# Informações Iniciais do Dataframe

In [19]:
dataframe.head(5)

Unnamed: 0_level_0,Rank,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,i64,str,i64,i64,i64,str,i64
0,1,"""T-Series""",237000000,216495600668,18831,"""Music""",2006
1,2,"""Cocomelon - Nu...",154000000,152638999634,861,"""Education""",2006
2,3,"""SET India""",152000000,140138068504,105649,"""Film & Animati...",2006
3,4,"""Sony SAB""",77500000,92952274861,65028,"""Film & Animati...",2007
4,5,"""✿ Kids Diana S...",108000000,88452629066,1070,"""People & Blogs...",2015


In [20]:
dataframe.limit(5)

Unnamed: 0_level_0,Rank,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,i64,str,i64,i64,i64,str,i64
0,1,"""T-Series""",237000000,216495600668,18831,"""Music""",2006
1,2,"""Cocomelon - Nu...",154000000,152638999634,861,"""Education""",2006
2,3,"""SET India""",152000000,140138068504,105649,"""Film & Animati...",2006
3,4,"""Sony SAB""",77500000,92952274861,65028,"""Film & Animati...",2007
4,5,"""✿ Kids Diana S...",108000000,88452629066,1070,"""People & Blogs...",2015


In [21]:
dataframe.tail(5)

Unnamed: 0_level_0,Rank,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,i64,str,i64,i64,i64,str,i64
295,296,"""Jake Fellman""",15000000,11618626248,440,"""Entertainment""",2012
296,297,"""DisneyJuniorUK...",11300000,11586962179,2789,"""Entertainment""",2010
297,298,"""JTBC Entertain...",7580000,11512332695,76838,"""Entertainment""",2012
298,299,"""Mnet TV""",9040000,11442069884,21617,"""Entertainment""",2013
299,300,"""GMA News""",12900000,11387663843,345736,"""News & Politic...",2007


In [22]:
dataframe.describe()

describe,column_0,Rank,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
str,f64,f64,str,f64,f64,f64,str,f64
"""count""",296.0,296.0,"""296""",296.0,296.0,296.0,"""296""",296.0
"""null_count""",0.0,0.0,"""0""",0.0,0.0,0.0,"""0""",0.0
"""mean""",149.760135,150.760135,,31592000.0,21996000000.0,19174.331081,,2011.368243
"""std""",86.813964,86.813964,,24714000.0,19425000000.0,46072.700205,,3.997822
"""min""",0.0,1.0,"""1theK (원더케이)""",3720000.0,11388000000.0,15.0,"""Comedy""",2005.0
"""max""",299.0,300.0,"""東海オンエア""",237000000.0,216500000000.0,345736.0,"""Sports""",2021.0
"""median""",150.5,151.5,,25700000.0,16337000000.0,1984.5,,2011.0


# Select

## Selecionando por valor na linha
No pandas, fazemos:

    dataframe[dataframe.Genre == "Education"].head()

In [23]:
dataframe.filter(
    pl.col('Genre') == "Education"
).head()

Unnamed: 0_level_0,Rank,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,i64,str,i64,i64,i64,str,i64
1,2,"""Cocomelon - Nu...",154000000,152638999634,861,"""Education""",2006
16,17,"""ChuChu TV Nurs...",61900000,42589514748,546,"""Education""",2013
18,19,"""Super Simple S...",37800000,41715976696,667,"""Education""",2006
20,21,"""Little Baby Bu...",40800000,38840192263,2287,"""Education""",2011
22,23,"""Pinkfong Baby ...",65700000,36688824621,2647,"""Education""",2011


## Selecionando colunas em específico

In [24]:
dataframe['Channel_Started'] # igual ao pandas

Channel_Started
i64
2006
2006
2006
2007
2015
2016
2007
2018
2006
2008


In [25]:
dataframe.Channel_Started # diferente do pandas

AttributeError: 'DataFrame' object has no attribute 'Channel_Started'

No pandas, fazemos:

    dataframe[dataframe.Genre == "Education"]['Channel_Started'].head()

In [26]:
dataframe.filter(
    pl.col('Genre') == "Education"
).select('Channel_Started').head()

Channel_Started
i64
2006
2013
2006
2011
2011


No pandas, fazemos:

    dataframe[['Channel_Started', 'Subscriber_Count', 'Genre']].head()

In [28]:
(dataframe
    .select(pl.col(['Channel_Name', 'Subscriber_Count', 'Genre']))
    .head(5))

Channel_Name,Subscriber_Count,Genre
str,i64,str
"""T-Series""",237000000,"""Music"""
"""Cocomelon - Nu...",154000000,"""Education"""
"""SET India""",152000000,"""Film & Animati..."
"""Sony SAB""",77500000,"""Film & Animati..."
"""✿ Kids Diana S...",108000000,"""People & Blogs..."


In [27]:
dataframe[['Channel_Started', 'Subscriber_Count', 'Genre']] # igual ao pandas

Channel_Started,Subscriber_Count,Genre
i64,i64,str
2006,237000000,"""Music"""
2006,154000000,"""Education"""
2006,152000000,"""Film & Animati..."
2007,77500000,"""Film & Animati..."
2015,108000000,"""People & Blogs..."
2016,104000000,"""People & Blogs..."
2007,93500000,"""Sports"""
2018,93900000,"""Entertainment"""
2006,58700000,"""Film & Animati..."
2008,60200000,"""Film & Animati..."


## Selecionando pelo Index

In [29]:
dataframe[:, 0]

0
1
2
3
4
5
6
7
8
9
10


In [30]:
dataframe[:, 0:3]

Unnamed: 0_level_0,Rank,Channel_Name
i64,i64,str
0,1,"""T-Series"""
1,2,"""Cocomelon - Nu..."
2,3,"""SET India"""
3,4,"""Sony SAB"""
4,5,"""✿ Kids Diana S..."
5,6,"""Like Nastya"""
6,7,"""WWE"""
7,8,"""Vlad and Niki"""
8,9,"""Movieclips"""
9,10,"""Colors TV"""


## Aprofundado filtros

No pandas, fazemos:

    dataframe[dataframe.Subscriber_Count > 10000].head()

In [31]:
(dataframe
    .filter(pl.col('Subscriber_Count') > 10000)
    .head(3)
)

Unnamed: 0_level_0,Rank,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,i64,str,i64,i64,i64,str,i64
0,1,"""T-Series""",237000000,216495600668,18831,"""Music""",2006
1,2,"""Cocomelon - Nu...",154000000,152638999634,861,"""Education""",2006
2,3,"""SET India""",152000000,140138068504,105649,"""Film & Animati...",2006


No pandas, fazemos:

    dataframe[dataframe.Subscriber_Count.between(1000000, 20000000)].head()

In [32]:
(dataframe
    .filter(pl.col('Subscriber_Count').is_between(1000000, 20000000))\
    .head(3)
)

Unnamed: 0_level_0,Rank,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,i64,str,i64,i64,i64,str,i64
51,52,"""Tsuriki Show""",18300000,26198962430,2299,"""Entertainment""",2019
61,62,"""Vijay Televisi...",19000000,24990083268,40319,"""Entertainment""",2007
67,68,"""shakiraVEVO""",18400000,24083534504,194,"""Music""",2009


No pandas, fazemos:

    dataframe[(dataframe.Subscriber_Count < 10000) & (dataframe.Genre == "Music")].head()

In [33]:
(dataframe
    .filter((pl.col('Video_Count') < 10000) & (pl.col('Genre') == "Music"))\
    .head(3)
)

Unnamed: 0_level_0,Rank,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,i64,str,i64,i64,i64,str,i64
10,11,"""Zee Music Comp...",92800000,54295114324,7768,"""Music""",2014
11,12,"""El Reino Infan...",55600000,54151900416,1434,"""Music""",2011
21,22,"""Canal KondZill...",66400000,37297721449,2393,"""Music""",2012


No pandas, fazemos:

    dataframe[dataframe.Genre.is_in(["Music", "Entertainment"])].head()

In [34]:
(dataframe
    .filter(pl.col('Genre').is_in(["Music", "Entertainment"]))
    .head(3)
)

Unnamed: 0_level_0,Rank,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,i64,str,i64,i64,i64,str,i64
0,1,"""T-Series""",237000000,216495600668,18831,"""Music""",2006
7,8,"""Vlad and Niki""",93900000,73333582362,530,"""Entertainment""",2018
10,11,"""Zee Music Comp...",92800000,54295114324,7768,"""Music""",2014


No pandas, fazemos:

    dataframe[dataframe.Channel_Name.str.contains("Music")].head()

In [35]:
(dataframe
    .filter(pl.col('Channel_Name').str.contains("Music"))
    .head(3)
)

Unnamed: 0_level_0,Rank,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,i64,str,i64,i64,i64,str,i64
10,11,"""Zee Music Comp...",92800000,54295114324,7768,"""Music""",2014
19,20,"""Wave Music""",56300000,39258909308,19031,"""Music""",2014
42,43,"""Sony Music Ind...",55900000,27598909527,3457,"""Music""",2009


No pandas, fazemos:

    dataframe[dataframe.Channel_Name.str.starts_with("Sony")].head()

In [36]:
(dataframe
    .filter(pl.col('Channel_Name').str.starts_with("Sony"))
)

Unnamed: 0_level_0,Rank,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,i64,str,i64,i64,i64,str,i64
3,4,"""Sony SAB""",77500000,92952274861,65028,"""Film & Animati...",2007
42,43,"""Sony Music Ind...",55900000,27598909527,3457,"""Music""",2009
49,50,"""Sony PAL""",35900000,26412224144,22211,"""Film & Animati...",2014
52,53,"""SonyMusicIndia...",45200000,26094337405,3369,"""Music""",2009
145,146,"""SonyMusicSouth...",17400000,16774817862,5462,"""Music""",2014
260,261,"""Sony AATH""",18400000,12758281340,16954,"""Film & Animati...",2013


No pandas, fazemos:

    dataframe[dataframe.Channel_Name.str.ends_with("Music")].head()

In [37]:
(dataframe
    .filter(pl.col('Channel_Name').str.ends_with("Music"))
)

Unnamed: 0_level_0,Rank,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,i64,str,i64,i64,i64,str,i64
19,20,"""Wave Music""",56300000,39258909308,19031,"""Music""",2014
56,57,"""EminemMusic""",55400000,25713668530,140,"""Music""",2007
65,66,"""Aditya Music""",27600000,24574327626,21020,"""Music""",2008
134,135,"""Eros Now Music...",32100000,17427689200,9884,"""Entertainment""",2007
149,150,"""Ishtar Music""",39800000,16502312582,4589,"""Film & Animati...",2005
213,214,"""Saregama Music...",32500000,13910019443,6796,"""Entertainment""",2013


## Selecionando e operando

In [38]:
dataframe.select(pl.col("Subscriber_Count").mean().alias("Mean_Subscriber_Count"),
                 pl.col("Channel_Name").n_unique().alias("N_Channels"),
                 pl.col("Channel_Name").str.lengths().max().alias("Max_Lenght"))

Mean_Subscriber_Count,N_Channels,Max_Lenght
f64,u32,u32
31592000.0,296,51


In [39]:
dataframe.head()

Unnamed: 0_level_0,Rank,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,i64,str,i64,i64,i64,str,i64
0,1,"""T-Series""",237000000,216495600668,18831,"""Music""",2006
1,2,"""Cocomelon - Nu...",154000000,152638999634,861,"""Education""",2006
2,3,"""SET India""",152000000,140138068504,105649,"""Film & Animati...",2006
3,4,"""Sony SAB""",77500000,92952274861,65028,"""Film & Animati...",2007
4,5,"""✿ Kids Diana S...",108000000,88452629066,1070,"""People & Blogs...",2015


In [40]:
dataframe.select(pl.col(pl.Utf8).str.lengths().max().suffix("_max_lenght"),
                 pl.col(pl.Int64).mean().prefix("Mean_"))

Channel_Name_max_lenght,Genre_max_lenght,Mean_,Mean_Rank,Mean_Subscriber_Count,Mean_Video_Views,Mean_Video_Count,Mean_Channel_Started
u32,u32,f64,f64,f64,f64,f64,f64
51,16,149.760135,150.760135,31592000.0,21996000000.0,19174.331081,2011.368243


Selecionando todas as colunas (ou parte delas)

In [41]:
dataframe.select(pl.all()).head()

Unnamed: 0_level_0,Rank,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,i64,str,i64,i64,i64,str,i64
0,1,"""T-Series""",237000000,216495600668,18831,"""Music""",2006
1,2,"""Cocomelon - Nu...",154000000,152638999634,861,"""Education""",2006
2,3,"""SET India""",152000000,140138068504,105649,"""Film & Animati...",2006
3,4,"""Sony SAB""",77500000,92952274861,65028,"""Film & Animati...",2007
4,5,"""✿ Kids Diana S...",108000000,88452629066,1070,"""People & Blogs...",2015


In [42]:
dataframe.select(pl.col("*")).head()

Unnamed: 0_level_0,Rank,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,i64,str,i64,i64,i64,str,i64
0,1,"""T-Series""",237000000,216495600668,18831,"""Music""",2006
1,2,"""Cocomelon - Nu...",154000000,152638999634,861,"""Education""",2006
2,3,"""SET India""",152000000,140138068504,105649,"""Film & Animati...",2006
3,4,"""Sony SAB""",77500000,92952274861,65028,"""Film & Animati...",2007
4,5,"""✿ Kids Diana S...",108000000,88452629066,1070,"""People & Blogs...",2015


In [43]:
dataframe.select(pl.all().exclude("Genre")).head()

Unnamed: 0_level_0,Rank,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Channel_Started
i64,i64,str,i64,i64,i64,i64
0,1,"""T-Series""",237000000,216495600668,18831,2006
1,2,"""Cocomelon - Nu...",154000000,152638999634,861,2006
2,3,"""SET India""",152000000,140138068504,105649,2006
3,4,"""Sony SAB""",77500000,92952274861,65028,2007
4,5,"""✿ Kids Diana S...",108000000,88452629066,1070,2015


In [45]:
dataframe.head()

Unnamed: 0_level_0,Rank,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,i64,str,i64,i64,i64,str,i64
0,1,"""T-Series""",237000000,216495600668,18831,"""Music""",2006
1,2,"""Cocomelon - Nu...",154000000,152638999634,861,"""Education""",2006
2,3,"""SET India""",152000000,140138068504,105649,"""Film & Animati...",2006
3,4,"""Sony SAB""",77500000,92952274861,65028,"""Film & Animati...",2007
4,5,"""✿ Kids Diana S...",108000000,88452629066,1070,"""People & Blogs...",2015
5,6,"""Like Nastya""",104000000,88060349741,762,"""People & Blogs...",2016
6,7,"""WWE""",93500000,74447865775,66901,"""Sports""",2007
7,8,"""Vlad and Niki""",93900000,73333582362,530,"""Entertainment""",2018
8,9,"""Movieclips""",58700000,58923017461,40063,"""Film & Animati...",2006
9,10,"""Colors TV""",60200000,58056997206,104523,"""Film & Animati...",2008


# Ajustando Colunas

## Renomeando Colunas
Similar ao pandas

In [46]:
(dataframe
    .rename(mapping = {'Rank': 'Rank_Channels'}))

Unnamed: 0_level_0,Rank_Channels,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,i64,str,i64,i64,i64,str,i64
0,1,"""T-Series""",237000000,216495600668,18831,"""Music""",2006
1,2,"""Cocomelon - Nu...",154000000,152638999634,861,"""Education""",2006
2,3,"""SET India""",152000000,140138068504,105649,"""Film & Animati...",2006
3,4,"""Sony SAB""",77500000,92952274861,65028,"""Film & Animati...",2007
4,5,"""✿ Kids Diana S...",108000000,88452629066,1070,"""People & Blogs...",2015
5,6,"""Like Nastya""",104000000,88060349741,762,"""People & Blogs...",2016
6,7,"""WWE""",93500000,74447865775,66901,"""Sports""",2007
7,8,"""Vlad and Niki""",93900000,73333582362,530,"""Entertainment""",2018
8,9,"""Movieclips""",58700000,58923017461,40063,"""Film & Animati...",2006
9,10,"""Colors TV""",60200000,58056997206,104523,"""Film & Animati...",2008


## Removendo uma coluna

In [48]:
dataframe.drop_in_place("Rank")

Rank
i64
1
2
3
4
5
6
7
8
9
10


In [49]:
dataframe

Unnamed: 0_level_0,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,str,i64,i64,i64,str,i64
0,"""T-Series""",237000000,216495600668,18831,"""Music""",2006
1,"""Cocomelon - Nu...",154000000,152638999634,861,"""Education""",2006
2,"""SET India""",152000000,140138068504,105649,"""Film & Animati...",2006
3,"""Sony SAB""",77500000,92952274861,65028,"""Film & Animati...",2007
4,"""✿ Kids Diana S...",108000000,88452629066,1070,"""People & Blogs...",2015
5,"""Like Nastya""",104000000,88060349741,762,"""People & Blogs...",2016
6,"""WWE""",93500000,74447865775,66901,"""Sports""",2007
7,"""Vlad and Niki""",93900000,73333582362,530,"""Entertainment""",2018
8,"""Movieclips""",58700000,58923017461,40063,"""Film & Animati...",2006
9,"""Colors TV""",60200000,58056997206,104523,"""Film & Animati...",2008


# Limpeza de Dados

## Valores Nulos
No pandas fazemos:

    df.isnull().values.any()

In [50]:
dataframe.null_count()

Unnamed: 0_level_0,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0


In [51]:
(dataframe
    .filter(pl.col('Channel_Name').is_null())
    .head(3)
)

Unnamed: 0_level_0,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,str,i64,i64,i64,str,i64


In [52]:
(dataframe
    .filter(~ pl.col('Channel_Name').is_null())
    .head(3)
)

Unnamed: 0_level_0,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,str,i64,i64,i64,str,i64
0,"""T-Series""",237000000,216495600668,18831,"""Music""",2006
1,"""Cocomelon - Nu...",154000000,152638999634,861,"""Education""",2006
2,"""SET India""",152000000,140138068504,105649,"""Film & Animati...",2006


No pandas, fazemos:
    
    dataframe["Channel_Name"].fillna("Sem nome", inplace = False)

In [53]:
(dataframe
     .with_columns(pl.col('Channel_Name').fill_null(pl.lit("Sem nome")))
)

Unnamed: 0_level_0,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,str,i64,i64,i64,str,i64
0,"""T-Series""",237000000,216495600668,18831,"""Music""",2006
1,"""Cocomelon - Nu...",154000000,152638999634,861,"""Education""",2006
2,"""SET India""",152000000,140138068504,105649,"""Film & Animati...",2006
3,"""Sony SAB""",77500000,92952274861,65028,"""Film & Animati...",2007
4,"""✿ Kids Diana S...",108000000,88452629066,1070,"""People & Blogs...",2015
5,"""Like Nastya""",104000000,88060349741,762,"""People & Blogs...",2016
6,"""WWE""",93500000,74447865775,66901,"""Sports""",2007
7,"""Vlad and Niki""",93900000,73333582362,530,"""Entertainment""",2018
8,"""Movieclips""",58700000,58923017461,40063,"""Film & Animati...",2006
9,"""Colors TV""",60200000,58056997206,104523,"""Film & Animati...",2008


Similar ao pandas

In [54]:
dataframe.drop_nulls()

Unnamed: 0_level_0,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,str,i64,i64,i64,str,i64
0,"""T-Series""",237000000,216495600668,18831,"""Music""",2006
1,"""Cocomelon - Nu...",154000000,152638999634,861,"""Education""",2006
2,"""SET India""",152000000,140138068504,105649,"""Film & Animati...",2006
3,"""Sony SAB""",77500000,92952274861,65028,"""Film & Animati...",2007
4,"""✿ Kids Diana S...",108000000,88452629066,1070,"""People & Blogs...",2015
5,"""Like Nastya""",104000000,88060349741,762,"""People & Blogs...",2016
6,"""WWE""",93500000,74447865775,66901,"""Sports""",2007
7,"""Vlad and Niki""",93900000,73333582362,530,"""Entertainment""",2018
8,"""Movieclips""",58700000,58923017461,40063,"""Film & Animati...",2006
9,"""Colors TV""",60200000,58056997206,104523,"""Film & Animati...",2008


## Duplicados
Funciona similar ao pandas

In [55]:
dataframe.is_duplicated()

false
false
false
false
false
false
false
false
false
false
false


In [58]:
(dataframe
    .unique(subset = ['Genre'], keep = 'first'))

Unnamed: 0_level_0,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,str,i64,i64,i64,str,i64
0,"""T-Series""",237000000,216495600668,18831,"""Music""",2006
1,"""Cocomelon - Nu...",154000000,152638999634,861,"""Education""",2006
2,"""SET India""",152000000,140138068504,105649,"""Film & Animati...",2006
4,"""✿ Kids Diana S...",108000000,88452629066,1070,"""People & Blogs...",2015
6,"""WWE""",93500000,74447865775,66901,"""Sports""",2007
7,"""Vlad and Niki""",93900000,73333582362,530,"""Entertainment""",2018
30,"""LankyBox""",23300000,31155421572,7307,"""Comedy""",2016
37,"""PewDiePie""",111000000,28920114696,4700,"""Gaming""",2010
58,"""5-Minute Craft...",78900000,25251693106,6022,"""Howto & Style""",2016
72,"""Aaj Tak""",55700000,23158451096,242010,"""News & Politic...",2009


# Operações

## Operações entre colunas para gerar uma nova coluna

In [59]:
(dataframe
    .with_columns((pl.col('Video_Views') / pl.col('Video_Count')).alias('Views_per_video'))
    .head(3)
)

Unnamed: 0_level_0,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started,Views_per_video
i64,str,i64,i64,i64,str,i64,f64
0,"""T-Series""",237000000,216495600668,18831,"""Music""",2006,11497000.0
1,"""Cocomelon - Nu...",154000000,152638999634,861,"""Education""",2006,177280000.0
2,"""SET India""",152000000,140138068504,105649,"""Film & Animati...",2006,1326400.0


In [60]:
dataframe.select((pl.col('Video_Views') / pl.col('Video_Count')).alias('Views_per_video'))

Views_per_video
f64
1.1497e7
1.7728e8
1.3264e6
1.4294e6
8.2666e7
1.1556e8
1.1128e6
1.3837e8
1.4708e6
555447.099739


In [61]:
(dataframe
    .with_columns(Views_per_video = pl.col('Video_Views') * pl.col('Video_Count'),
                 Subscribers_per_video = pl.col('Subscriber_Count') * pl.col('Video_Count'))
    .head(3))

Unnamed: 0_level_0,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started,Views_per_video,Subscribers_per_video
i64,str,i64,i64,i64,str,i64,i64,i64
0,"""T-Series""",237000000,216495600668,18831,"""Music""",2006,4076828656179108,4462947000000
1,"""Cocomelon - Nu...",154000000,152638999634,861,"""Education""",2006,131422178684874,132594000000
2,"""SET India""",152000000,140138068504,105649,"""Film & Animati...",2006,14805446799379096,16058648000000


In [62]:
(dataframe
     .select(pl.col('Channel_Name', 'Video_Count', 'Channel_Started', 'Subscriber_Count'))
     .with_columns(
         new_constant_column = pl.lit(1),
         new_column_sum = pl.col('Video_Count').sum(),
         new_column_sum_with_constant = pl.col('Channel_Started') + 10000,
         new_column_case_when = pl.when(pl.col('Subscriber_Count') > 10000000).then(pl.lit("Canal Giga")).otherwise(pl.lit("Canal Ok"))
     )
)

Channel_Name,Video_Count,Channel_Started,Subscriber_Count,new_constant_column,new_column_sum,new_column_sum_with_constant,new_column_case_when
str,i64,i64,i64,i32,i64,i64,str
"""T-Series""",18831,2006,237000000,1,5675602,12006,"""Canal Giga"""
"""Cocomelon - Nu...",861,2006,154000000,1,5675602,12006,"""Canal Giga"""
"""SET India""",105649,2006,152000000,1,5675602,12006,"""Canal Giga"""
"""Sony SAB""",65028,2007,77500000,1,5675602,12007,"""Canal Giga"""
"""✿ Kids Diana S...",1070,2015,108000000,1,5675602,12015,"""Canal Giga"""
"""Like Nastya""",762,2016,104000000,1,5675602,12016,"""Canal Giga"""
"""WWE""",66901,2007,93500000,1,5675602,12007,"""Canal Giga"""
"""Vlad and Niki""",530,2018,93900000,1,5675602,12018,"""Canal Giga"""
"""Movieclips""",40063,2006,58700000,1,5675602,12006,"""Canal Giga"""
"""Colors TV""",104523,2008,60200000,1,5675602,12008,"""Canal Giga"""


## Value Counts

In [63]:
dataframe["Genre"].value_counts()

Genre,counts
str,u32
"""Gaming""",18
"""News & Politic...",11
"""Music""",113
"""Sports""",4
"""Entertainment""",72
"""Film & Animati...",24
"""People & Blogs...",27
"""Pets & Animals...",1
"""Comedy""",11
"""Howto & Style""",1


In [64]:
dataframe["Genre"].value_counts(sort=True)

Genre,counts
str,u32
"""Music""",113
"""Entertainment""",72
"""People & Blogs...",27
"""Film & Animati...",24
"""Gaming""",18
"""Education""",14
"""Comedy""",11
"""News & Politic...",11
"""Sports""",4
"""Howto & Style""",1


# Group By
Levemente parecido no pandas, com a diferença de que no Pandas essa operação costuma retornar um dataframe com MultiIndex que costuma ser um pouco trabalhoso de lidar.

In [65]:
dataframe.groupby('Channel_Started').agg(
    pl.col('Video_Count').count().alias('n'),
    pl.col('Video_Count').mean().alias('mean_video_count'),
    pl.col('Video_Count').median().alias('median_video_count'),
    pl.col('Video_Count').max().alias('max_video_count'),
    pl.col('Video_Count').min().alias('min_video_count'),
    pl.col('Video_Count').sum().cast(pl.Int64).alias('sum_video_count'),
    pl.col('Video_Count').quantile(0.1).alias('q10_video_count'),
    pl.col('Video_Count').quantile(0.9).alias('q90_video_count'),
    pl.col('Channel_Name').first().alias('first_name'),
    pl.col('Channel_Name').last().alias('last_name'),
    pl.col('Channel_Name').n_unique().alias('unique_names')
)

Channel_Started,n,mean_video_count,median_video_count,max_video_count,min_video_count,sum_video_count,q10_video_count,q90_video_count,first_name,last_name,unique_names
i64,u32,f64,f64,i64,i64,i64,f64,f64,str,str,u32
2010,16,25512.625,1555.5,230046,157,408202,177.0,120122.0,"""Shemaroo Filmi...","""DisneyJuniorUK...",16
2007,21,39324.238095,7985.0,345736,77,825809,146.0,66901.0,"""Sony SAB""","""GMA News""",21
2006,32,31093.65625,3940.5,257886,25,994997,163.0,105649.0,"""T-Series""","""Pitbull""",32
2012,31,18282.580645,3038.0,262326,166,566760,246.0,44772.0,"""Canal KondZill...","""JTBC Entertain...",31
2008,18,29639.611111,2407.0,187424,15,533513,120.0,109018.0,"""Colors TV""","""Selena Gomez""",18
2015,17,7312.823529,1315.0,66131,410,124318,474.0,29039.0,"""✿ Kids Diana S...","""CKN""",17
2020,6,1380.0,735.5,4311,445,8280,445.0,4311.0,"""Alan Chikin Ch...","""Junya.じゅんや""",6
2011,23,11041.869565,1685.0,89867,64,253963,133.0,25114.0,"""El Reino Infan...","""ItsFunneh""",23
2014,31,20023.612903,4156.0,159305,128,620732,460.0,84587.0,"""Zee Music Comp...","""eltrece""",31
2021,8,1929.5,1503.0,4913,371,15436,371.0,4913.0,"""LeoNata Family...","""XO TEAM""",8


É possível combinar filtros dentro da agregação durante o Group By, algo que não é possível no Pandas.

Queremos agrupar pelo ano do inicio do canal, separando a média pelos Gêneros.

In [66]:
(dataframe
    .groupby('Channel_Started', maintain_order = True)
    .agg(
        pl.col('Video_Count').filter(pl.col('Genre').is_in(["Music", "Entertainment"])).mean().alias("Video_Count_Fun"),
        pl.col('Video_Count').filter(pl.col('Genre').is_in(["Education"])).mean().alias("Video_Count_Edu")
        )
)

Channel_Started,Video_Count_Fun,Video_Count_Edu
i64,f64,f64
2006,12253.333333,764.0
2007,11223.375,
2015,10606.1,1232.0
2016,12413.25,1297.0
2018,18993.8,1122.0
2008,19456.083333,338.0
2014,31893.882353,520.5
2011,14905.4,2467.0
2013,17983.5,546.0
2005,21427.5,


In [67]:
(dataframe
    .filter(pl.col('Genre').is_in(["Music", "Entertainment"]))
    .groupby('Channel_Started', maintain_order = True)
    .agg(pl.col('Video_Count').mean().alias("Video_Count_Fun"))
)

Channel_Started,Video_Count_Fun
i64,f64
2006,12253.333333
2018,18993.8
2014,31893.882353
2011,14905.4
2015,10606.1
2016,12413.25
2005,21427.5
2012,11133.380952
2008,19456.083333
2007,11223.375


# Ordenando
Funciona similar ao pandas

In [69]:
(dataframe
    .sort(by = 'Video_Count', descending = True)
    .head(3))

Unnamed: 0_level_0,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,str,i64,i64,i64,str,i64
299,"""GMA News""",12900000,11387663843,345736,"""News & Politic...",2007
288,"""ABP NEWS""",35500000,11840564703,262326,"""People & Blogs...",2012
181,"""IndiaTV""",34200000,15056762678,257886,"""News & Politic...",2006


In [70]:
(dataframe
    .sort(by = ['Video_Count', 'Video_Views'], descending = [True, False])
    .head(3))

Unnamed: 0_level_0,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
i64,str,i64,i64,i64,str,i64
299,"""GMA News""",12900000,11387663843,345736,"""News & Politic...",2007
288,"""ABP NEWS""",35500000,11840564703,262326,"""People & Blogs...",2012
181,"""IndiaTV""",34200000,15056762678,257886,"""News & Politic...",2006


# Lazy

In [71]:
dataframe_with_lazy_logic = pl.read_csv("top-300-youtube-channels.csv").lazy()
dataframe_with_lazy_logic.groupby('Channel_Started').agg(
    pl.col('Subscriber_Count').mean(),
    pl.col('Video_Views').mean(),
    pl.col('Video_Count').mean()
).sort('Subscriber_Count')

In [72]:
dataframe_with_lazy_logic = pl.read_csv("top-300-youtube-channels.csv").lazy()
dataframe_with_lazy_logic.groupby('Channel_Started').agg(
    pl.col('Subscriber_Count').mean(),
    pl.col('Video_Views').mean(),
    pl.col('Video_Count').mean()
).sort('Subscriber_Count').collect()

Channel_Started,Subscriber_Count,Video_Views,Video_Count
i64,f64,f64,f64
2021,15337500.0,15779000000.0,1929.5
2019,18300000.0,26199000000.0,2299.0
2020,18350000.0,15160000000.0,1380.0
2009,23944000.0,17301000000.0,13622.666667
2013,26303000.0,17794000000.0,12623.869565
2011,27998000.0,21754000000.0,11041.869565
2010,29843750.0,18520000000.0,25512.625
2005,31140000.0,19770000000.0,33678.3
2012,31535000.0,17469000000.0,18282.580645
2014,31553000.0,21352000000.0,20023.612903


# Convertendo Polars para Pandas

In [73]:
dataframe.to_pandas()

Unnamed: 0,Unnamed: 1,Channel_Name,Subscriber_Count,Video_Views,Video_Count,Genre,Channel_Started
0,0,T-Series,237000000,216495600668,18831,Music,2006
1,1,Cocomelon - Nursery Rhymes,154000000,152638999634,861,Education,2006
2,2,SET India,152000000,140138068504,105649,Film & Animation,2006
3,3,Sony SAB,77500000,92952274861,65028,Film & Animation,2007
4,4,✿ Kids Diana Show,108000000,88452629066,1070,People & Blogs,2015
...,...,...,...,...,...,...,...
291,295,Jake Fellman,15000000,11618626248,440,Entertainment,2012
292,296,DisneyJuniorUK,11300000,11586962179,2789,Entertainment,2010
293,297,JTBC Entertainment,7580000,11512332695,76838,Entertainment,2012
294,298,Mnet TV,9040000,11442069884,21617,Entertainment,2013


# Lógicas de JOIN

In [None]:
df_customers = pl.DataFrame(
    {
        "customer_id": [1, 2, 3],
        "name": ["Alice", "Bob", "Charlie"],
    }
)

df_orders = pl.DataFrame(
    {
        "order_id": ["a", "b", "c"],
        "customer_id": [1, 2, 2],
        "amount": [100, 200, 300],
    }
)

In [None]:
df_left_join = df_customers.join(df_orders, on="customer_id", how="left")
print(df_left_join)

In [None]:
df_outer_join = df_customers.join(df_orders, on="customer_id", how="outer")
print(df_outer_join)

In [None]:
df_colors = pl.DataFrame(
    {
        "color": ["red", "blue", "green"],
    }
)

df_sizes = pl.DataFrame(
    {
        "size": ["S", "M", "L"],
    }
)

In [None]:
df_cross_join = df_colors.join(df_sizes, how="cross")
print(df_cross_join)