In [11]:
import pandas as pd
import altair as alt
from altair_saver import save
import statistics

In [12]:
# Load the Facebook Ad Library report CSV (path is relative to the working directory).
df = pd.read_csv(
    filepath_or_buffer='data/FacebookAdLibraryReport_2020-12-08_MX_lifelong_Colima.csv',
)
# Display the column labels exactly as they were read from the file.
df.columns

Index(['Page ID', 'Page Name', 'Disclaimer', 'Amount Spent (MXN)'], dtype='object')

In [13]:
# Swap the original headers for short snake_case names.
# The list is positional: its order must match the column order of the file.
df.columns = [
    'page_id',
    'page_name',
    'disclaimer',
    'amount_spent',
]

In [14]:
# Convert the spend column to numbers. Some entries carry a '≤' prefix, which is
# removed (literal match, no regex) so the remaining digits can be parsed.
# NOTE(review): a '≤N' entry is presumably an upper bound rather than an exact
# amount, so sums built on this column may overstate spend — confirm.
# The dtype guard keeps this cell re-runnable: once the column is numeric the
# `.str` accessor would raise AttributeError.
if not pd.api.types.is_numeric_dtype(df['amount_spent']):
    df['amount_spent'] = pd.to_numeric(
        df['amount_spent'].str.replace('≤', '', regex=False)
    )

In [15]:
# Total amount spent per page name and per disclaimer.
# reset_index() turns the grouping key back into a regular column, giving
# two-column DataFrames (key, amount_spent).
df_1 = df.groupby('page_name')['amount_spent'].sum().reset_index()
df_2 = df.groupby('disclaimer')['amount_spent'].sum().reset_index()

In [16]:
# Keep only the non-numeric columns (object, bool, category, datetime).
categorical = df.select_dtypes(include=['object', 'bool', 'category', 'datetime64[ns]'])

# Report how many distinct values each of those columns holds.
for column, n_unique in categorical.nunique().items():
    print('Valores únicos en "{}": {}'.format(column, n_unique))

Valores únicos en "page_name": 613
Valores únicos en "disclaimer": 465


In [17]:
# Number of rows in the full table; used as the denominator below.
total = len(df)

# Distinct values per categorical column, as a percentage of all rows.
for column in categorical.columns:
    share = categorical[column].nunique() / total * 100
    print('Porcentaje de valores únicos en "{}": {}%'.format(column, round(share, 2)))

Porcentaje de valores únicos en "page_name": 88.84%
Porcentaje de valores únicos en "disclaimer": 67.39%


In [18]:
# Mode: the most frequent value in each categorical column, skipping 'disclaimer'.
for column in categorical.columns.drop('disclaimer'):
    most_common = statistics.mode(categorical[column])
    print('Valor más popular en "{}": {}'.format(column, most_common))

Valor más popular en "page_name": De Política y Algo Más.


In [19]:
# Frequency table for page_name: raw counts, relative frequency, and the
# relative frequency formatted as a percentage string with one decimal.
counts_page = df['page_name'].value_counts()
percent_page = df['page_name'].value_counts(normalize=True)
percent100_page = percent_page.mul(100).round(1).astype(str) + '%'
pd.DataFrame({
    'counts': counts_page,
    'per': percent_page,
    'per100': percent100_page,
})

Unnamed: 0,counts,per,per100
Elecciones&Encuestas,4,0.005814,0.6%
De Política y Algo Más.,4,0.005814,0.6%
Desde abajo y con la gente,3,0.004360,0.4%
Claudia Yáñez Centeno,3,0.004360,0.4%
Alejandro Rojas Díaz Durán,3,0.004360,0.4%
...,...,...,...
Periodico Hechos Pensamiento En Accion,1,0.001453,0.1%
Pepe Couttolenc,1,0.001453,0.1%
Colima con AMLO,1,0.001453,0.1%
José Juan Espinosa,1,0.001453,0.1%


In [20]:
# Frequency table for disclaimer: raw counts, relative frequency, and the
# relative frequency formatted as a percentage string with one decimal.
counts_disclaimer = df['disclaimer'].value_counts()
percent_disclaimer = df['disclaimer'].value_counts(normalize=True)
percent100_disclaimer = percent_disclaimer.mul(100).round(1).astype(str) + '%'
pd.DataFrame({
    'counts': counts_disclaimer,
    'per': percent_disclaimer,
    'per100': percent100_disclaimer,
})

Unnamed: 0,counts,per,per100
These ads ran without a disclaimer,204,0.295652,29.6%
SOME CONTENT MARKETING,3,0.004348,0.4%
Movimiento Ciudadano,3,0.004348,0.4%
Arturo Vázquez Rodríguez,3,0.004348,0.4%
Logística Digital,2,0.002899,0.3%
...,...,...,...
Jorge Armando Benítez Martin del Campo,1,0.001449,0.1%
Meraki Mkt 360,1,0.001449,0.1%
Hugo Junco Olvera,1,0.001449,0.1%
Virgilio Manuel Andrade Gonzalez,1,0.001449,0.1%


In [21]:
# Group the columns of df by their own labels (axis=1) and sum within each group.
# NOTE(review): the columns were assigned four distinct names earlier, so each
# group appears to hold a single column; df2 is not referenced in the cells
# visible here — confirm whether it is still needed.
# NOTE(review): groupby(axis=1) is deprecated in recent pandas releases — verify
# against the installed version before re-running.
df2 = df.groupby(by=df.columns, axis=1).sum()

In [31]:
%run ggplot2_theme.py



page_name = alt.Chart(df_1).mark_bar().encode(
    x = alt.X('amount_spent:Q', title = "Monto gastado en MXN"),
    y = alt.Y('page_name:N', sort='-x', title = "Página"),
    color = 'amount_spent'
).transform_window(
  rank='rank(amout_spent))',
  sort=[alt.SortField('amount_spent', order='descending')]
).transform_filter(
  alt.datum.rank <= 20
).properties(
    width = 500,
    height= 500,
    title = "Monto gastado en MXN por página de FB"
)

page_name_text = page_name.mark_text(
    align='left',
    baseline='middle',
    dx=3  # Nudges text to right so it doesn't appear on top of the bar
).encode(
    text='amount_spent:Q'
)

page_name = page_name+page_name_text
page_name


