# Day 76
Exploratory analysis of app store data (`apps.csv`) with pandas and Plotly Express.

In [None]:
import pandas as pd
import plotly.express as px
# Render floats with thousands separators and two decimals, e.g. 1,234.57
pd.set_option('display.float_format', '{:,.2f}'.format)

# Raw app data; the path is relative to the notebook's working directory
df_apps = pd.read_csv('apps.csv')

In [None]:
# Quick look at the raw frame: list its column names.
# The commented-out lines are alternative checks (dimensions / one random row);
# only the last expression of a cell is rendered, so enable one at a time.
#df_apps.shape
df_apps.columns
#df_apps.sample()

In [None]:
# Data cleaning.
# The pipeline is a single expression that never modifies df_apps, so the cell
# can be re-run safely; dropping the columns in place would raise a KeyError on
# a second run because the columns would already be gone.
df_apps_clean = (
    df_apps
    .drop(columns=['Last_Updated', 'Android_Ver'])      # columns not used in this analysis
    .dropna()                                           # discard rows with any missing value
    .drop_duplicates(subset=['App', 'Type', 'Price'])   # keep the first row per (App, Type, Price)
)

In [None]:
# Data analysis

# Optional rankings; uncomment one to see the five rows with the largest value:
#df_apps_clean.sort_values(by='Size_MBs', ascending=False).head()
#df_apps_clean.sort_values(by='Rating', ascending=False).head()
#df_apps_clean.sort_values(by='Reviews', ascending=False).head()

# Number of apps in each content-rating class
content_ratings = df_apps_clean['Content_Rating'].value_counts()
print(content_ratings)

In [None]:
# Pie chart of the content-rating distribution.
# `names` supplies the sector labels and `values` the sector sizes. `labels` is
# deliberately not passed: in Plotly Express it is a dict that renames columns
# in titles, legends and hover text, not a list of sector labels.
pizza = px.pie(
    names=content_ratings.index,
    values=content_ratings.values,
    title='Content Rating Distribution',
)
# Put the percentage and the rating name outside each sector
pizza.update_traces(textposition='outside', textinfo='percent+label')
pizza.show()

In [None]:
# Donut chart of the content-rating distribution (a pie with a hole in the middle).
# `names` supplies the sector labels and `values` the sector sizes. `labels` is
# deliberately not passed: in Plotly Express it is a dict that renames columns
# in titles, legends and hover text, not a list of sector labels.
donut = px.pie(
    names=content_ratings.index,
    values=content_ratings.values,
    title='Content Rating Distribution',
    hole=0.3,
)
# Show only the percentage, inside each sector
donut.update_traces(textposition='inside', textfont_size=15, textinfo='percent')
donut.show()

In [None]:
# Convert the text-encoded numeric columns, drop price outliers and estimate
# the gross revenue of each app.

# Remove ',' from Installs and '$' from Price before parsing them as numbers.
# regex=False forces a literal replacement: '$' is a regular-expression
# metacharacter and the default of `regex` differs between pandas versions.
installs_numeric = pd.to_numeric(
    df_apps_clean['Installs'].astype(str).str.replace(',', '', regex=False)
)
price_numeric = pd.to_numeric(
    df_apps_clean['Price'].astype(str).str.replace('$', '', regex=False)
)

# assign() builds a new frame instead of writing columns into a frame that may
# be a filtered view, which can trigger SettingWithCopyWarning.
df_apps_clean = df_apps_clean.assign(Installs=installs_numeric, Price=price_numeric)

# Drop outlier apps: keep only those priced below 250
max_price = 250
df_apps_clean = df_apps_clean[df_apps_clean['Price'] < max_price].copy()

# Gross revenue estimate = installs x price (0 for apps whose price is 0)
df_apps_clean['Revenue_Estimate'] = df_apps_clean['Installs'].mul(df_apps_clean['Price'])

# Ten apps with the highest estimated revenue
df_apps_clean.sort_values('Revenue_Estimate', ascending=False).head(10)

In [None]:
# Bar chart: number of apps in each of the ten largest categories.
#print(f'There is {df_apps_clean.Category.nunique()} app categories.')
apps_per_category = df_apps_clean['Category'].value_counts()

# value_counts() orders by count, largest first, so the head is the top 10
top_categories = apps_per_category.head(10)

bar_graph = px.bar(
    x=top_categories.index,   # category names
    y=top_categories.values,  # apps in each category
)
bar_graph.update_xaxes(title_text='Categories')
bar_graph.update_yaxes(title_text='App Count')
bar_graph.show()

In [None]:
# Horizontal bar chart: total installs per category.
# category_installs is indexed by Category and sorted by Installs, ascending.
category_installs = (
    df_apps_clean
    .groupby('Category')
    .agg({'Installs': 'sum'})
    .sort_values('Installs', ascending=True)
)

hor_bar_graph = px.bar(
    x=category_installs['Installs'],
    y=category_installs.index,
    orientation='h',
    title='Category Popularity',
)
hor_bar_graph.update_xaxes(title_text='Downloads')
hor_bar_graph.update_yaxes(title_text='Category')
hor_bar_graph.show()

In [None]:
# Scatter plot of category concentration: number of apps vs. total installs
# per category (hover over the points to see the category names).
category_count = df_apps_clean.groupby('Category').agg({'App': 'count'})

# Both frames are indexed by Category, so the merge keeps Category as the index.
# sort_values() returns a new frame; chaining it keeps the sorted result
# instead of discarding it.
category_df = (
    pd.merge(category_count, category_installs, on='Category', how='inner')
    .sort_values('Installs', ascending=False)
)

scatter_graph = px.scatter(
    category_df,                    # data source
    x='App',                        # number of apps in the category
    y='Installs',                   # total installs of the category
    title='Category Concentration Scattered Graph',
    size='App',
    hover_name=category_df.index,   # category name shown on hover
    color='Installs',
)

scatter_graph.update_layout(
    xaxis_title="Number of Apps (If Lower = More Concentrated)",
    yaxis_title="Installs",
    yaxis=dict(type='log'),         # log scale for the install totals
)

scatter_graph.show()

In [None]:
# An app can list several genres separated by ';'. Split them into one entry
# per (app, genre) pair and count how often each genre occurs.
stash = df_apps_clean['Genres'].str.split(';', expand=True).stack()
num_genres = stash.value_counts()
#print(f'Number of genres: {len(num_genres)}')

# value_counts() orders by count, largest first, so the head is the top 10
top_genres = num_genres.head(10)

graph_bar = px.bar(
    x=top_genres.index,         # genre name
    y=top_genres.values,        # number of apps
    title='Top Genres',
    hover_name=top_genres.index,
    color=top_genres.values,
    color_continuous_scale='picnic',
)

graph_bar.update_layout(
    xaxis_title='Genre',
    yaxis_title='Number of Apps',
    coloraxis_showscale=False,  # hide the colour bar
)

graph_bar.show()

In [None]:
# Number of paid and free apps per category, as grouped bars.

# One row per (Category, Type) pair; the 'App' column holds the app count
paid_or_free = (
    df_apps_clean
    .groupby(['Category', 'Type'], as_index=False)
    .agg({'App': 'count'})
)

g_bar = px.bar(
    paid_or_free,
    x='Category',
    y='App',
    color='Type',
    barmode='group',
    title='Free vs Paid Apps by Category',
)

# Order categories by their total count; log scale on the count axis
g_bar.update_xaxes(title_text='Category', categoryorder='total descending')
g_bar.update_yaxes(title_text='Apps Count', type='log')

g_bar.show()

In [None]:
# Box plot of install counts, one box per app type.
box_plot = px.box(
    df_apps_clean,
    x='Type',
    y='Installs',
    color='Type',
    points='all',    # plot every app as an individual point as well
    notched=True,
    title='Lost Downloads in Paid Apps',
)

# Log scale on the installs axis
box_plot.update_yaxes(type='log')

box_plot.show()

In [None]:
# Estimated revenue of paid apps, one box per category.
is_paid = df_apps_clean['Type'].eq('Paid')
df_paid_apps = df_apps_clean[is_paid]

box_plot = px.box(
    df_paid_apps,
    x='Category',
    y='Revenue_Estimate',
    title='Paid apps earnings',
)

# Order categories by their minimum value; log scale on the revenue axis
box_plot.update_xaxes(title_text='Category', categoryorder='min ascending')
box_plot.update_yaxes(title_text='Paid App Revenue', type='log')

box_plot.show()

In [None]:
# Price distribution of paid apps, one box per category.
box_plot = px.box(df_paid_apps, x='Category', y='Price', title='Price per Category')

# Order categories by their maximum value, largest first; log scale on the price axis
box_plot.update_xaxes(title_text='Category', categoryorder='max descending')
box_plot.update_yaxes(title_text='Price', type='log')

box_plot.show()