## Just getting things set up here.

In [2]:
import pandas as pd
import numpy as np

file = "Resources/purchase_data.csv"

df = pd.read_csv(file)
df.head()

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44


## Player Count and Purchasing Analysis (Total)

In [3]:
# Calculate summary stats.
unique_user_count = df['SN'].nunique()
unique_items = df['Item ID'].nunique()
avg_price = df['Price'].mean()
purchase_count = df['Purchase ID'].count()
ttl_revenue = df['Price'].sum()

# Create a dataframe of these summary stats.
summary_df = pd.DataFrame({"Total Players": [unique_user_count],
                           "Number of Unique Items": [unique_items],
                           "Average Purchase Price": avg_price,
                           "Total Number of Purchases": [purchase_count],
                           "Total Revenue": [ttl_revenue]}
                         )

# Format values. Make them pretty!
summary_df["Total Revenue"] = summary_df["Total Revenue"].map("${:.2f}".format)
summary_df["Average Purchase Price"] = summary_df["Average Purchase Price"].map("${:.2f}".format)

# Show the summary dataframe.
summary_df

Unnamed: 0,Total Players,Number of Unique Items,Average Purchase Price,Total Number of Purchases,Total Revenue
0,576,183,$3.05,780,$2379.77


## Gender Demographics

In [4]:
# Calculate stats for the Gender series.
gender_pct = (df.Gender.value_counts(normalize=True)*100).map("{:.1f}%".format)
gender_count = df.Gender.value_counts()

# Merge the counts and percent into one dataframe.
gender_stats = pd.concat([gender_count,gender_pct], axis=1)
gender_stats.columns = ['Count', 'Percent']

# Percents don't match started Notebook, but I confirmed in the CSV that they are right.
gender_stats

Unnamed: 0,Count,Percent
Male,652,83.6%
Female,113,14.5%
Other / Non-Disclosed,15,1.9%


## Purchasing Analysis (Gender)

In [86]:
# Group the data frame by gender and extract a number of stats from each group.
df_gender = df.groupby(['Gender']).agg({'Purchase ID': "count",'Price': ["mean","sum"], "SN": pd.Series.nunique})

# Remove multi-level index.
df_gender.columns = [''.join(col).strip() for col in df_gender.columns.values]

# Calculate average per person per gender.
df_gender['Average Purchase Total per Person'] = pd.to_numeric(df_gender['Pricesum'])/df_gender['SNnunique']

# Format values. Make them pretty!
df_gender["Pricemean"] = df_gender["Pricemean"].map("${:.2f}".format)
df_gender["Pricesum"] = df_gender["Pricesum"].map("${:.2f}".format)
df_gender["Average Purchase Total per Person"] = df_gender["Average Purchase Total per Person"] .map("${:.2f}".format)
df_gender["SNnunique"] = df_gender["SNnunique"].map("{:,.0f}".format)

# Rename columns.
df_gender = df_gender.rename(columns={'Purchase IDcount':'Purchase Count','Pricemean':"Average Purchase Price","Pricesum":"Total Purchase Value", "SNnunique":'Total Purchasers'})

df_gender

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value,Total Purchasers,Average Purchase Total per Person
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,113,$3.20,$361.94,81,$4.47
Male,652,$3.02,$1967.64,484,$4.07
Other / Non-Disclosed,15,$3.35,$50.19,11,$4.56


## Age Demographics

In [37]:
# Create age groups and add the group labels to the dataframe in a new column.
bins = [0, 10, 15, 20, 25, 30, 35, 40, 45, 50]
group_labels = ["< 10", "11 to 15", "16 to 20", "21 to 25", "26 to 30", "31 to 35", "36 to 40", "41-45", "46+"]
df["Age Group"] = pd.cut(df["Age"], bins, labels=group_labels)

# Group the data frame by month and item and extract a number of stats from each group
df_age = df.groupby(['Age Group']).agg({'Purchase ID': "count", 'Price': ["mean","sum"],"SN": pd.Series.nunique})

# Remove multi-level index.
df_age.columns = [''.join(col).strip() for col in df_age.columns.values]

# Calculate average per person per gender.
df_age['Average Purchase Total per Person'] = pd.to_numeric(df_age['Pricesum'])/df_age['SNnunique']

# Format data.
df_age["Pricemean"] = df_age["Pricemean"].map("${:.2f}".format)
df_age["Pricesum"] = df_age["Pricesum"].map("${:.2f}".format)
df_age["Average Purchase Total per Person"] = df_age["Average Purchase Total per Person"] .map("${:.2f}".format)
df_age["SNnunique"] = df_age["SNnunique"].map("{:,.0f}".format)

# Rename columns.
df_age = df_age.rename(columns={'Purchase IDcount': 'Purchase Count', 'Pricemean': 'Average Purchase Price', 'Pricesum': 'Total Purchase Value', 'SNnunique':'Total Purchasers'})

df_age


Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value,Total Purchasers,Average Purchase Total per Person
Age Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
< 10,32,$3.40,$108.96,24.0,$4.54
11 to 15,54,$2.90,$156.60,41.0,$3.82
16 to 20,200,$3.11,$621.56,150.0,$4.14
21 to 25,325,$3.02,$981.64,232.0,$4.23
26 to 30,77,$2.88,$221.42,59.0,$3.75
31 to 35,52,$2.99,$155.71,37.0,$4.21
36 to 40,33,$3.40,$112.35,26.0,$4.32
41-45,7,$3.08,$21.53,7.0,$3.08
46+,0,$nan,$0.00,,$nan


## Top Spenders

In [110]:
# Identify top spenders.
top_spenders = df.groupby('SN')['Price'].sum().sort_values(ascending=False).head(5).reset_index()
top_spenders = top_spenders['SN']

# Group the data by user and create aggregate measures.
df_spend = df.groupby(['SN']).agg({'Purchase ID': "count",'Price': ["mean","sum"]}).reset_index()

# Remove multi-level index.
df_spend.columns = [''.join(col).strip() for col in df_spend.columns.values]

# Select only top five spenders.
# df_spend.nlargest(5, pd.to_numeric(df_spend['Pricesum'])) 

# Format values. Make them pretty!
df_spend["Pricemean"] = df_spend["Pricemean"].map("${:.2f}".format)
df_spend["Pricesum"] = df_spend["Pricesum"].map("${:.2f}".format)

# Rename columns. Why does this not work consistently?
df_spend.rename(columns={'Purchase IDcount':'Purchase Count','Pricemean':"Average Purchase Price","Pricesum":"Total Purchase Value"})

# Subset to look at only top spenders.
df_spend[df_spend['SN'].isin(top_spenders)].sort_values('Pricesum',ascending=False)

# TUTOR - Why does this not match top spenders in next box?
# test = df_spend.sort_values(by='Pricesum', ascending=False)
# print(test)
#print(df_spend.loc["Lisosia93"])



Unnamed: 0,SN,Purchase IDcount,Pricemean,Pricesum
360,Lisosia93,5,$3.79,$18.96
246,Idastidru52,4,$3.86,$15.45
106,Chamjask73,3,$4.61,$13.83
275,Iral74,4,$3.40,$13.62
281,Iskadarya95,3,$4.37,$13.10


## Most Popular Items

In [126]:
# Identify top products.
items = df.groupby('Item ID')['Price'].sum().sort_values(ascending=False).head(5).reset_index()
items = items['Item ID']

# Group the data by product and create aggregate measures.
top_items = df.groupby(['Item ID','Item Name']).agg({'Purchase ID': "count",'Price': ["mean","sum"]}).reset_index()

# Remove multi-level index.
top_items.columns = [''.join(col).strip() for col in top_items.columns.values]

# Format values. Make them pretty!
top_items["Pricemean"] = top_items["Pricemean"].map("${:.2f}".format)
top_items["Pricesum"] = top_items["Pricesum"].map("${:.2f}".format)

# Subset to look at only top items.
top_five_items_count = top_items[top_items['Item ID'].isin(items)].sort_values('Purchase IDcount',ascending=False)

# Rename columns.
top_five_items_count.rename(columns={'Purchase IDcount':'Purchase Count','Pricemean':"Item Price","Pricesum":"Total Purchase Value"}).reset_index(drop=True)



Unnamed: 0,Item ID,Item Name,Purchase Count,Item Price,Total Purchase Value
0,178,"Oathbreaker, Last Hope of the Breaking Storm",12,$4.23,$50.76
1,82,Nirvana,9,$4.90,$44.10
2,145,Fiery Glass Crusader,9,$4.58,$41.22
3,92,Final Critic,8,$4.88,$39.04
4,103,Singed Scalpel,8,$4.35,$34.80


## Most Profitable Items

In [127]:
# Re-using dataframe from prior cell, subset to look at only top items by total purchase value.
top_five_items_value = top_items[top_items['Item ID'].isin(items)].sort_values('Pricesum',ascending=False)

# Rename columns.
top_five_items_value.rename(columns={'Purchase IDcount':'Purchase Count','Pricemean':"Item Price","Pricesum":"Total Purchase Value"}).reset_index(drop=True)

Unnamed: 0,Item ID,Item Name,Purchase Count,Item Price,Total Purchase Value
0,178,"Oathbreaker, Last Hope of the Breaking Storm",12,$4.23,$50.76
1,82,Nirvana,9,$4.90,$44.10
2,145,Fiery Glass Crusader,9,$4.58,$41.22
3,92,Final Critic,8,$4.88,$39.04
4,103,Singed Scalpel,8,$4.35,$34.80
