In [800]:
# Dependencies and Setup
import os
import pandas as pd


path = os.path.join("Resources", "purchase_data.csv")

# Read Purchasing File and store into Pandas data frame
df = pd.read_csv(path)
# df = pd.read_csv("../jupyter/UTA-AUS-DATA-PT-07-2020-U-C/homework/04-Pandas/Instructions/HeroesOfPymoli/Resources/purchase_data.csv")

In [801]:
df

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44
...,...,...,...,...,...,...,...
775,775,Aethedru70,21,Female,60,Wolf,3.54
776,776,Iral74,21,Male,164,Exiled Doomblade,1.63
777,777,Yathecal72,20,Male,67,"Celeste, Incarnation of the Corrupted",3.46
778,778,Sisur91,7,Male,101,Final Critic,4.19


In [802]:
# Checking data types
df.dtypes

Purchase ID      int64
SN              object
Age              int64
Gender          object
Item ID          int64
Item Name       object
Price          float64
dtype: object

In [803]:
# Checking if duplicate signatures (SN) exist
df["SN"].value_counts()

Lisosia93      5
Idastidru52    4
Iral74         4
Siallylis44    3
Rarallo90      3
              ..
Eurisuru25     1
Qilunan34      1
Chanosia34     1
Quanrion96     1
Jeyciman68     1
Name: SN, Length: 576, dtype: int64

In [804]:
# Player Count
# Assumption: Same names represent the same person

# Total players by counting unique signatures
totplayers = df["SN"].nunique()

In [805]:
# Print results as dataframe
totplayers_dict = {"Total Players": [totplayers]}
df_totplayers = pd.DataFrame(totplayers_dict, columns = ["Total Players"])
df_totplayers

Unnamed: 0,Total Players
0,576


In [806]:
# Purchasing Analysis (Total) - PAT

# Unique item count
p1 = df["Item ID"].nunique()

In [807]:
# Average purchase price
p2 = "$" + str(round((df.loc[df["Item ID"].unique(), "Price"].mean()), 2))

In [808]:
# Total number of purchases
p3 = df["Purchase ID"].nunique()

In [809]:
# Total revenue
p4 = "$" + str('{:,}'.format(round((df["Price"].sum()), 2)))

In [810]:
# Print Purchasing Analysis (Total) results
p_df = pd.DataFrame({
    "Number of Unique Items": [p1], 
    "Average Price": [p2], 
    "Number of Purchases": [p3], 
    "Total Revenue": [p4]})

p_df

Unnamed: 0,Number of Unique Items,Average Price,Number of Purchases,Total Revenue
0,183,$3.11,780,"$2,379.77"


In [811]:
# Gender demographics

# Count of Male players
M_count = (df[df["Gender"] == "Male"]["SN"].unique()).size

# Percentage of Male players    
M_percent = str(round((M_count/totplayers*100), 2)) + "%"

In [812]:
#Count of Female players
F_count = (df[df["Gender"] == "Female"]["SN"].unique()).size

# Percentage of Female players    
F_percent = str(round((F_count/totplayers*100), 2)) + "%"

In [813]:
#Count of Other / Non-Disclosed players
ON_count = (df[df["Gender"] == "Other / Non-Disclosed"]["SN"].unique()).size

# Percentage of Female players    
ON_percent = str(round((ON_count/totplayers*100), 2)) + "%"

In [814]:
gender_demo = ({"Total Count":[F_count, M_count, ON_count],
              "Percentage of Players":[F_percent, M_percent, ON_percent]})

gender_demo_df = pd.DataFrame(gender_demo, index = ["Female", "Male", "Other / Non-Disclosed"])

gender_demo_df

Unnamed: 0,Total Count,Percentage of Players
Female,81,14.06%
Male,484,84.03%
Other / Non-Disclosed,11,1.91%


In [815]:
#Purchasing analysis (Gender)

# Purchase count by Gender
F_pcount = df.loc[df["Gender"] == "Female"]["Purchase ID"].count()
M_pcount = df.loc[df["Gender"] == "Male"]["Purchase ID"].count()
ON_pcount = df.loc[df["Gender"] == "Other / Non-Disclosed"]["Purchase ID"].count()

In [816]:
# Average purchase price by Gender

F_avgpp = format((df.loc[df["Gender"] == "Female", "Price"].mean()), ".2f")
M_avgpp = format((df.loc[df["Gender"] == "Male", "Price"].mean()), ".2f")
ON_avgpp = format((df.loc[df["Gender"] == "Other / Non-Disclosed", "Price"].mean()), ".2f") 

In [817]:
# Total purchase value by Gender

F_tpv = df.loc[df["Gender"] == "Female"]["Price"].sum()
M_tpv = df.loc[df["Gender"] == "Male"]["Price"].sum()
ON_tpv = df.loc[df["Gender"] == "Other / Non-Disclosed"]["Price"].sum()

In [818]:
# Average purchase total per person by Gender

F_avgpt = format((F_tpv/F_count), ".2f")
M_avgpt = format((M_tpv/M_count), ".2f")
ON_avgpt = format((ON_tpv/ON_count), ".2f")

In [819]:
# Saving results to new dataframe and printing table

panal_gender_df = pd.DataFrame({ 
                "Gender":["Female", "Male", "Other / Non-Disclosed"], 
                "Purchase Count":[F_pcount, M_pcount, ON_pcount], 
                "Average Purchase Price":[F_avgpp, M_avgpp, ON_avgpp],
                "Total Purchase Value":[F_tpv, M_tpv, ON_tpv],
                "Avg Total Purchase per Person":[F_avgpt, M_avgpt, ON_avgpt]})

df_indexed = panal_gender_df.set_index("Gender")
df_indexed["Average Purchase Price"] = df_indexed["Average Purchase Price"].apply(lambda avgpp: f"${avgpp}")
df_indexed["Total Purchase Value"] = df_indexed["Total Purchase Value"].apply(lambda tpv: f"${tpv}")
df_indexed["Avg Total Purchase per Person"] = df_indexed["Avg Total Purchase per Person"].apply(lambda avgpt: f"${avgpt}")
df_indexed

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value,Avg Total Purchase per Person
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,113,$3.20,$361.94,$4.47
Male,652,$3.02,$1967.64,$4.07
Other / Non-Disclosed,15,$3.35,$50.19,$4.56


In [820]:
# Age demographics

df["Age"].min()

7

In [821]:
df["Age"].max()

45

In [822]:
# Average purchase price by Age Group

# Creating age categories by binning
df_bin = (pd.cut(df["Age"], bins=[5, 10, 15, 20, 25, 30, 35, 40, 46], labels=["<10", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39", "40+"], right=False))

# Total count of players per Age group
play_ct_age = df.groupby(df_bin)["SN"].nunique()

# Percentage of players per Age group
play_pt_age = round((((df.groupby(df_bin)["SN"].nunique()) / totplayers)*100), 2)

In [823]:
# Age demographic table

age_demo_df = pd.DataFrame({"Total Count":play_ct_age,
                     "Percentage of Players":play_pt_age})

age_demo_df2 = pd.DataFrame(age_demo_df)

age_demo_df2



Unnamed: 0_level_0,Total Count,Percentage of Players
Age,Unnamed: 1_level_1,Unnamed: 2_level_1
<10,17,2.95
10-14,22,3.82
15-19,107,18.58
20-24,258,44.79
25-29,77,13.37
30-34,52,9.03
35-39,31,5.38
40+,12,2.08


In [824]:
# Purchase count by Age
pur_ct_age = df.groupby(df_bin)["Purchase ID"].nunique()

# Avg purchase price by Age
avg_pp_age = round((df.groupby(df_bin)["Price"].mean()), 2).apply(lambda avg_pp_age: f"${avg_pp_age}")

# Total purchase value by Age
tpv_age = round((df.groupby(df_bin)["Price"].sum()), 2).apply(lambda tpv_age: f"${tpv_age}")

# Number of people per age group
peop_age = df.groupby(df_bin)["SN"].nunique()

# Average purchase total per person by Age Group
avg_pp_age = round(((df.groupby(df_bin)["Price"].sum())/peop_age), 2).apply(lambda avg_pp_age: f"${avg_pp_age}")

In [825]:
# Purchasing analysis table (by Age)

panal_age_df = pd.DataFrame({ 
                "Age Ranges":["<10", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39", "40+"], 
                "Purchase Count":pur_ct_age, 
                "Average Purchase Price":avg_pp_age,
                "Total Purchase Value":tpv_age,
                "Avg Total Purchase per Person":avg_pp_age})


panal_age_df2 = pd.DataFrame(panal_age_df)

panal_age_df2

Unnamed: 0_level_0,Age Ranges,Purchase Count,Average Purchase Price,Total Purchase Value,Avg Total Purchase per Person
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<10,<10,23,$4.54,$77.13,$4.54
10-14,10-14,28,$3.76,$82.78,$3.76
15-19,15-19,136,$3.86,$412.89,$3.86
20-24,20-24,365,$4.32,$1114.06,$4.32
25-29,25-29,101,$3.81,$293.0,$3.81
30-34,30-34,73,$4.12,$214.0,$4.12
35-39,35-39,41,$4.76,$147.67,$4.76
40+,40+,13,$3.19,$38.24,$3.19


In [826]:
# for label, content in df.items():
#     print(f'label: {label}')
#     print(f'content: {content}', sep='\n')

In [827]:
# from pprint import pprint
# # creating a dictionary of lists, every bin item gets a list of purchase ID's
# purchases_dict = {}
# for key in df_bin.keys():
#     bin_item = df_bin[key]
#     if bin_item not in purchases_dict:
#         purchases_dict[bin_item] = [key]
#     else:
#         purchases_dict[bin_item].append(key)

# #pprint(purchases_dict)
# # age_groups = df.groupby(['Age', pd.cut(df, df_bin)])
# # age_groups


In [828]:
# Total purchase value by Age

In [829]:
# Average purchase total per person by Age

In [830]:
# Top spenders

df_uniqbuyer = df['SN'].value_counts()
df_uniqbuyer

Lisosia93      5
Idastidru52    4
Iral74         4
Siallylis44    3
Rarallo90      3
              ..
Eurisuru25     1
Qilunan34      1
Chanosia34     1
Quanrion96     1
Jeyciman68     1
Name: SN, Length: 576, dtype: int64

In [831]:
# Most popular items

# Retrieving relevant data

df_pop = df.drop(
        [
            "Purchase ID",
            "SN",
            "Age",
            "Gender",
        ],
        axis=1,
)

df_pop

Unnamed: 0,Item ID,Item Name,Price
0,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,143,Frenzied Scimitar,1.56
2,92,Final Critic,4.88
3,100,Blindscythe,3.27
4,131,Fury,1.44
...,...,...,...
775,60,Wolf,3.54
776,164,Exiled Doomblade,1.63
777,67,"Celeste, Incarnation of the Corrupted",3.46
778,101,Final Critic,4.19


In [832]:
df.groupby(["Item ID", "Item Name"])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f1dc7781710>