In [1159]:
# Dependencies and Setup
import os
import pandas as pd


path = os.path.join("Resources", "purchase_data.csv")

# Read Purchasing File and store into Pandas data frame
df = pd.read_csv(path)
# df = pd.read_csv("../jupyter/UTA-AUS-DATA-PT-07-2020-U-C/homework/04-Pandas/Instructions/HeroesOfPymoli/Resources/purchase_data.csv")

In [1160]:
df

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44
...,...,...,...,...,...,...,...
775,775,Aethedru70,21,Female,60,Wolf,3.54
776,776,Iral74,21,Male,164,Exiled Doomblade,1.63
777,777,Yathecal72,20,Male,67,"Celeste, Incarnation of the Corrupted",3.46
778,778,Sisur91,7,Male,101,Final Critic,4.19


In [1161]:
# Checking data types
df.dtypes

Purchase ID      int64
SN              object
Age              int64
Gender          object
Item ID          int64
Item Name       object
Price          float64
dtype: object

In [1162]:
# Checking if duplicate screen names (SN) exist
df["SN"].value_counts()

Lisosia93      5
Idastidru52    4
Iral74         4
Siallylis44    3
Rarallo90      3
              ..
Eurisuru25     1
Qilunan34      1
Chanosia34     1
Quanrion96     1
Jeyciman68     1
Name: SN, Length: 576, dtype: int64

In [1163]:
# Player Count
# Assumption: Same names represent the same person

# Total players by counting unique screen names
totplayers = df["SN"].nunique()

In [1164]:
# Print results as dataframe
totplayers_dict = {"Total Players": [totplayers]}
df_totplayers = pd.DataFrame(totplayers_dict, columns = ["Total Players"])
df_totplayers

Unnamed: 0,Total Players
0,576


In [1165]:
# Purchasing Analysis (Total) - PAT

# Unique item count
p1 = df["Item ID"].nunique()
p1

183

In [1203]:
# Average purchase price
p2 = "$" + str(round((df.loc[df["Item ID"].unique(), "Price"].mean()), 2))
p2

'$3.11'

In [1167]:
# Total number of purchases
p3 = df["Purchase ID"].nunique()
p3

780

In [1168]:
# Total revenue
p4 = "$" + str('{:,}'.format(round((df["Price"].sum()), 2)))
p4

'$2,379.77'

In [1169]:
# Print Purchasing Analysis (Total) results
p_df = pd.DataFrame({
    "Number of Unique Items": [p1], 
    "Average Price": [p2], 
    "Number of Purchases": [p3], 
    "Total Revenue": [p4]})

p_df

Unnamed: 0,Number of Unique Items,Average Price,Number of Purchases,Total Revenue
0,183,$3.11,780,"$2,379.77"


In [1170]:
# Gender demographics

# Count of Male players
M_count = (df[df["Gender"] == "Male"]["SN"].unique()).size
M_count

484

In [1171]:
# Percentage of Male players    
M_percent = str(round((M_count/totplayers*100), 2)) + "%"
M_percent

'84.03%'

In [1172]:
# Count of Female players
F_count = (df[df["Gender"] == "Female"]["SN"].unique()).size
F_count

81

In [1173]:
# Percentage of Female players    
F_percent = str(round((F_count/totplayers*100), 2)) + "%"
F_percent

'14.06%'

In [1174]:
#Count of Other / Non-Disclosed players
ON_count = (df[df["Gender"] == "Other / Non-Disclosed"]["SN"].unique()).size
ON_count

11

In [1175]:
# Percentage of Female players    
ON_percent = str(round((ON_count/totplayers*100), 2)) + "%"
ON_percent

'1.91%'

In [1176]:
gender_demo = ({"Total Count":[F_count, M_count, ON_count],
              "Percentage of Players":[F_percent, M_percent, ON_percent]})

gender_demo_df = pd.DataFrame(gender_demo, index = ["Female", "Male", "Other / Non-Disclosed"])

gender_demo_df

Unnamed: 0,Total Count,Percentage of Players
Female,81,14.06%
Male,484,84.03%
Other / Non-Disclosed,11,1.91%


In [1177]:
#Purchasing analysis (Gender)

# Purchase count by Gender
F_pcount = df.loc[df["Gender"] == "Female"]["Purchase ID"].count()
F_pcount

113

In [1178]:
M_pcount = df.loc[df["Gender"] == "Male"]["Purchase ID"].count()
M_pcount

652

In [1179]:
ON_pcount = df.loc[df["Gender"] == "Other / Non-Disclosed"]["Purchase ID"].count()
ON_pcount

15

In [1180]:
# Average purchase price by Gender

F_avgpp = format((df.loc[df["Gender"] == "Female", "Price"].mean()), ".2f")
F_avgpp

'3.20'

In [1181]:
M_avgpp = format((df.loc[df["Gender"] == "Male", "Price"].mean()), ".2f")
M_avgpp

'3.02'

In [1182]:
ON_avgpp = format((df.loc[df["Gender"] == "Other / Non-Disclosed", "Price"].mean()), ".2f") 
ON_avgpp

'3.35'

In [1183]:
# Total purchase value by Gender

F_tpv = df.loc[df["Gender"] == "Female"]["Price"].sum()
F_tpv

361.94

In [1184]:
M_tpv = df.loc[df["Gender"] == "Male"]["Price"].sum()
M_tpv

1967.64

In [1185]:
ON_tpv = df.loc[df["Gender"] == "Other / Non-Disclosed"]["Price"].sum()
ON_tpv

50.19

In [1186]:
# Average purchase total per person by Gender

F_avgpt = format((F_tpv/F_count), ".2f")
F_avgpt

'4.47'

In [1187]:
M_avgpt = format((M_tpv/M_count), ".2f")
M_avgpt

'4.07'

In [1188]:
ON_avgpt = format((ON_tpv/ON_count), ".2f")
ON_avgpt

'4.56'

In [1189]:
# Saving results to new dataframe and printing table

panal_gender_df = pd.DataFrame({ 
                "Gender":["Female", "Male", "Other / Non-Disclosed"], 
                "Purchase Count":[F_pcount, M_pcount, ON_pcount], 
                "Average Purchase Price":[F_avgpp, M_avgpp, ON_avgpp],
                "Total Purchase Value":[F_tpv, M_tpv, ON_tpv],
                "Avg Total Purchase per Person":[F_avgpt, M_avgpt, ON_avgpt]})

df_indexed = panal_gender_df.set_index("Gender")
df_indexed["Average Purchase Price"] = df_indexed["Average Purchase Price"].apply(lambda avgpp: f"${avgpp}")
df_indexed["Total Purchase Value"] = df_indexed["Total Purchase Value"].apply(lambda tpv: f"${tpv}")
df_indexed["Avg Total Purchase per Person"] = df_indexed["Avg Total Purchase per Person"].apply(lambda avgpt: f"${avgpt}")
df_indexed

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value,Avg Total Purchase per Person
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,113,$3.20,$361.94,$4.47
Male,652,$3.02,$1967.64,$4.07
Other / Non-Disclosed,15,$3.35,$50.19,$4.56


In [1190]:
# Age demographics

df["Age"].min()

7

In [1191]:
df["Age"].max()

45

In [1192]:
# Average purchase price by Age Group

# Creating age categories by binning
df_bin = (pd.cut(df["Age"], bins=[5, 10, 15, 20, 25, 30, 35, 40, 46], labels=["<10", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39", "40+"], right=False))
df_bin

0      20-24
1        40+
2      20-24
3      20-24
4      20-24
       ...  
775    20-24
776    20-24
777    20-24
778      <10
779    20-24
Name: Age, Length: 780, dtype: category
Categories (8, object): [<10 < 10-14 < 15-19 < 20-24 < 25-29 < 30-34 < 35-39 < 40+]

In [1193]:
# Total count of players per Age group
play_ct_age = df.groupby(df_bin)["SN"].nunique()
play_ct_age 

Age
<10       17
10-14     22
15-19    107
20-24    258
25-29     77
30-34     52
35-39     31
40+       12
Name: SN, dtype: int64

In [1194]:
# Percentage of players per Age group
play_pt_age = round((((df.groupby(df_bin)["SN"].nunique()) / totplayers)*100), 2)
play_pt_age

Age
<10      2.95
10-14    3.82
15-19   18.58
20-24   44.79
25-29   13.37
30-34    9.03
35-39    5.38
40+      2.08
Name: SN, dtype: float64

In [1205]:
# Age demographic table

age_demo_df = pd.DataFrame({"Total Count":play_ct_age,
                     "Percentage of Players":play_pt_age})

age_demo_df2 = pd.DataFrame(age_demo_df)

age_demo_df2



Unnamed: 0_level_0,Total Count,Percentage of Players
Age,Unnamed: 1_level_1,Unnamed: 2_level_1
<10,17,2.95
10-14,22,3.82
15-19,107,18.58
20-24,258,44.79
25-29,77,13.37
30-34,52,9.03
35-39,31,5.38
40+,12,2.08


In [1196]:
# Purchase count by Age
pur_ct_age = df.groupby(df_bin)["Purchase ID"].nunique()

# Avg purchase price by Age
avg_pp_age = round((df.groupby(df_bin)["Price"].mean()), 2).apply(lambda avg_pp_age: f"${avg_pp_age}")
avg_pp_age

Age
<10      $3.35
10-14    $2.96
15-19    $3.04
20-24    $3.05
25-29     $2.9
30-34    $2.93
35-39     $3.6
40+      $2.94
Name: Price, dtype: object

In [1197]:
# Total purchase value by Age
tpv_age = (df.groupby(df_bin)["Price"].sum()).map("${:,.2f}".format)
tpv_age

Age
<10         $77.13
10-14       $82.78
15-19      $412.89
20-24    $1,114.06
25-29      $293.00
30-34      $214.00
35-39      $147.67
40+         $38.24
Name: Price, dtype: object

In [1198]:
# Number of people per age group
peop_age = df.groupby(df_bin)["SN"].nunique()

# Average purchase total per person by Age Group
avg_pp_age = ((df.groupby(df_bin)["Price"].sum())/peop_age).map("${:,.2f}".format)
avg_pp_age

Age
<10      $4.54
10-14    $3.76
15-19    $3.86
20-24    $4.32
25-29    $3.81
30-34    $4.12
35-39    $4.76
40+      $3.19
dtype: object

In [1199]:
# Purchasing analysis table (by Age)

panal_age_df = ({"Purchase Count":pur_ct_age, 
                "Average Purchase Price":avg_pp_age,
                "Total Purchase Value":tpv_age,
                "Avg Total Purchase per Person":avg_pp_age})


panal_age_df2 = pd.DataFrame(panal_age_df)

panal_age_df2

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value,Avg Total Purchase per Person
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<10,23,$4.54,$77.13,$4.54
10-14,28,$3.76,$82.78,$3.76
15-19,136,$3.86,$412.89,$3.86
20-24,365,$4.32,"$1,114.06",$4.32
25-29,101,$3.81,$293.00,$3.81
30-34,73,$4.12,$214.00,$4.12
35-39,41,$4.76,$147.67,$4.76
40+,13,$3.19,$38.24,$3.19


In [1200]:
# Top spenders

# Group purchase data by screen name
play_counts_df = pd.DataFrame(df.groupby("SN").count())

# Count total purchases by name
play_sum_df = pd.DataFrame(df.groupby("SN").sum())

spenders_df = play_sum_df
spenders_df = spenders_df.sort_values(["Price"], ascending=False)

spenders_df["Purchase Count"] = play_counts_df["Item ID"]
spenders_df["Average Purchase Price"] = round(spenders_df["Price"]/spenders_df["Purchase Count"],2).map("${:,.2f}".format)
spenders_df["Total Purchase Value"] = spenders_df["Price"].map("${:,.2f}".format)

spenders_df = spenders_df.drop(labels = ["Age","Item ID","Price"],axis = 1)

# Top 5 spenders
spenders_df.head(5)

Unnamed: 0_level_0,Purchase ID,Purchase Count,Average Purchase Price,Total Purchase Value
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Lisosia93,1630,5,$3.79,$18.96
Idastidru52,1999,4,$3.86,$15.45
Chamjask73,1306,3,$4.61,$13.83
Iral74,2285,4,$3.40,$13.62
Iskadarya95,713,3,$4.37,$13.10


In [None]:
# Total purchase value by Age

In [None]:
# Average purchase total per person by Age

In [None]:
# Top spenders

df_uniqbuyer = df['SN'].value_counts()
df_uniqbuyer

In [None]:
# Most popular items

# Retrieving relevant data

df_pop = df.drop(
        [
            "Purchase ID",
            "SN",
            "Age",
            "Gender",
        ],
        axis=1,
)

df_pop

In [None]:
df.groupby(["Item ID", "Item Name"])