### Note
* Instructions have been included for each segment. You do not have to follow them exactly, but they are included to help you think through the steps.

In [None]:
# Julie Baker
# Heroes of Pymoli
# June 2021

In [1]:
# Dependencies and Setup --> need to make sure it's on my 3.8.5 to get to work right now
import pandas as pd
import os
import csv

# File to Load (Remember to Change These)
file_to_load = os.path.join(os.getcwd(), "Resources", "purchase_data.csv")

# Read Purchasing File and store into Pandas data frame
purchase_df = pd.read_csv(file_to_load)

In [2]:
# just looking at data
purchase_df.head(5)

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44


## Player Count

* Display the total number of players


In [3]:
# SN unique
user_count = len(purchase_df.SN.unique())
print(user_count)

576


## Purchasing Analysis (Total)

* Run basic calculations to obtain number of unique items, average price, etc.


* Create a summary data frame to hold the results


* Optional: give the displayed data cleaner formatting


* Display the summary data frame


In [4]:
## WORK ON FORMATTING IF TIME LATER ##
# ===================================
# unique items
item_count = len(purchase_df["Item ID"].unique())
# average purchase price
avg_purchase = purchase_df.Price.mean()
#.. avg_purchase['Average Price'] = avg_purchase['Average Price'].map("${:,.2}".format)
# total number of purchases
total_purchases = len(purchase_df['Purchase ID'])
# total revenue
total_rev = purchase_df.Price.sum()

purchase_analysis_df = pd.DataFrame(data = [[user_count, item_count, avg_purchase, total_purchases, total_rev]], columns=["Unique Users", "Unique Items", "Average Price", "Total Purchases", "Total Revenue"])

### pd.options.display.float_format = '${:, .2f}'.format

purchase_analysis_df

Unnamed: 0,Unique Users,Unique Items,Average Price,Total Purchases,Total Revenue
0,576,179,3.050987,780,2379.77


In [5]:
# Renaming columns to remove spaces 
update_purchase_df = purchase_df.rename(columns={'Purchase ID':'PurchaseID', 'Item ID' : 'ItemID', 'Item Name' : 'ItemName'})

update_purchase_df

Unnamed: 0,PurchaseID,SN,Age,Gender,ItemID,ItemName,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44
...,...,...,...,...,...,...,...
775,775,Aethedru70,21,Female,60,Wolf,3.54
776,776,Iral74,21,Male,164,Exiled Doomblade,1.63
777,777,Yathecal72,20,Male,67,"Celeste, Incarnation of the Corrupted",3.46
778,778,Sisur91,7,Male,92,Final Critic,4.19


## Gender Demographics

* Percentage and Count of Male Players


* Percentage and Count of Female Players


* Percentage and Count of Other / Non-Disclosed




In [6]:
# gender demographics; based on code example from stackoverflow.com
gender_incorrect = update_purchase_df.groupby(['Gender'])['Gender'].count()

## THERE IS A PROBLEM WITH THIS -- it's based on all lines of data, but should just be subset based on user id to remove duplicates--discrepancy can be seen between these values and those generated from the newly created dataframes below
gender_incorrect

Gender
Female                   113
Male                     652
Other / Non-Disclosed     15
Name: Gender, dtype: int64

In [7]:
# Creating dataframes for each gender group because using the original dataframe doesn't take duplicate users into account

# Female users
female_users_df = update_purchase_df.loc[update_purchase_df["Gender"] == "Female", :]
# Male users
male_users_df = update_purchase_df.loc[update_purchase_df["Gender"] == "Male", :]
# Other users
other_users_df = update_purchase_df.loc[update_purchase_df["Gender"] == "Other / Non-Disclosed", :]

In [8]:
# female count
female_count = len(female_users_df.SN.unique())
# female percentage
female_percent = (female_count / user_count)*100
print(female_count, female_percent)

81 14.0625


In [9]:
# male count & percentage
male_count = len(male_users_df.SN.unique())
male_percent = male_count / user_count*100
print(male_count, male_percent)

484 84.02777777777779


In [10]:
# other count & percentage
other_count = len(other_users_df.SN.unique())
other_percent = (other_count / user_count)*100
print(other_count, other_percent)

11 1.9097222222222223


In [14]:
# Gender Demographics Output

gender_demo_df = pd.DataFrame({
    "Gender": ["Female", "Male", "Other / Not Disclosed", "Total Players"],
    "Player Count": [female_count, male_count, other_count, user_count],
    "Percent of Players": [female_percent, male_percent, other_percent, "100"]
})

# pd.options.display.float_format = '{:.2%}'.format FIGURE THIS OUT LATER, WAS MESSING UP STUFF LATER

#purchase_analysis_df = pd.DataFrame(data = [[user_count, item_count, avg_purchase, total_purchases, total_rev]], columns=["Unique Users", "Unique Items", "Average Price", "Total Purchases", "Total Revenue"])
gender_demo_df

Unnamed: 0,Gender,Player Count,Percent of Players
0,Female,81,14.0625
1,Male,484,84.0278
2,Other / Not Disclosed,11,1.90972
3,Total Players,576,100.0



## Purchasing Analysis (Gender)

* Run basic calculations to obtain purchase count, avg. purchase price, avg. purchase total per person etc. by gender




* Create a summary data frame to hold the results


* Optional: give the displayed data cleaner formatting


* Display the summary data frame

In [15]:
## NEED NEW DF THAT CAN FILTER BY PERSON -- IS THIS SLICING?
item_count = len(purchase_df["Item ID"].unique())
# average purchase price
avg_purchase = purchase_df.Price.mean()
#.. avg_purchase['Average Price'] = avg_purchase['Average Price'].map("${:,.2}".format)
# total number of purchases
total_purchases = len(purchase_df['Purchase ID'])
# total revenue
total_rev = purchase_df.Price.sum()

In [16]:
# purchase count by gender
female_purchases = female_users_df["PurchaseID"].count()
male_purchases = male_users_df["PurchaseID"].count()
other_purchases = other_users_df["PurchaseID"].count()
print(female_purchases, male_purchases, other_purchases)

113 652 15


In [17]:
# average purchase price by gender
avg_price_female = female_users_df.Price.mean()
avg_price_male = male_users_df.Price.mean()
avg_price_other = other_users_df.Price.mean()
print(avg_price_female, avg_price_male, avg_price_other)

3.203008849557519 3.0178527607361953 3.3460000000000005


In [18]:
# total purchase value by gender
total_rev = purchase_df.Price.sum()
total_purchase_female = female_users_df.Price.sum()
total_purchase_male = male_users_df.Price.sum()
total_purchase_other = other_users_df.Price.sum()
print(total_purchase_female, total_purchase_male, total_purchase_other)

361.94 1967.64 50.19


In [21]:
# average purchase total per PERSON per GENDER
avg_purchase = purchase_df.Price.mean()
avg_total_females = total_purchase_female/female_count
avg_total_males = total_purchase_male/male_count
avg_total_other = total_purchase_other/other_count
print(avg_total_females, avg_total_males, avg_total_other)


4.468395061728395 4.065371900826446 4.5627272727272725


In [23]:
## PURCHASING ANALYSIS (Gender)

gender_purchasing_df = pd.DataFrame({
    "Gender": ["Female", "Male", "Other / Not Disclosed"],
    "Player Count": [female_count, male_count, other_count],
    "Percent of Players": [female_percent, male_percent, other_percent],
    "Purchase Count": [female_purchases, male_purchases, other_purchases],
    "Average Purchase Price": [avg_price_female, avg_price_male, avg_price_other],
    "Total Purchase Value": [total_purchase_female, total_purchase_male, total_purchase_other],
    "Average Purchase Total per Person by Gender": [avg_total_females, avg_total_males, avg_total_other]
})
gender_purchasing_df

Unnamed: 0,Gender,Player Count,Percent of Players,Purchase Count,Average Purchase Price,Total Purchase Value,Average Purchase Total per Person by Gender
0,Female,81,14.0625,113,3.203009,361.94,4.468395
1,Male,484,84.027778,652,3.017853,1967.64,4.065372
2,Other / Not Disclosed,11,1.909722,15,3.346,50.19,4.562727


## Age Demographics

* Establish bins for ages


* Categorize the existing players using the age bins. Hint: use pd.cut()


* Calculate the numbers and percentages by age group


* Create a summary data frame to hold the results


* Optional: round the percentage column to two decimal points


* Display Age Demographics Table


In [34]:
# esatblish bins for ages

bins = [0, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46]
age_labels = ["10 and under", "11-14", "15-18", "19-22", "23-26", "27-30", "31-34", "35-38", "39-42", "43-46"]

In [35]:
# use pd.cut() to categorize players using age bins
pd.cut(update_purchase_df["Age"], bins, labels=age_labels).head()

age_df = update_purchase_df
age_df["Age Group"] = pd.cut(age_df["Age"], bins, labels=age_labels)
age_df

Unnamed: 0,PurchaseID,SN,Age,Gender,ItemID,ItemName,Price,Age Group
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53,19-22
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56,39-42
2,2,Ithergue48,24,Male,92,Final Critic,4.88,23-26
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27,23-26
4,4,Iskosia90,23,Male,131,Fury,1.44,23-26
...,...,...,...,...,...,...,...,...
775,775,Aethedru70,21,Female,60,Wolf,3.54,19-22
776,776,Iral74,21,Male,164,Exiled Doomblade,1.63,19-22
777,777,Yathecal72,20,Male,67,"Celeste, Incarnation of the Corrupted",3.46,19-22
778,778,Sisur91,7,Male,92,Final Critic,4.19,10 and under


In [43]:
# calculate the numbers & percentages by age group: purchase count, avg purchase price, total purchase value, avg purchase total per person by age
def analyze_purchases(your_frame):
    purchase_count = len(your_frame['PurchaseID'])
    avg_price = your_frame.Price.mean()
    total_value = your_frame.Price.sum()
    users_in_grp = len(your_frame['SN'].unique())
    avg_total_grp = total_value / users_in_grp

    print(purchase_count, avg_price, total_value, users_in_grp, avg_total_grp)

In [44]:
analyze_purchases(age_df)

780 3.050987179487176 2379.77 576 4.131545138888889


In [None]:
## NEED NEW DF THAT CAN FILTER BY PERSON -- IS THIS SLICING?
item_count = len(purchase_df["Item ID"].unique())
# average purchase price
avg_purchase = purchase_df.Price.mean()
#.. avg_purchase['Average Price'] = avg_purchase['Average Price'].map("${:,.2}".format)
# total number of purchases
total_purchases = len(purchase_df['Purchase ID'])
# total revenue
total_rev = purchase_df.Price.sum()

In [None]:
# create a summary df to hold the results; opt: round percentage to two decimals

In [None]:
# display Age Demographics Table

## Purchasing Analysis (Age)

* Bin the purchase_data data frame by age


* Run basic calculations to obtain purchase count, avg. purchase price, avg. purchase total per person etc. in the table below


* Create a summary data frame to hold the results


* Optional: give the displayed data cleaner formatting


* Display the summary data frame

## Top Spenders

* Run basic calculations to obtain the results in the table below


* Create a summary data frame to hold the results


* Sort the total purchase value column in descending order


* Optional: give the displayed data cleaner formatting


* Display a preview of the summary data frame



## Most Popular Items

* Retrieve the Item ID, Item Name, and Item Price columns


* Group by Item ID and Item Name. Perform calculations to obtain purchase count, average item price, and total purchase value


* Create a summary data frame to hold the results


* Sort the purchase count column in descending order


* Optional: give the displayed data cleaner formatting


* Display a preview of the summary data frame



## Most Profitable Items

* Sort the above table by total purchase value in descending order


* Optional: give the displayed data cleaner formatting


* Display a preview of the data frame

