### Note
* Instructions have been included for each segment. You do not have to follow them exactly, but they are included to help you think through the steps.

In [None]:
# Julie Baker
# Heroes of Pymoli
# June 2021

In [58]:
# Dependencies and Setup --> need to make sure it's on my 3.8.5 to get to work right now
import pandas as pd
import os
import csv

# File to Load (Remember to Change These)
file_to_load = os.path.join(os.getcwd(), "Resources", "purchase_data.csv")

# Read Purchasing File and store into Pandas data frame
purchase_df = pd.read_csv(file_to_load)

In [59]:
# just looking at data
purchase_df.head(5)

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44


## Player Count

* Display the total number of players


In [60]:
# SN unique
user_count = len(purchase_df['SN'].unique())
print(user_count)

576


## Purchasing Analysis (Total)

* Run basic calculations to obtain number of unique items, average price, etc.


* Create a summary data frame to hold the results


* Optional: give the displayed data cleaner formatting


* Display the summary data frame


In [61]:
## WORK ON FORMATTING IF TIME LATER ##
# ===================================
# unique items
item_count = len(purchase_df["Item ID"].unique())
# average purchase price
avg_purchase = purchase_df.Price.mean()
#.. avg_purchase['Average Price'] = avg_purchase['Average Price'].map("${:,.2}".format)
# total number of purchases
total_purchases = len(purchase_df['Purchase ID'])
# total revenue
total_rev = purchase_df.Price.sum()

purchase_analysis_df = pd.DataFrame(data = [[user_count, item_count, avg_purchase, total_purchases, total_rev]], columns=["Unique Users", "Unique Items", "Average Price", "Total Purchases", "Total Revenue"])

### pd.options.display.float_format = '${:, .2f}'.format

purchase_analysis_df

Unnamed: 0,Unique Users,Unique Items,Average Price,Total Purchases,Total Revenue
0,576,179,3.050987,780,2379.77


In [62]:
# Renaming columns to remove spaces 
update_purchase_df = purchase_df.rename(columns={'Purchase ID':'PurchaseID', 'Item ID' : 'ItemID', 'Item Name' : 'ItemName'})

update_purchase_df

Unnamed: 0,PurchaseID,SN,Age,Gender,ItemID,ItemName,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44
...,...,...,...,...,...,...,...
775,775,Aethedru70,21,Female,60,Wolf,3.54
776,776,Iral74,21,Male,164,Exiled Doomblade,1.63
777,777,Yathecal72,20,Male,67,"Celeste, Incarnation of the Corrupted",3.46
778,778,Sisur91,7,Male,92,Final Critic,4.19


## Gender Demographics

* Percentage and Count of Male Players


* Percentage and Count of Female Players


* Percentage and Count of Other / Non-Disclosed




In [63]:
# gender demographics; based on code example from stackoverflow.com
gender_fixed = update_purchase_df.groupby(['SN','Gender'])['Gender'].count()

## THERE IS A PROBLEM WITH THIS -- it's based on all lines of data (thus summing to 780), but should just be subset based on user id (which is 576) to remove duplicates--discrepancy can be seen between these values and those generated from the newly created dataframes below
gender_fixed

SN             Gender
Adairialis76   Male      1
Adastirin33    Female    1
Aeda94         Male      1
Aela59         Male      1
Aelaria33      Male      1
                        ..
Yathecal82     Female    3
Yathedeu43     Male      2
Yoishirrala98  Female    1
Zhisrisu83     Male      2
Zontibe81      Male      3
Name: Gender, Length: 576, dtype: int64

In [64]:
# Creating dataframes for each gender group because using the original dataframe doesn't take duplicate users into account

# Female users
female_users_df = update_purchase_df.loc[update_purchase_df["Gender"] == "Female", :]
# Male users
male_users_df = update_purchase_df.loc[update_purchase_df["Gender"] == "Male", :]
# Other users
other_users_df = update_purchase_df.loc[update_purchase_df["Gender"] == "Other / Non-Disclosed", :]

In [65]:
# female count
female_count = len(female_users_df.SN.unique())
# female percentage
female_percent = (female_count / user_count)*100

In [66]:
# male count & percentage
male_count = len(male_users_df.SN.unique())
male_percent = male_count / user_count*100

In [67]:
# other count & percentage
other_count = len(other_users_df.SN.unique())
other_percent = (other_count / user_count)*100

In [68]:
# Gender Demographics Output

gender_demo_df = pd.DataFrame({
    "Gender": ["Female", "Male", "Other / Not Disclosed", "Total Players"],
    "Player Count": [female_count, male_count, other_count, user_count],
    "Percent of Players": [female_percent, male_percent, other_percent, 100]
})

# gender_demo_df["Percent of Players"] = gender_demo_df["Percent of Players"].map('{:.2%f}')
# pd.options.display.float_format = '{:.2%}'.format FIGURE THIS OUT LATER, WAS MESSING UP STUFF LATER

#purchase_analysis_df = pd.DataFrame(data = [[user_count, item_count, avg_purchase, total_purchases, total_rev]], columns=["Unique Users", "Unique Items", "Average Price", "Total Purchases", "Total Revenue"])
gender_demo_df

Unnamed: 0,Gender,Player Count,Percent of Players
0,Female,81,14.0625
1,Male,484,84.027778
2,Other / Not Disclosed,11,1.909722
3,Total Players,576,100.0



## Purchasing Analysis (Gender)

* Run basic calculations to obtain purchase count, avg. purchase price, avg. purchase total per person etc. by gender




* Create a summary data frame to hold the results


* Optional: give the displayed data cleaner formatting


* Display the summary data frame

In [69]:
# purchase count by gender
female_purchases = female_users_df["PurchaseID"].count()
male_purchases = male_users_df["PurchaseID"].count()
other_purchases = other_users_df["PurchaseID"].count()

In [70]:
# average purchase price by gender
avg_price_female = female_users_df.Price.mean()
avg_price_male = male_users_df.Price.mean()
avg_price_other = other_users_df.Price.mean()

In [71]:
# total purchase value by gender
total_rev = purchase_df.Price.sum()
total_purchase_female = female_users_df.Price.sum()
total_purchase_male = male_users_df.Price.sum()
total_purchase_other = other_users_df.Price.sum()

In [72]:
# average purchase total per PERSON per GENDER
avg_purchase = purchase_df.Price.mean()
avg_total_females = total_purchase_female/female_count
avg_total_males = total_purchase_male/male_count
avg_total_other = total_purchase_other/other_count


In [73]:
## PURCHASING ANALYSIS (Gender)

gender_purchasing_df = pd.DataFrame({
    "Gender": ["Female", "Male", "Other / Not Disclosed"],
    "Player Count": [female_count, male_count, other_count],
    "Percent of Players": [female_percent, male_percent, other_percent],
    "Purchase Count": [female_purchases, male_purchases, other_purchases],
    "Average Purchase Price": [avg_price_female, avg_price_male, avg_price_other],
    "Total Purchase Value": [total_purchase_female, total_purchase_male, total_purchase_other],
    "Average Purchase Total per Person by Gender": [avg_total_females, avg_total_males, avg_total_other]
})
gender_purchasing_df

Unnamed: 0,Gender,Player Count,Percent of Players,Purchase Count,Average Purchase Price,Total Purchase Value,Average Purchase Total per Person by Gender
0,Female,81,14.0625,113,3.203009,361.94,4.468395
1,Male,484,84.027778,652,3.017853,1967.64,4.065372
2,Other / Not Disclosed,11,1.909722,15,3.346,50.19,4.562727


## Age Demographics

* Establish bins for ages


* Categorize the existing players using the age bins. Hint: use pd.cut()


* Calculate the numbers and percentages by age group


* Create a summary data frame to hold the results


* Optional: round the percentage column to two decimal points


* Display Age Demographics Table


In [74]:
# esatblish bins for ages
bins = [0, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46]
age_labels = ["10 and under", "11-14", "15-18", "19-22", "23-26", "27-30", "31-34", "35-38", "39-42", "43-46"]

In [75]:
# use pd.cut() to categorize players using age bins
age_df = update_purchase_df
age_df["Age Group"] = pd.cut(age_df["Age"], bins, labels=age_labels)
age_df

Unnamed: 0,PurchaseID,SN,Age,Gender,ItemID,ItemName,Price,Age Group
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53,19-22
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56,39-42
2,2,Ithergue48,24,Male,92,Final Critic,4.88,23-26
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27,23-26
4,4,Iskosia90,23,Male,131,Fury,1.44,23-26
...,...,...,...,...,...,...,...,...
775,775,Aethedru70,21,Female,60,Wolf,3.54,19-22
776,776,Iral74,21,Male,164,Exiled Doomblade,1.63,19-22
777,777,Yathecal72,20,Male,67,"Celeste, Incarnation of the Corrupted",3.46,19-22
778,778,Sisur91,7,Male,92,Final Critic,4.19,10 and under


In [76]:
# Using the groupby function on this data per the rubric--but again, because it doesn't account for duplicate purchases by a user it isn't accurate, so the values in my tables will come from other calculations

age_duplicates = age_df.groupby(['Age Group'])['SN'].nunique()
age_duplicates

Age Group
10 and under     24
11-14            15
15-18            90
19-22           178
23-26           151
27-30            48
31-34            27
35-38            25
39-42            14
43-46             4
Name: SN, dtype: int64

In [77]:
# Creating dataframes for each age group

group01_df = age_df.loc[update_purchase_df["Age Group"] == "10 and under", :]
group02_df = age_df.loc[update_purchase_df["Age Group"] == "11-14", :]
group03_df = age_df.loc[update_purchase_df["Age Group"] == "15-18", :]
group04_df = age_df.loc[update_purchase_df["Age Group"] == "19-22", :]
group05_df = age_df.loc[update_purchase_df["Age Group"] == "23-26", :]
group06_df = age_df.loc[update_purchase_df["Age Group"] == "27-30", :]
group07_df = age_df.loc[update_purchase_df["Age Group"] == "31-34", :]
group08_df = age_df.loc[update_purchase_df["Age Group"] == "35-38", :]
group09_df = age_df.loc[update_purchase_df["Age Group"] == "39-42", :]
group10_df = age_df.loc[update_purchase_df["Age Group"] == "43-46", :]

In [78]:
# age group counts
grp01_users = len(group01_df['SN'].unique())
grp02_users = len(group02_df['SN'].unique())
grp03_users = len(group03_df['SN'].unique())
grp04_users = len(group04_df['SN'].unique())
grp05_users = len(group05_df['SN'].unique())
grp06_users = len(group06_df['SN'].unique())
grp07_users = len(group07_df['SN'].unique())
grp08_users = len(group08_df['SN'].unique())
grp09_users = len(group09_df['SN'].unique())
grp10_users = len(group10_df['SN'].unique())

age_counts = [grp01_users, grp02_users, grp03_users, grp04_users, grp05_users, grp06_users, grp07_users, grp08_users, grp09_users, grp10_users]

In [79]:
# age group percents
grp01_percent = grp01_users / user_count *100
grp02_percent = grp02_users / user_count *100
grp03_percent = grp03_users / user_count *100
grp04_percent = grp04_users / user_count *100
grp05_percent = grp05_users / user_count *100
grp06_percent = grp06_users / user_count *100
grp07_percent = grp07_users / user_count *100
grp08_percent = grp08_users / user_count *100
grp09_percent = grp09_users / user_count *100
grp10_percent = grp10_users / user_count *100

age_percents = [grp01_percent, grp02_percent, grp03_percent, grp04_percent, grp05_percent, grp06_percent, grp07_percent, grp08_percent, grp09_percent, grp10_percent]

In [80]:
# create a summary df to hold the results; opt: round percentage to two decimals
age_demo_df = pd.DataFrame({
    "Age": age_labels,
    "Player Count": age_counts,
    "Percent of Players": age_percents
})

In [81]:
# display Age Demographics Table
age_demo_df

Unnamed: 0,Age,Player Count,Percent of Players
0,10 and under,24,4.166667
1,11-14,15,2.604167
2,15-18,90,15.625
3,19-22,178,30.902778
4,23-26,151,26.215278
5,27-30,48,8.333333
6,31-34,27,4.6875
7,35-38,25,4.340278
8,39-42,14,2.430556
9,43-46,4,0.694444


## Purchasing Analysis (Age)

* Bin the purchase_data data frame by age


* Run basic calculations to obtain purchase count, avg. purchase price, avg. purchase total per person etc. in the table below


* Create a summary data frame to hold the results


* Optional: give the displayed data cleaner formatting


* Display the summary data frame

In [82]:
# calculate the numbers & percentages by age group: purchase count, avg purchase price, total purchase value, avg purchase total per person by age
###  NEED TO FIGURE OUT HOW TO SAVE THIS IN A WAY THAT OUTPUT CAN BE CALLED -OUTSIDE- THE FUNCTION SO CAN CONCATENATE ALL RESULTS

def analyze_purchases(your_frame):
    your_group = your_frame["SN"]
    purchase_count = len(your_frame['PurchaseID'])
    avg_price = your_frame.Price.mean()
    total_value = your_frame.Price.sum()
    users_in_grp = your_frame['SN'].nunique()
    avg_total_grp = total_value / users_in_grp
    your_output_df = pd.DataFrame({
        "Age Group": [your_group],
        "Users in Group": [users_in_grp],
        "Purchase Count": [purchase_count],
        "Average Purchase Price": [avg_price],
        "Total Purchase Value": [total_value],
        "Average Purchase Total per Person by Age": [avg_total_grp] 
        })
    #print(purchase_count, avg_price, total_value, users_in_grp, avg_total_grp)
    return your_output_df
    # export as CSV to read in later? TRY AGAIN ON THIS LATER
    # your_output_df.to_csv(os.path.join(os.getcwd(),"HeroesOfPymoli", "Output","fileOut.csv"))

In [83]:
### Why does my output_df not look like a df?
### Why am I getting multiple lines on this? 
grp_10_results_df = analyze_purchases(group10_df)
grp_10_results_df

Unnamed: 0,Age Group,Users in Group,Purchase Count,Average Purchase Price,Total Purchase Value,Average Purchase Total per Person by Age
0,248 Isursuir31 674 Aeral68 728 ...,4,4,2.765,11.06,2.765


In [84]:
# Compile all summary data
print(grp_10_results_df)

                                           Age Group  Users in Group  \
0  248      Isursuir31
674         Aeral68
728   ...               4   

   Purchase Count  Average Purchase Price  Total Purchase Value  \
0               4                   2.765                 11.06   

   Average Purchase Total per Person by Age  
0                                     2.765  


In [85]:
# display Purchasing Analysis (Age)

## Top Spenders

* Run basic calculations to obtain the results in the table below


* Create a summary data frame to hold the results


* Sort the total purchase value column in descending order


* Optional: give the displayed data cleaner formatting


* Display a preview of the summary data frame



In [94]:
update_purchase_df.sort_values(['SN','Price'])

Unnamed: 0,PurchaseID,SN,Age,Gender,ItemID,ItemName,Price,Age Group
467,467,Adairialis76,16,Male,123,Twilight's Carver,2.28,15-18
142,142,Adastirin33,35,Female,175,Woeful Adamantite Claymore,4.48,35-38
388,388,Aeda94,17,Male,128,"Blazeguard, Reach of Eternity",4.91,15-18
28,28,Aela59,21,Male,119,"Stormbringer, Dark Blade of Ending Misery",4.32,19-22
630,630,Aelaria33,23,Male,171,Scalpel,1.79,23-26
...,...,...,...,...,...,...,...,...
141,141,Zhisrisu83,10,Male,60,Wolf,3.54,10 and under
54,54,Zhisrisu83,10,Male,25,Hero Cane,4.35,10 and under
17,17,Zontibe81,21,Male,161,Devine,1.76,19-22
560,560,Zontibe81,21,Male,2,Verdict,2.48,19-22


In [95]:
update_purchase_df.sort_values(['SN','Price'], ascending=[False, True])

Unnamed: 0,PurchaseID,SN,Age,Gender,ItemID,ItemName,Price,Age Group
17,17,Zontibe81,21,Male,161,Devine,1.76,19-22
560,560,Zontibe81,21,Male,2,Verdict,2.48,19-22
442,442,Zontibe81,21,Male,84,Arcane Gem,3.79,19-22
141,141,Zhisrisu83,10,Male,60,Wolf,3.54,10 and under
54,54,Zhisrisu83,10,Male,25,Hero Cane,4.35,10 and under
...,...,...,...,...,...,...,...,...
630,630,Aelaria33,23,Male,171,Scalpel,1.79,23-26
28,28,Aela59,21,Male,119,"Stormbringer, Dark Blade of Ending Misery",4.32,19-22
388,388,Aeda94,17,Male,128,"Blazeguard, Reach of Eternity",4.91,15-18
142,142,Adastirin33,35,Female,175,Woeful Adamantite Claymore,4.48,35-38


In [117]:
# Find Top 5 Spenders by total purchase value
# 1. need to determine value each person spent; 2. need to create list of top 5 spenders; 3. need to pull number of transactions for those users; 4. need to calculate average price per transaction

top_spenders_df = update_purchase_df.groupby('SN')
test_spenders_df = update_purchase_df.sort_values(['SN','Price']).Price.sum()

top_spenders_df.groups


{'Adairialis76': [467], 'Adastirin33': [142], 'Aeda94': [388], 'Aela59': [28], 'Aelaria33': [630], 'Aelastirin39': [218, 766], 'Aelidru27': [705], 'Aelin32': [52, 87, 584], 'Aelly27': [43, 428], 'Aellynun67': [286], 'Aellyria80': [746], 'Aelollo59': [203, 431], 'Aenarap34': [183], 'Aeral43': [263], 'Aeral68': [674], 'Aeral97': [167], 'Aeralria27': [32], 'Aeralstical35': [583], 'Aeri84': [303], 'Aerillorin70': [381], 'Aerithllora36': [274, 578], 'Aerithnucal56': [324], 'Aerithnuphos61': [517], 'Aerithriaphos45': [40], 'Aerithriaphos46': [236], 'Aesri53': [541], 'Aesty53': [124, 253], 'Aestysu37': [309, 416], 'Aesur96': [468], 'Aesurstilis64': [405], 'Aethedru70': [775], 'Aidai53': [219], 'Aidai61': [282], 'Aidai73': [720], 'Aidaillodeu39': [42, 146], 'Aidain51': [516], 'Aidaira26': [730], 'Aiduecal76': [97, 670], 'Aiduesu86': [429], 'Aillyriadru65': [523], 'Aillyrin83': [155], 'Aina42': [83, 363, 373], 'Aina43': [94], 'Airi27': [637], 'Aisur51': [184], 'Aisurdru79': [440, 618], 'Aisurri

In [118]:
top_spenders_df.head()

Unnamed: 0,PurchaseID,SN,Age,Gender,ItemID,ItemName,Price,Age Group
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53,19-22
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56,39-42
2,2,Ithergue48,24,Male,92,Final Critic,4.88,23-26
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27,23-26
4,4,Iskosia90,23,Male,131,Fury,1.44,23-26
...,...,...,...,...,...,...,...,...
775,775,Aethedru70,21,Female,60,Wolf,3.54,19-22
776,776,Iral74,21,Male,164,Exiled Doomblade,1.63,19-22
777,777,Yathecal72,20,Male,67,"Celeste, Incarnation of the Corrupted",3.46,19-22
778,778,Sisur91,7,Male,92,Final Critic,4.19,10 and under


In [123]:
spend_compare_df = top_spenders_df.sum()
spend_compare_df.sort_values(['Price'], ascending=False).head(10)

Unnamed: 0_level_0,PurchaseID,Age,ItemID,Price
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Lisosia93,1630,125,442,18.96
Idastidru52,1999,96,527,15.45
Chamjask73,1306,66,339,13.83
Iral74,2285,84,518,13.62
Iskadarya95,713,60,321,13.1
Ilarin91,1474,66,243,12.7
Ialallo29,1097,45,323,11.84
Tyidaim51,1257,48,153,11.83
Lassilsala30,1390,63,288,11.51
Chadolyla44,1000,60,306,11.46


In [88]:
#top_spenders_df.size()

SN
Adairialis76     1
Adastirin33      1
Aeda94           1
Aela59           1
Aelaria33        1
                ..
Yathecal82       3
Yathedeu43       2
Yoishirrala98    1
Zhisrisu83       2
Zontibe81        3
Length: 576, dtype: int64

In [96]:
top_spenders_df.sort_values('ItemID')

AttributeError: 'DataFrameGroupBy' object has no attribute 'sort_values'

In [46]:
# 1. determine value each person spent
Top_5_price_df = age_df.groupby(['SN'])['Price'].sum(['Price'])
Top_5_price_df.head()
#test_this = Top_5_df.groupby('Price')

SN
Adairialis76    2.28
Adastirin33     4.48
Aeda94          4.91
Aela59          4.32
Aelaria33       1.79
Name: Price, dtype: float64

In [None]:
# 2. find top 5 spenders
#..does_this_work_df = Top_5_price_df.loc[Top_5_price_df["Price"] > Top_5_price['Price'].mean(), :]

# Am I going to have to do a loop to search for top spender, pull their data, remove them from the df & then rerun 4 times to get the top 5?
##.top_5_SN

In [None]:
### If I can get this to work, I can do a merge between this & the earlier subset to pull from instead of having to cobble separately.

# 3. pull number of transactions for top 5
top_5_trans_df = age_df.groupby(['SN'])['ItemName'].count()
top_5_trans_df

In [None]:
# 4. calculate avg price per transaction

In [None]:
# Table should have: SN, purchase count, average purchase price, total purchase value

## Most Popular Items

* Retrieve the Item ID, Item Name, and Item Price columns


* Group by Item ID and Item Name. Perform calculations to obtain purchase count, average item price, and total purchase value


* Create a summary data frame to hold the results


* Sort the purchase count column in descending order


* Optional: give the displayed data cleaner formatting


* Display a preview of the summary data frame



In [None]:
# create df for items
item_df = update_purchase_df

## Most Profitable Items

* Sort the above table by total purchase value in descending order


* Optional: give the displayed data cleaner formatting


* Display a preview of the data frame

