### Note
* Instructions have been included for each segment. You do not have to follow them exactly, but they are included to help you think through the steps.

In [None]:
# Julie Baker
# Heroes of Pymoli
# June 2021

In [1]:
# Dependencies and Setup --> need to make sure it's on my 3.8.5 to get to work right now
import pandas as pd
import os
import csv

# Location of the purchase data (remember to change this if the file moves)
resources_dir = os.path.join(os.getcwd(), "Resources")
file_to_load = os.path.join(resources_dir, "purchase_data.csv")

# Load the purchase records into a pandas DataFrame
purchase_df = pd.read_csv(file_to_load)

In [2]:
# quick preview of the first five purchase rows (head() defaults to 5)
purchase_df.head()

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44


## Player Count

* Display the total number of players


In [7]:
# Each screen name (SN) identifies one player, so count the distinct values
distinct_players = purchase_df['SN'].unique()
user_count = len(distinct_players)
print(f"Player count = {user_count}")

Player count = 576


## Purchasing Analysis (Total)

* Run basic calculations to obtain number of unique items, average price, etc.


* Create a summary data frame to hold the results


* Optional: give the displayed data cleaner formatting


* Display the summary data frame


In [8]:
# Overall purchasing totals
# TODO: add currency formatting to the price columns if time allows
# -----------------------------------------------------------------
# number of distinct items sold
item_count = purchase_df["Item ID"].nunique()
# mean price over all purchase rows
avg_purchase = purchase_df["Price"].mean()
# number of purchase rows
total_purchases = len(purchase_df["Purchase ID"])
# revenue summed over every purchase
total_rev = purchase_df["Price"].sum()

# one-row summary table; user_count comes from the Player Count cell
summary_columns = ["Unique Users", "Unique Items", "Average Price", "Total Purchases", "Total Revenue"]
summary_row = [user_count, item_count, avg_purchase, total_purchases, total_rev]
purchase_analysis_df = pd.DataFrame(data=[summary_row], columns=summary_columns)

purchase_analysis_df

Unnamed: 0,Unique Users,Unique Items,Average Price,Total Purchases,Total Revenue
0,576,179,3.050987,780,2379.77


In [9]:
# Column names without spaces, so they are easier to reference later
spaceless_names = {
    'Purchase ID': 'PurchaseID',
    'Item ID': 'ItemID',
    'Item Name': 'ItemName',
}
update_purchase_df = purchase_df.rename(columns=spaceless_names)

## Gender Demographics

* Percentage and Count of Male Players


* Percentage and Count of Female Players


* Percentage and Count of Other / Non-Disclosed




In [None]:
# creating dataframes to compile gender information

In [10]:
# number of distinct players (unique SN values) within each gender
gender_fixed_df = update_purchase_df.groupby(by=['Gender'])['SN'].nunique()


Gender
Female                    81
Male                     484
Other / Non-Disclosed     11
Name: SN, dtype: int64

In [11]:
# number of purchase rows within each gender (repeat buyers are counted every time)
gender_purch_df = update_purchase_df.groupby(by=['Gender'])['SN'].count()

Gender
Female                   113
Male                     652
Other / Non-Disclosed     15
Name: SN, dtype: int64

In [24]:
# Combine the per-gender player counts with the per-gender purchase counts.
# Both inputs are Series named 'SN', so the merged columns come out as 'SN_x' / 'SN_y'.
gender_compiling_df = pd.merge(
    left=gender_fixed_df,
    right=gender_purch_df,
    how='inner',
    on='Gender',
)

Unnamed: 0_level_0,SN_x,SN_y
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,81,113
Male,484,652
Other / Non-Disclosed,11,15


In [13]:
# give the merged columns meaningful names before more columns are merged in
merged_count_names = {
    'SN_x': 'Player Count',
    'SN_y': 'Purchase Count',
}
gender_compiling_df = gender_compiling_df.rename(columns=merged_count_names)

Unnamed: 0_level_0,Player Count,Purchase Count
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,81,113
Male,484,652
Other / Non-Disclosed,11,15


In [14]:
# share of all players that each gender represents, as a percentage
gender_percent_df = gender_compiling_df['Player Count'] / user_count * 100
# store the same Series on the compiled table as a new column
gender_compiling_df['Percentage of Players'] = gender_percent_df

Gender
Female                   14.062500
Male                     84.027778
Other / Non-Disclosed     1.909722
Name: Player Count, dtype: float64

In [15]:
# ** GENDER DEMOGRAPHICS TABLE
# The count Series is named 'SN' and the percentage Series is named 'Player Count'
# (it was derived from that column), so both are renamed right after the merge.
demographic_names = {'SN': 'Player Count', 'Player Count': 'Percent of Players'}
gender_demographics_df = (
    pd.merge(gender_fixed_df, gender_percent_df, on='Gender')
    .rename(columns=demographic_names)
)
gender_demographics_df

Unnamed: 0_level_0,Player Count,Percent of Players
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,81,14.0625
Male,484,84.027778
Other / Non-Disclosed,11,1.909722


In [None]:
# Next... building Purchasing Analysis (Gender)

In [16]:
# mean purchase price within each gender
gender_price_df = update_purchase_df.groupby(by=['Gender'])['Price'].mean()

Gender
Female                   3.203009
Male                     3.017853
Other / Non-Disclosed    3.346000
Name: Price, dtype: float64

In [17]:
# summed purchase value within each gender
gender_value_df = update_purchase_df.groupby(by=['Gender'])['Price'].sum()

Gender
Female                    361.94
Male                     1967.64
Other / Non-Disclosed      50.19
Name: Price, dtype: float64

In [18]:
# add the average purchase price to the compiled gender table
gender_compile_df2 = pd.merge(
    left=gender_compiling_df,
    right=gender_price_df,
    on='Gender',
)
gender_compile_df2

Unnamed: 0_level_0,Player Count,Purchase Count,Percentage of Players,Price
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,81,113,14.0625,3.203009
Male,484,652,84.027778,3.017853
Other / Non-Disclosed,11,15,1.909722,3.346


In [19]:
# add the total purchase value; both price columns are named 'Price',
# so the merge suffixes them as 'Price_x' (average) and 'Price_y' (total)
gender_compile_df3 = pd.merge(
    left=gender_compile_df2,
    right=gender_value_df,
    on='Gender',
)
gender_compile_df3

Unnamed: 0_level_0,Player Count,Purchase Count,Percentage of Players,Price_x,Price_y
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,81,113,14.0625,3.203009,361.94
Male,484,652,84.027778,3.017853,1967.64
Other / Non-Disclosed,11,15,1.909722,3.346,50.19


In [20]:
# replace the merge suffixes with descriptive column names
price_column_names = {
    'Price_x': 'Average Purchase Price',
    'Price_y': 'Total Purchase Value',
}
gender_compile_df3 = gender_compile_df3.rename(columns=price_column_names)
gender_compile_df3

Unnamed: 0_level_0,Player Count,Purchase Count,Percentage of Players,Average Purchase Price,Total Purchase Value
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,81,113,14.0625,3.203009,361.94
Male,484,652,84.027778,3.017853,1967.64
Other / Non-Disclosed,11,15,1.909722,3.346,50.19


In [21]:
# average total spent per player = total purchase value / number of unique players
gender_compile_df3['Average Purchase Total per Person'] = gender_compile_df3['Total Purchase Value'].div(
    gender_compile_df3['Player Count']
)
gender_compile_df3

Unnamed: 0_level_0,Player Count,Purchase Count,Percentage of Players,Average Purchase Price,Total Purchase Value,Average Purchase Total per Person
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,81,113,14.0625,3.203009,361.94,4.468395
Male,484,652,84.027778,3.017853,1967.64,4.065372
Other / Non-Disclosed,11,15,1.909722,3.346,50.19,4.562727


In [22]:
# ** PURCHASING ANALYSIS (GENDER) **
# the demographic columns belong to the other table, so leave them out here
demographic_only_columns = ['Player Count', 'Percentage of Players']
gender_results_df = gender_compile_df3.drop(columns=demographic_only_columns)
gender_results_df

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value,Average Purchase Total per Person
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,113,3.203009,361.94,4.468395
Male,652,3.017853,1967.64,4.065372
Other / Non-Disclosed,15,3.346,50.19,4.562727



## Purchasing Analysis (Gender)

* Run basic calculations to obtain purchase count, avg. purchase price, avg. purchase total per person etc. by gender




* Create a summary data frame to hold the results


* Optional: give the displayed data cleaner formatting


* Display the summary data frame

In [None]:
# purchase count by gender
# The per-gender frames this cell used to read (female_users_df, ...) are not created
# in any cell of this notebook, so derive the counts directly from the cleaned
# purchase table instead.
purchases_by_gender = update_purchase_df.groupby("Gender")["PurchaseID"].count()
female_purchases = purchases_by_gender["Female"]
male_purchases = purchases_by_gender["Male"]
other_purchases = purchases_by_gender["Other / Non-Disclosed"]

In [None]:
# average purchase price by gender
# Computed from the cleaned purchase table; the per-gender frames previously
# referenced here are not defined in any cell of this notebook.
avg_price_by_gender = update_purchase_df.groupby("Gender")["Price"].mean()
avg_price_female = avg_price_by_gender["Female"]
avg_price_male = avg_price_by_gender["Male"]
avg_price_other = avg_price_by_gender["Other / Non-Disclosed"]

In [None]:
# total purchase value by gender
total_rev = purchase_df.Price.sum()
# Summed from the cleaned purchase table; the per-gender frames previously
# referenced here are not defined in any cell of this notebook.
value_by_gender = update_purchase_df.groupby("Gender")["Price"].sum()
total_purchase_female = value_by_gender["Female"]
total_purchase_male = value_by_gender["Male"]
total_purchase_other = value_by_gender["Other / Non-Disclosed"]

In [None]:
# average purchase total per PERSON per GENDER
avg_purchase = purchase_df.Price.mean()
# Total spent by each gender divided by its number of unique players (distinct SN).
# Both pieces are computed here because the *_count variables this cell used to
# read are not defined in any cell of this notebook.
gender_groups = update_purchase_df.groupby("Gender")
avg_total_by_gender = gender_groups["Price"].sum() / gender_groups["SN"].nunique()
avg_total_females = avg_total_by_gender["Female"]
avg_total_males = avg_total_by_gender["Male"]
avg_total_other = avg_total_by_gender["Other / Non-Disclosed"]



In [None]:
## PURCHASING ANALYSIS (Gender)

# Build every column from one groupby on the cleaned purchase table. The scalar
# inputs this cell used to read (female_count, female_percent, ...) are not defined
# in any cell of this notebook, and the hand-typed label "Other / Not Disclosed"
# did not match the value stored in the data ("Other / Non-Disclosed").
gender_groups = update_purchase_df.groupby("Gender")
players_by_gender = gender_groups["SN"].nunique()
value_by_gender = gender_groups["Price"].sum()

gender_purchasing_df = pd.DataFrame({
    "Player Count": players_by_gender,
    # user_count is the overall number of unique players from the Player Count cell
    "Percent of Players": players_by_gender / user_count * 100,
    "Purchase Count": gender_groups["PurchaseID"].count(),
    "Average Purchase Price": gender_groups["Price"].mean(),
    "Total Purchase Value": value_by_gender,
    "Average Purchase Total per Person by Gender": value_by_gender / players_by_gender,
}).reset_index()  # keep "Gender" as a regular first column, as in the original layout
gender_purchasing_df

## Age Demographics

* Establish bins for ages


* Categorize the existing players using the age bins. Hint: use pd.cut()


* Calculate the numbers and percentages by age group


* Create a summary data frame to hold the results


* Optional: round the percentage column to two decimal points


* Display Age Demographics Table


In [None]:
# establish bins for ages: edges 0, 9, then every 5 years up to 49
bins = [0, 9] + list(range(14, 50, 5))
# one label per bin: "under 10", then "10-14" through "45-49"
age_labels = ["under 10"] + [f"{low}-{low + 4}" for low in range(10, 50, 5)]

In [None]:
# use pd.cut() to categorize players using age bins
# Work on a copy: a plain assignment would only alias update_purchase_df, so the
# new 'Age Group' column would silently be added to that frame as well.
age_df = update_purchase_df.copy()
age_df["Age Group"] = pd.cut(age_df["Age"], bins, labels=age_labels, include_lowest=False)
# preview only the first rows instead of rendering the whole table
age_df.head()

In [None]:
# age demographics: number of distinct players (unique 'SN') in each age group
age_counts_df2 = age_df.groupby(by='Age Group')['SN'].nunique()

In [None]:
# AGE ANALYSIS NEEDS: group count, percent of players
# Workaround kept from development: a second, identical player-count Series is
# merged with the first one to obtain a DataFrame that a calculated column can be
# added to. The duplicate shows up as 'SN_y' next to 'SN_x'.
age_counts_df3 = age_df.groupby(by='Age Group')['SN'].nunique()
age_demos_df = pd.merge(
    left=age_counts_df2,
    right=age_counts_df3,
    on='Age Group',
)

In [None]:
# percentage of all players that fall into each age group
age_demos_df['Percentage of Players'] = age_demos_df['SN_x'].div(user_count) * 100

In [None]:
# name the player-count column and discard the duplicate count left over from the merge
age_demos_df = (
    age_demos_df
    .rename(columns={'SN_x': 'Player Count'})
    .drop(columns=['SN_y'])
)
age_demos_df



In [None]:
## Building purchase analysis next

In [None]:
# number of purchase rows in each age group
purchase_byAge_df = age_df.groupby(by='Age Group')['SN'].count()

In [None]:
# mean purchase price in each age group
avg_byAge_df = age_df.groupby(by='Age Group')['Price'].mean()

In [None]:
# summed purchase value in each age group
total_byAge_df = age_df.groupby(by='Age Group')['Price'].sum()

In [None]:
# Compile the per-age results: counts, then average price, then total value.
# The two 'Price' Series collide in the second merge and become 'Price_x' / 'Price_y'.
age_result_names = {
    'SN': 'Purchase Count',
    'Price_x': 'Average Purchase',
    'Price_y': 'Total Purchase Value',
}
age_results_df = (
    pd.merge(purchase_byAge_df, avg_byAge_df, on='Age Group')
    .merge(total_byAge_df, on='Age Group')
    .rename(columns=age_result_names)
)
age_results_df

In [None]:
# average total spent per player = total purchase value / unique players in the age group
age_results_df['Average Total per Person'] = age_results_df['Total Purchase Value'].div(
    age_demos_df['Player Count']
)
age_results_df

## Purchasing Analysis (Age)

* Bin the purchase_data data frame by age


* Run basic calculations to obtain purchase count, avg. purchase price, avg. purchase total per person etc. in the table below


* Create a summary data frame to hold the results


* Optional: give the displayed data cleaner formatting


* Display the summary data frame

In [None]:
# Display the compiled Purchasing Analysis (Age) summary.
# The previous print() referenced grp_10_results_df, a name that no cell in this
# notebook defines; the per-age summary is held in age_results_df.
age_results_df

In [None]:
# display Purchasing Analysis (Age)

## Top Spenders

* Run basic calculations to obtain the results in the table below


* Create a summary data frame to hold the results


* Sort the total purchase value column in descending order


* Optional: give the displayed data cleaner formatting


* Display a preview of the summary data frame



In [None]:
# Find Top 5 Spenders by total purchase value

# keep only player and price, grouped per player; the cells below aggregate this GroupBy
top_spenders_df = update_purchase_df[['SN', 'Price']].groupby(by='SN')

In [None]:
# ** Total Spent **
# total of all prices paid by each player
spend_compare_df = top_spenders_df.sum()

# the five biggest spenders: order by total descending, keep the first five rows
top5_spend_df = spend_compare_df.sort_values(by='Price', ascending=False).iloc[:5]
top5_spend_df

In [None]:
## ** Number of Purchases **
# how many purchase rows each player has
purchase_counts_df = top_spenders_df.count()
purchase_counts_df

In [None]:
# Attach each top spender's purchase count (inner join on the player name)
top5_hold_df = pd.merge(
    left=top5_spend_df,
    right=purchase_counts_df,
    how='inner',
    on='SN',
)
top5_hold_df

In [None]:
# 'Price_x' holds the summed prices and 'Price_y' the purchase counts
spender_column_names = {
    'Price_x': 'Total Purchase Value',
    'Price_y': 'Purchase Count',
}
top5_hold_df2 = top5_hold_df.rename(columns=spender_column_names)
top5_hold_df2

In [None]:
# average price per transaction = total purchase value / purchase count
top5_hold_df2['Average Purchase Price'] = top5_hold_df2['Total Purchase Value'].div(
    top5_hold_df2['Purchase Count']
)
top5_hold_df2

In [None]:
## TOP SPENDERS ##
# Final table: SN (index), purchase count, average purchase price, total purchase value
top_spender_columns = ['Purchase Count', 'Average Purchase Price', 'Total Purchase Value']
top5_final_df = top5_hold_df2.loc[:, top_spender_columns].head(5)
top5_final_df

## Most Popular Items

* Retrieve the Item ID, Item Name, and Item Price columns


* Group by Item ID and Item Name. Perform calculations to obtain purchase count, average item price, and total purchase value


* Create a summary data frame to hold the results


* Sort the purchase count column in descending order


* Optional: give the displayed data cleaner formatting


* Display a preview of the summary data frame



In [None]:
# create df for items: keep the item columns, then group per item
item_columns_df = update_purchase_df[['ItemID', 'ItemName', 'Price']]
item_df = item_columns_df.groupby(['ItemID', 'ItemName'])

# Preview the ungrouped rows. Calling .head() on the GroupBy object returns the
# first rows of *every* group, which is not a five-row preview.
item_columns_df.head()

In [None]:
# purchases per item, most purchased first ('Price' holds the row count here)
item_counts_df = item_df.count().sort_values(by='Price', ascending=False)
item_counts_df

In [None]:
# mean price paid for each item
item_avg_df = item_df.mean()
item_avg_df

In [None]:
# summed purchase value for each item
item_values_df = item_df.sum()
item_values_df

In [None]:
# combine per-item purchase counts with per-item average price;
# both columns are named 'Price', so they come out as 'Price_x' / 'Price_y'
item_keys = ['ItemID', 'ItemName']
item_hold_df = pd.merge(
    left=item_counts_df,
    right=item_avg_df,
    how='outer',
    on=item_keys,
)
item_hold_df.head()

In [None]:
# add the per-item total purchase value (arrives as a plain 'Price' column)
item_hold2_df = pd.merge(
    left=item_hold_df,
    right=item_values_df,
    how='outer',
    on=['ItemID', 'ItemName'],
)
item_hold2_df

In [None]:
# descriptive names: count, average price, and total value per item
item_column_names = {
    'Price_x': 'Purchase Count',
    'Price_y': 'Item Price',
    'Price': 'Total Purchase Value',
}
item_analysis_df = item_hold2_df.rename(columns=item_column_names)
item_analysis_df

In [None]:
# Most Popular Items Table
# Sort by purchase count here instead of relying on the row order of
# item_analysis_df: it was assembled with how='outer' merges, which order rows by
# the join keys (ItemID, ItemName), so its first five rows are not guaranteed to
# be the five most purchased items.
top5_items_df = item_analysis_df.sort_values('Purchase Count', ascending=False).head(5)
top5_items_df

## Most Profitable Items

* Sort the above table by total purchase value in descending order


* Optional: give the displayed data cleaner formatting


* Display a preview of the data frame



In [None]:
# Order the item table by total purchase value (highest first) instead of by count
item_profit_df = item_analysis_df.sort_values(by='Total Purchase Value', ascending=False)
item_profit_df

In [None]:
# Most Profitable Items: first five rows of the value-sorted table
top5_profit_df = item_profit_df.iloc[:5]
top5_profit_df