In [34]:
# ----------------------------------
# Step 0: Import modules
# ----------------------------------
import pandas as pd

In [35]:
# ----------------------------------
# Step 1: Get the filename, then Read in and clean data.
# ----------------------------------

# Give the user instructions
print("Welcome to Pymoli Data Analysis!")
print("To analyze a file with purchase data, please enter the filename below.")
print("WARNING: Please include the complete filepath! \n")

# Strip surrounding whitespace: a pasted path with a trailing space or newline
# would otherwise be handed to the reader unchanged and fail there.
filename = input("Enter filename here: ").strip()

# Fail here, with a clear message, rather than in the cell that reads the file.
if not filename:
    raise ValueError("No filename was entered; re-run this cell and provide "
                     "the path to the purchase data file.")

Welcome to Pymoli Data Analysis!
To analyze a file with purchase data, please enter the filename below.

Enter filename here: data/purchase_data.json


In [155]:
# Read the purchase records from the file named in `filename`
# (entered in the previous cell).
purchases_df = pd.read_json(filename, orient='records')

# Make sure Price is numeric: strip any '%' characters from string values and
# cast to float.
# NOTE(review): '%' is what the original code strips; '$' may have been the
# intended symbol -- confirm against the raw data.
purchases_df['Price'] = purchases_df['Price'].replace("%", "", regex=True).astype(float)

# Remove every row that has at least one null value. dropna() returns a new
# frame, so the result must be assigned back -- calling it without assignment
# leaves purchases_df untouched.
purchases_df = purchases_df.dropna(how='any')

# Preview the cleaned data
purchases_df.head()

Unnamed: 0,Age,Gender,Item ID,Item Name,Price,SN
0,38,Male,165,Bone Crushing Silver Skewer,3.37,Aelalis34
1,21,Male,119,"Stormbringer, Dark Blade of Ending Misery",2.32,Eolo46
2,34,Male,174,Primitive Blade,2.46,Assastnya25
3,21,Male,92,Final Critic,1.36,Pheusrical25
4,23,Male,63,Stormfury Mace,1.27,Aela59


In [44]:
# ----------------------------------
# Step 2: Analyze the players: Look at
# Total number of players & Gender demographics
# ----------------------------------

# A player is identified by screen name (SN). purchases_df has one row per
# purchase, so players must be counted as *unique* screen names; counting rows
# would count purchases instead and make the shares below exceed 100% in total.

# Total number of players
num_players_total = purchases_df['SN'].nunique()

# Number of unique players recorded under each gender
# NOTE(review): a screen name that appears under more than one gender is
# counted once in each group -- confirm whether that can occur in the data.
players_by_gender = purchases_df.groupby('Gender')['SN'].nunique()

# The "percent_*" values are fractions of all players (0.0-1.0), as before;
# multiply by 100 or format with '{:.2%}' for display. The guard avoids a
# division by zero when the data set is empty.

# count and % of male players
num_players_male = players_by_gender.get('Male', 0)
percent_players_male = num_players_male / num_players_total if num_players_total else 0.0

# count and % of female players
num_players_female = players_by_gender.get('Female', 0)
percent_players_female = num_players_female / num_players_total if num_players_total else 0.0

# count and % of other/non-disclosed players
num_players_other = players_by_gender.get('Other / Non-Disclosed', 0)
percent_players_other = num_players_other / num_players_total if num_players_total else 0.0

# ----------------------------------
# End Step 2
# ----------------------------------

In [119]:
# ----------------------------------
# Step 3: Purchasing Analysis
# 1) Top Spenders
# 2) Purchasing Analysis (Total)
# 3) Purchasing Analysis (Gender)
# 4) Age Demographics
# ----------------------------------

# ----------------------------------
# Question 1 - Top Spenders
# ----------------------------------

# Total of every numeric column per player, highest total purchase value first.
# numeric_only=True keeps string columns (Gender, Item Name) out of the sum.
purchases_byPurchaseValue_df = purchases_df.groupby('SN').sum(numeric_only=True).\
                                   sort_values('Price', ascending=False)

# Screen names of the top five spenders, ordered by total purchase value
top_five_spenders_bySN = list(purchases_byPurchaseValue_df.index[:5])

# Purchase count, average price and total value per player, from Price only.
spend_by_player = purchases_df.groupby('SN')['Price'].agg(['count', 'mean', 'sum'])

# Select the top five *in spending order*. A fresh groupby on the filtered rows
# would come back sorted alphabetically by SN and no longer line up with
# top_five_spenders_bySN, attaching statistics to the wrong screen names.
top_five_stats = spend_by_player.loc[top_five_spenders_bySN]
top_five_purchase_count = list(top_five_stats['count'])
top_five_purchase_avg = list(top_five_stats['mean'])
top_five_purchase_total = list(top_five_stats['sum'])

# List in a table the top five's SN, purchase count, average purchase price and
# total purchase value. Rows are already ordered by numeric total, so the
# currency formatting is applied last; sorting the formatted '$...' strings
# would order them lexicographically (e.g. '$9.97' above '$17.06').
top_spenders_analysis_df = pd.DataFrame(
    {'Screen Name': top_five_spenders_bySN,
     'Purchase Count': top_five_purchase_count,
     'Average Purchase Price': ['${:,.2f}'.format(x) for x in top_five_purchase_avg],
     'Total Amount Purchased': ['${:,.2f}'.format(x) for x in top_five_purchase_total]},
    # explicit column order, so the layout does not depend on dict ordering
    columns=['Screen Name', 'Purchase Count', 'Average Purchase Price',
             'Total Amount Purchased'],
).set_index('Screen Name')

# display the result
top_spenders_analysis_df

## Purchasing Analysis (Total)

- Number of Unique Items
- Average Purchase Price
- Total Number of Purchases
- Total Revenue

In [170]:
# ----------------------------------
# Question 2 - Purchasing Analysis (Total)
# ----------------------------------

# number of unique items purchased
num_items_unique = purchases_df['Item ID'].nunique()

# average purchase price, formatted as currency. The mean is taken on the Price
# column only: DataFrame.mean() over the whole frame fails on the string
# columns (Gender, Item Name, SN) in recent pandas versions.
overall_purchase_avg = '${:,.2f}'.format(purchases_df['Price'].mean())

# total number of purchases (rows with a non-null Item ID)
overall_purchase_count = purchases_df['Item ID'].count()

# total revenue, formatted as currency
overall_purchase_revenue = '${:,.2f}'.format(purchases_df['Price'].sum())

# create a one-row dataframe to display the results; the explicit column list
# fixes the column order regardless of how the dict is ordered
purchasing_total_analysis_df = pd.DataFrame(
    {'Number of Unique Items Purchased': [num_items_unique],
     'Average Purchase Total': [overall_purchase_avg],
     'Total Number of Purchases': [overall_purchase_count],
     'Total Revenue': [overall_purchase_revenue]},
    columns=['Number of Unique Items Purchased', 'Average Purchase Total',
             'Total Number of Purchases', 'Total Revenue'],
)

# display the result
purchasing_total_analysis_df

Unnamed: 0,Average Purchase Total,Number of Unique Items Purchased,Total Number of Purchase,Total Revenue
0,$2.93,183,780,"$2,286.33"


## Purchasing Analysis (Gender)

- Each of the following, broken down by gender:
 - Purchase Count
 - Average Purchase Price
 - Total Purchase Value
 - Normalized Totals
 - Age Demographics

In [None]:
# ----------------------------------
# Question 3 - Purchasing Analysis (Gender)
# ----------------------------------


## Purchasing Analysis (Age)

- Each of the following, broken down into age bins of five years each (e.g. <10, 10-14, 15-19, etc.):
 - Purchase Count
 - Average Purchase Price
 - Total Purchase Value
 - Normalized Totals

In [None]:
# ----------------------------------
# Question 4 - Purchasing Analysis (Age)
# ----------------------------------



# ----------------------------------
# End Step 3
# ----------------------------------

In [None]:
# ----------------------------------
# Step 4: Items analysis
# 1) Most Popular Items
# 2) Most Profitable Items
# ----------------------------------