# Pymoli Purchase Data Analysis

In [34]:
# ----------------------------------
# Step 0: Import modules
# ----------------------------------
import pandas as pd

In [35]:
# ----------------------------------
# Step 1: Get the filename, then Read in and clean data.
# ----------------------------------

# Give the user instructions
print("Welcome to Pymoli Data Analysis!")
print("To analyze a file with purchase data, please enter the filename below.")
print("WARNING: Please include the complete filepath! \n")

filename = input("Enter filename here: ")

Welcome to Pymoli Data Analysis!
To analyze a file with purchase data, please enter the filename below.

Enter filename here: data/purchase_data.json


In [155]:
# Read in the data file
purchases_df = pd.read_json(filename, orient='records')

# make sure everything is the right data type to use
purchases_df['Price']= purchases_df['Price'].replace("%","", regex=True).astype(float)

# check and remove null values
purchases_df.dropna(how='any')
purchases_df.head()

Unnamed: 0,Age,Gender,Item ID,Item Name,Price,SN
0,38,Male,165,Bone Crushing Silver Skewer,3.37,Aelalis34
1,21,Male,119,"Stormbringer, Dark Blade of Ending Misery",2.32,Eolo46
2,34,Male,174,Primitive Blade,2.46,Assastnya25
3,21,Male,92,Final Critic,1.36,Pheusrical25
4,23,Male,63,Stormfury Mace,1.27,Aela59


In [302]:
# ----------------------------------
# Step 2: Analyze the players: Look at 
# Total number of players & Gender demographics
# ----------------------------------

# Total Number of players
num_players_total = len(purchases_df.groupby('SN').count())

# count and % of male players
num_players_male = purchases_df[purchases_df['Gender']=='Male'].count()['SN']
percent_players_male = num_players_male/num_players_total

# count and % of female players
num_players_female = purchases_df[purchases_df['Gender']=='Female'].count()['SN']
percent_players_female = num_players_female/num_players_total

# count and % of other/non-disclosed players
num_players_other = purchases_df[purchases_df['Gender']=='Other / Non-Disclosed'].count()['SN']
percent_players_other = num_players_other/num_players_total

# create a dataframe to hold this result
gender_demographics_dict = {"Male":num_players_male, "Female":num_players_female,\
                            "Other/Non-Disclosed":num_players_other}
gender_demographics_df = pd.DataFrame.from_dict(gender_demographics_dict,orient='index')
gender_demographics_df.rename(columns={0: 'Number of Players'}, inplace=True)

# print out results
print("Total Number of Players: ",num_players_total)
gender_demographics_df

# ----------------------------------
# End Step 2
# ----------------------------------

Total Number of Players:  573


Unnamed: 0,Number of Players
Male,633
Female,136
Other/Non-Disclosed,11


In [279]:
# ----------------------------------
# Step 3: Purchasing Analysis
# 1) Top Spenders
# 2) Purchasing Analysis (Total)
# 3) Purchasing Analysis (Gender)
# 4) Age Demographics
# ----------------------------------

# ----------------------------------
# Question 1 - Top Spenders
# ----------------------------------

# identify top five spender by total puchase value
purchases_byPurchaseValue_df = purchases_df.groupby('SN').sum().sort_values('Price',ascending=False)
top_five_spenders_bySN = list(purchases_byPurchaseValue_df[0:5].reset_index()['SN'])

# create purchase count dataframe
top_five_purchase_count_df = pd.DataFrame.from_dict(dict(purchases_df[purchases_df['SN'].isin(top_five_spenders_bySN)].\
                                                         groupby('SN').count()))
top_five_purchase_count_df = top_five_purchase_count_df.drop(['Age', 'Gender', 'Item Name', 'Price'], 1)
top_five_purchase_count_df.rename(columns={'Item ID' : 'Purchase Count'}, inplace=True)

# create purchase average price dataframe
top_five_purchase_avg_df = pd.DataFrame.from_dict(dict(purchases_df[purchases_df['SN'].isin(top_five_spenders_bySN)].\
                                                       groupby('SN').mean()))
top_five_purchase_avg_df = top_five_purchase_avg_df.drop(['Age', 'Item ID'], 1)
top_five_purchase_avg_df.rename(columns={'Price' : 'Average Purchase Price'}, inplace=True)

# create total purchases dataframe
top_five_purchase_total_df = pd.DataFrame.from_dict(dict(purchases_df[purchases_df['SN'].isin(top_five_spenders_bySN)].\
                                                         groupby('SN').sum()))
top_five_purchase_total_df = top_five_purchase_total_df.drop(['Age', 'Item ID'], 1)
top_five_purchase_total_df.rename(columns={'Price' : 'Total Amount Purchased'}, inplace=True)

# merge the dataframes together to create one table
top_spenders_analysis_df = top_five_purchase_count_df.merge(top_five_purchase_avg_df, how='outer',\
                                                            left_index=True, right_index=True)
top_spenders_analysis_df = top_spenders_analysis_df.merge(top_five_purchase_total_df, how='outer',\
                                                          left_index=True, right_index=True)

# fix the formatting for the currency columns: df['cost'] = df['cost'].map('${:,.2f}'.format)
top_spenders_analysis_df['Average Purchase Price'] = top_spenders_analysis_df['Average Purchase Price'].map('${:,.2f}'.format)
top_spenders_analysis_df['Total Amount Purchased'] = top_spenders_analysis_df['Total Amount Purchased'].map('${:,.2f}'.format)

# sort by total amount purchased
top_spenders_analysis_df = top_spenders_analysis_df.sort_values('Total Amount Purchased', ascending=False)
top_spenders_analysis_df

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Amount Purchased
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Undirrala66,5,$3.41,$17.06
Saedue76,4,$3.39,$13.56
Mindimnya67,4,$3.18,$12.74
Haellysu29,3,$4.24,$12.73
Eoda93,3,$3.86,$11.58


In [172]:
# ----------------------------------
# Question 2 - Purchasing Analysis (Total)
# ----------------------------------

# number of unique items purchased
num_items_unique = len(purchases_df['Item ID'].value_counts())

# average purchase price
overall_purchase_avg = '${:,.2f}'.format(purchases_df.mean()['Price'])

# total number of purchases
overall_purchase_count = purchases_df.count()['Item ID']

# total revenue
overall_purchase_revenue = '${:,.2f}'.format(purchases_df['Price'].sum())

# create dataframe to display results
purchasing_total_analysis_df = pd.DataFrame({'Number of Unique Items Purchased':[num_items_unique],\
                                             'Average Purchase Total':[overall_purchase_avg],\
                                             'Total Number of Purchases':[overall_purchase_count],\
                                             'Total Revenue':[overall_purchase_revenue]})

# rearrange columns in a more sensible way
purchasing_total_analysis_df = purchasing_total_analysis_df[['Total Revenue', 'Total Number of Purchases',\
                                                            'Average Purchase Total', 'Number of Unique Items Purchased']]
purchasing_total_analysis_df

Unnamed: 0,Total Revenue,Total Number of Purchases,Average Purchase Total,Number of Unique Items Purchased
0,"$2,286.33",780,$2.93,183


## Purchasing Analysis (Gender)

- The below each broken by gender
 - Purchase Count
 - Average Purchase Price
 - Total Purchase Value
 - Normalized Totals (total purchase value/purchase count)

In [292]:
# ----------------------------------
# Question 3 - Purchasing Analysis (Gender)
# ----------------------------------

# purchase count by gender
gender_purchase_count_df = pd.DataFrame.from_dict(dict(purchases_df.groupby('Gender').count()))
gender_purchase_count_df = gender_purchase_count_df.drop(['Age', 'Item Name', 'Item ID', 'Price'], 1)
gender_purchase_count_df.rename(columns={'SN' : 'Purchase Count'}, inplace=True)

# average purchase price by gender
gender_purchase_avg_df = pd.DataFrame.from_dict(dict(purchases_df.groupby('Gender').mean()))
gender_purchase_avg_df = gender_purchase_avg_df.drop(['Age', 'Item ID'], 1)
gender_purchase_avg_df.rename(columns={'Price' : 'Average Purchase Price'}, inplace=True)

# sum of all purchases by gender
gender_purchase_total_df = pd.DataFrame.from_dict(dict(purchases_df.groupby('Gender').sum()))
gender_purchase_total_df = gender_purchase_total_df.drop(['Age', 'Item ID'], 1)
gender_purchase_total_df.rename(columns={'Price' : 'Total Purchases'}, inplace=True)

# create dataframe to display results
purchasing_gender_analysis_df = gender_purchase_count_df.merge(gender_purchase_total_df, how='outer',\
                                                            left_index=True, right_index=True)
purchasing_gender_analysis_df = purchasing_gender_analysis_df.merge(gender_purchase_avg_df, how='outer',\
                                                            left_index=True, right_index=True)

# add normalized totals by gender (total purchase value/purchase count)
purchasing_gender_analysis_df["Normalized Total"] = purchase
purchasing_gender_analysis_df

Unnamed: 0_level_0,Purchase Count,Total Purchases,Average Purchase Price,Normalized Total
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,136,382.91,2.815515,2.815515
Male,633,1867.68,2.950521,2.950521
Other / Non-Disclosed,11,35.74,3.249091,3.249091



## Purchasing Analysis (Age)

- The below each broken into bins of 4 years (i.e. <10, 10-14, 15-19, etc.)
 - Purchase Count
 - Average Purchase Price
 - Total Purchase Value
 - Normalized Totals

In [None]:
# ----------------------------------
# Question 4 - Purchasing Analysis (Age)
# ----------------------------------



# ----------------------------------
# End Step 3
# ----------------------------------

In [None]:
# ----------------------------------
# Step 4: Items analysis
# 1) Most Popular Items
# 2) Most Profitable Items
# ----------------------------------