In [1]:
import pandas as pd

In [2]:
# Load the raw purchase records from the CSV stored under Resources/
file_path = "Resources/purchase_data.csv"
purchase_data = pd.read_csv(filepath_or_buffer=file_path)
# Preview the first five rows (head()'s default length, made explicit)
purchase_data.head(n=5)

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44


Part 1: Review the data column by column to determine whether any cleanup is needed, then display the total number of players

In [3]:
# Count the non-null entries in every column; identical counts across all
# columns mean that no row is missing a value
purchase_data.notna().sum()

Purchase ID    780
SN             780
Age            780
Gender         780
Item ID        780
Item Name      780
Price          780
dtype: int64

In [4]:
#Review datatypes
# dtypes lists one entry per column, which makes a column that was read in
# with an unexpected type (e.g. a numeric column parsed as object) easy to spot
purchase_data.dtypes

Purchase ID      int64
SN              object
Age              int64
Gender          object
Item ID          int64
Item Name       object
Price          float64
dtype: object

In [5]:
#Use the pd.to_numeric() function to make sure the Price column holds a numeric dtype
# NOTE(review): the dtypes output shown earlier already reports Price as
# float64, so this assignment looks like a defensive no-op -- confirm
purchase_data['Price'] = pd.to_numeric(purchase_data['Price'])

In [6]:
# Confirm the dtype of the Price column after the conversion step
purchase_data.dtypes['Price']

dtype('float64')

In [7]:
# Tally how many purchase rows each screen name (SN) has, most frequent first
purchase_data['SN'].value_counts()

Lisosia93       5
Iral74          4
Idastidru52     4
Saistyphos30    3
Saedaiphos46    3
               ..
Chamalo71       1
Aillyrin83      1
Tyarithn67      1
Sundim98        1
Jiskjask85      1
Name: SN, Length: 576, dtype: int64

In [53]:
# A player is identified by screen name, so the number of distinct SN values
# is the number of players
counts = purchase_data["SN"].drop_duplicates().size
# Wrap the single figure in a one-row frame for display
counts_df = pd.DataFrame([[counts]], columns=['Total Players'])
counts_df

Unnamed: 0,Total Players
0,576


Part 2 - Purchasing Analysis (Total): Run basic calculations to obtain the number of unique items, average purchase price, total number of purchases, and total revenue. 
    Create a summary data frame to hold the results and display it with clean formatting.

In [9]:
# Number of distinct items sold, identified by Item ID
# (dropna=False mirrors unique(), which keeps a missing value as its own entry)
un_items = purchase_data["Item ID"].nunique(dropna=False)
un_items

179

In [10]:
# Mean price across every purchase row
avg_price = purchase_data["Price"].mean()
# Show the mean to two decimals; avg_price itself is left unrounded
round(avg_price, 2)

3.05

In [11]:
# Total purchases = number of distinct Purchase ID values
# (dropna=False mirrors unique(), which keeps a missing value as its own entry)
purchase_total = purchase_data["Purchase ID"].nunique(dropna=False)
purchase_total

780

In [12]:
# Total revenue is the sum of the Price column over all purchase rows
total_sum = purchase_data.loc[:, "Price"].sum()
total_sum

2379.77

In [13]:
#Place all of the data found into a summary DataFrame
# Each value is wrapped in a list so the resulting frame has exactly one row
data = {
    'Number of Unique Items': [un_items],
    'Average Price': [avg_price],
    'Number of Purchases': [purchase_total],
    'Total Revenue': [total_sum]
}
summary1 = pd.DataFrame(data)

# Render both monetary columns as dollars with a thousands separator and two
# decimals. Styler.format only changes the display; summary1 keeps its
# numeric values.
format_dict = {'Average Price': '${:,.2f}', 'Total Revenue': '${:,.2f}'}
summary1.style.format(format_dict)

Unnamed: 0,Number of Unique Items,Average Price,Number of Purchases,Total Revenue
0,179,$3.05,780,"$2,379.77"


Part 3 - Gender Demographics: Determine the count and percentage of male players, female players, and Other / Non-Disclosed players

In [14]:
# Keep only the first purchase row of each screen name so that every player
# appears exactly once in count_m
count_m = purchase_data.loc[~purchase_data["SN"].duplicated(keep="first")]
count_m.head(n=5)

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44


In [15]:
#Determine total count of males
# Build the mask from count_m itself (the de-duplicated frame being filtered)
# rather than from purchase_data, so the filter does not rely on pandas
# aligning a longer mask from a different frame by index label
total_m = len(count_m.loc[count_m["Gender"] == "Male"])
total_m

484

In [67]:
#Determine percentage of male players
# total_m (male player count) and counts (number of unique players) are both
# computed in earlier cells; the ratio is scaled to a percentage
total_m_p = (total_m / counts)*100
total_m_p

84.02777777777779

In [24]:
#Determine total count of females
# Build the mask from count_m itself (the de-duplicated frame being filtered)
# rather than from purchase_data, so the filter does not rely on pandas
# aligning a longer mask from a different frame by index label
total_f = len(count_m.loc[count_m["Gender"] == "Female"])
total_f

81

In [61]:
#Determine percentage of female players
# total_f (female player count) and counts (number of unique players) are both
# computed in earlier cells; the ratio is scaled to a percentage
total_f_p = (total_f / counts)*100
total_f_p

14.0625

In [62]:
#Determine total count of Other/Non-Disclosed
# Build the mask from count_m itself (the de-duplicated frame being filtered)
# rather than from purchase_data, so the filter does not rely on pandas
# aligning a longer mask from a different frame by index label
total_o = len(count_m.loc[count_m["Gender"] == "Other / Non-Disclosed"])
total_o

11

In [63]:
#Determine percentage of Other/Non-Disclosed players
# total_o (Other / Non-Disclosed player count) and counts (number of unique
# players) are both computed in earlier cells; the ratio is scaled to a percentage
total_o_p = (total_o / counts)*100
total_o_p

1.9097222222222223

In [69]:
#Place all of the gender demographics into a summary DataFrame
# Build the frame in a single constructor call from per-column lists instead
# of filling an empty frame row by row: the columns then get numeric dtypes
# inferred from the values rather than the object dtype of an empty frame.
gendem = pd.DataFrame(
    {
        'Total Count': [total_m, total_f, total_o],
        'Percentage of Players': [total_m_p, total_f_p, total_o_p],
    },
    index=['Male', 'Female', 'Other/Non-disclosed'],
)

# Show counts as whole numbers and percentages with two decimals.
# Styler.format only changes the display; gendem keeps its numeric values.
format2 = {'Total Count': '{:,.0f}', 'Percentage of Players': '{:,.2f}%'}
gendem.style.format(format2)

Unnamed: 0,Total Count,Percentage of Players
Male,484,84.03%
Female,81,14.06%
Other/Non-disclosed,11,1.91%


Part 4 - Purchasing Analysis (Gender): Run basic calculations to obtain purchase count, avg. purchase price, avg. purchase total per person, etc. by gender. Create a clean and formatted summary data frame to hold the results. MUST USE GROUPBY METHOD (GENDER).

In [113]:
# The purchasing analysis must go through groupby: bucket every purchase row
# by the buyer's gender
grouped_df = purchase_data.groupby(by=['Gender'])

In [114]:
# Purchase count per gender: number of non-null SN entries in each group
count = grouped_df["SN"].agg("count")
count

Gender
Female                   113
Male                     652
Other / Non-Disclosed     15
Name: SN, dtype: int64

In [115]:
# Average purchase price per gender; the result is a Series indexed by Gender
price = grouped_df["Price"].agg("mean")
price

Gender
Female                   3.203009
Male                     3.017853
Other / Non-Disclosed    3.346000
Name: Price, dtype: float64

In [116]:
# Total purchase value per gender: sum of Price within each group
total = grouped_df["Price"].agg("sum")
total

Gender
Female                    361.94
Male                     1967.64
Other / Non-Disclosed      50.19
Name: Price, dtype: float64

In [118]:
# Preparation for the per-person average: keep only the first purchase row of
# each screen name so every player appears once, then group those unique
# players by gender
clean_df = purchase_data.loc[~purchase_data['SN'].duplicated(keep='first')]
group2 = clean_df.groupby(by=['Gender'])

In [119]:
# Average total spend per player: each gender's summed purchase value divided
# by that gender's number of unique players
avg_ppp = grouped_df["Price"].sum().div(group2["SN"].count())
avg_ppp

Gender
Female                   4.468395
Male                     4.065372
Other / Non-Disclosed    4.562727
dtype: float64

In [120]:
#Display the summary in a data frame
# The four inputs are Series indexed by Gender, so the constructor lines them
# up into one row per gender.
summary_df4 = pd.DataFrame({
    'Purchase Count': count,
    'Average Purchase Price': price,
    'Total Purchase Value': total,
    'Avg Total Purchase per Person': avg_ppp})

#Apply formatting
# Series.map applies the format function to each value and turns the column
# into strings. Use the fixed-point spec ',.2f' for every monetary column:
# a bare '.3' precision means three *significant digits*, which renders
# 3.203009 as '$3.2' instead of '$3.20'.
summary_df4['Average Purchase Price'] = summary_df4['Average Purchase Price'].map('${:,.2f}'.format)
summary_df4['Total Purchase Value'] = summary_df4['Total Purchase Value'].map('${:,.2f}'.format)
summary_df4['Avg Total Purchase per Person'] = summary_df4['Avg Total Purchase per Person'].map('${:,.2f}'.format)

summary_df4.head()

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value,Avg Total Purchase per Person
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,113,$3.2,$361.94,$4.47
Male,652,$3.02,"$1,967.64",$4.07
Other / Non-Disclosed,15,$3.35,$50.19,$4.56
