In [1]:
import pandas as pd

In [2]:
file_path = "Resources/purchase_data.csv"
purchase_data = pd.read_csv(file_path)
purchase_data.head()

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44


Part 1: Review data per column to determine if clean up is needed and display the count of total number of players

In [3]:
#Identify any incomplete rows
purchase_data.count()

Purchase ID    780
SN             780
Age            780
Gender         780
Item ID        780
Item Name      780
Price          780
dtype: int64

In [4]:
#Review datatypes 
purchase_data.dtypes

Purchase ID      int64
SN              object
Age              int64
Gender          object
Item ID          int64
Item Name       object
Price          float64
dtype: object

In [5]:
#Use pd.to_nuermic() method to convert the datatype of the Price column
purchase_data['Price'] = pd.to_numeric(purchase_data['Price'])

In [6]:
#Verify that datatype change to Price column worked
purchase_data['Price'].dtype

dtype('float64')

In [7]:
#Display any unique values and count for screen name column
purchase_data.loc[:, 'SN'].value_counts()

Lisosia93        5
Idastidru52      4
Iral74           4
Tyisur83         3
Inguron55        3
                ..
Thourdirra92     1
Frichaststa61    1
Yalostiphos68    1
Jiskjask60       1
Chanirra64       1
Name: SN, Length: 576, dtype: int64

In [8]:
counts = len(purchase_data["SN"].drop_duplicates())
counts_df = pd.DataFrame({'Total Players':[counts]})
counts_df

Unnamed: 0,Total Players
0,576


Part 2 - Purchasing Analysis (Total): Run basic calculations to obtain the number of unique items, averge purchase price, total number of purchases, and total revenue. 
    Create a summary data frame to hold the results and display the summary data frame with clean formatting.

In [9]:
#Calculate the number of unique Items in the DataFrame
un_items = len(purchase_data["Item ID"].unique())

In [10]:
#Calculate the average purchase price
avg_price = purchase_data["Price"].mean()

In [11]:
#Calculate the total number of purchases
purchase_total = len(purchase_data["Purchase ID"].unique())

In [12]:
#Calculate the total revenue
total_sum = purchase_data["Price"].sum()

In [13]:
#Place all of the data found into a summary DataFrame
data = {
    'Number of Unique Items': [un_items],
    'Average Price': [avg_price],
    'Number of Purchases': [purchase_total],
    'Total Revenue': [total_sum]
}

summary1 = pd.DataFrame(data)
summary1
format_dict = {'Average Price':'${0:,.2f}', 'Total Revenue': '${:,.2f}'}
summary1.style.format(format_dict)

Unnamed: 0,Number of Unique Items,Average Price,Number of Purchases,Total Revenue
0,179,$3.05,780,"$2,379.77"


Part 3 - Gender Demographics: Determine the percentage and count of male players, female player, and of Other/Non-Disclosed

In [14]:
#Remove any duplicate screen names as before
count_m = purchase_data.drop_duplicates(subset = "SN", keep="first")

In [15]:
#Determine total count of males
total_m = len(count_m.loc[purchase_data["Gender"] == "Male"])

In [16]:
#Determine percentage of male players
total_m_p = (total_m / counts)*100

In [17]:
#Determine total count of females
total_f = len(count_m.loc[purchase_data["Gender"] == "Female"])

In [18]:
#Determine percentage of female players
total_f_p = (total_f / counts)*100

In [19]:
#Determine total count of Other/Non-Disclosed
total_o = len(count_m.loc[purchase_data["Gender"] == "Other / Non-Disclosed"])

In [20]:
#Determine percentage of Other/Non-Disclosed players
total_o_p = (total_o / counts)*100

In [21]:
#Place all of the gender demographics into a summary DataFrame

gendem = pd.DataFrame(columns=['Total Count', 'Percentage of Players'],
                                  index = ['Male', 'Female', 'Other/Non-disclosed'])
gendem.loc['Male'] = pd.Series({
    'Total Count': total_m,
    'Percentage of Players': total_m_p})
gendem.loc['Female'] = pd.Series({
    'Total Count': total_f,
    'Percentage of Players': total_f_p,})
gendem.loc['Other/Non-disclosed'] = pd.Series({
    'Total Count': total_o,
    'Percentage of Players': total_o_p,})

format2 = {'Total Count': '{:,.0f}', 'Percentage of Players': '{:,.2f}%'}
gendem.style.format(format2)

Unnamed: 0,Total Count,Percentage of Players
Male,484,84.03%
Female,81,14.06%
Other/Non-disclosed,11,1.91%


Part 4 - Purchasing Analysis (Gender): Run basic calculations to obtain purchase count, avg. purchase price, avg. purchase total per person, etc. by gender. Create a clean and formatted summary data frame to hold the results. MUST USE GROUPBY METHOD (GENDER).

In [22]:
#For purchase analysis, must use groupby method
grouped_df = purchase_data.groupby(['Gender'])

In [23]:
#Determine data calculations with groupby method
count = grouped_df["SN"].count()

In [24]:
price = grouped_df["Price"].mean()

In [25]:
total = grouped_df["Price"].sum()

In [26]:
#Find average price per person
clean_df = purchase_data.drop_duplicates(subset='SN', keep='first')
group2 = clean_df.groupby(['Gender'])

In [27]:
avg_ppp = (grouped_df["Price"].sum() / group2["Age"].count())

In [28]:
#Display the summary in a data frame
#.map() method used to manipulate pandas Series

summary_df4 = pd.DataFrame({
    'Purchase Count': count,
    'Average Purchase Price': price,
    'Total Purchase Value': total,
    'Avg Total Purchase per Person': avg_ppp})

#Apply formatting
summary_df4['Average Purchase Price'] = summary_df4['Average Purchase Price'].map('${:.3}'.format)
summary_df4['Total Purchase Value'] = summary_df4['Total Purchase Value'].map('${:,.2f}'.format)
summary_df4['Avg Total Purchase per Person'] = summary_df4['Avg Total Purchase per Person'].map('${:.3}'.format)

summary_df4.head()

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value,Avg Total Purchase per Person
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,113,$3.2,$361.94,$4.47
Male,652,$3.02,"$1,967.64",$4.07
Other / Non-Disclosed,15,$3.35,$50.19,$4.56


Part 5 - Age Demographics: Establish BINS for ages. Categorize the existing players using the age bins (i.e. use pd.cut() method). Calculate the numbers and percentages by age group and create a summary data frame to hold the reults.

In [29]:
# purchase_data.drop_duplicates(subset='SN', keep='first')

In [30]:
#Create the bins in which age data will be held
bins = [0, 9.9, 14.9, 19.9, 24.9, 29.9, 34.9, 39.9, 200]
# bins = [0, 10, 15, 20, 25, 30, 35, 40, 200]
group_names = ["<10", "10-14",  "15-19", "20-24", "25-29", "30-34", "35-39", "40+"]

In [31]:
purchase_data["Age Count"] = pd.cut(purchase_data["Age"], bins, labels=group_names, include_lowest=True)

In [32]:
group_age = purchase_data.groupby("Age Count")

In [33]:
age_total_count = group_age["SN"].nunique()

In [34]:
#Calculate total numbers and percentages by age group
age_ppp2 = age_total_count / counts *100

In [35]:
#Create a summary data frame to hold the results
sum_dict = {
    "Total Count": age_total_count,
    "Percentage of Players": age_ppp2}

agedem_df = pd.DataFrame(sum_dict)
#Apply formatting
agedem_df['Percentage of Players'] = agedem_df['Percentage of Players'].map('{:,.2f}%'.format)
agedem_df

Unnamed: 0_level_0,Total Count,Percentage of Players
Age Count,Unnamed: 1_level_1,Unnamed: 2_level_1
<10,17,2.95%
10-14,22,3.82%
15-19,107,18.58%
20-24,258,44.79%
25-29,77,13.37%
30-34,52,9.03%
35-39,31,5.38%
40+,12,2.08%


Part 6 - Purchasing Analysis (Age): BIN the purchase_data frame by age. Run basic calculations to obtain purchase count, avg. purchase price, avg. purchase total person person, etc.
Create a summary data frame to hold reults and format.

In [36]:
#Create the bins in which age data will be held
bins2 = [0, 9.9, 14.9, 19.9, 24.9, 29.9, 34.9, 39.9, 200]
groups = ["<10", "10-14",  "15-19", "20-24", "25-29", "30-34", "35-39", "40+"]

In [37]:
#Slice the data any place it into bins
pd.cut(purchase_data["Age"], bins=bins2, labels=groups).head()

0    20-24
1      40+
2    20-24
3    20-24
4    20-24
Name: Age, dtype: category
Categories (8, object): ['<10' < '10-14' < '15-19' < '20-24' < '25-29' < '30-34' < '35-39' < '40+']

In [38]:
# Place the data series into a new column inside of the DataFrame
purchase_data["Age"] = pd.cut(purchase_data["Age"], bins, labels=group_names)

In [39]:
# Create a GroupBy object based upon "Age"
purch_age = purchase_data.groupby("Age")

In [106]:
# Purchase count by age analysis
age_counts = purch_age[["Age"]].count()
age_counts
type(age_counts)
age_cnt = list(age_counts)
age_cnt

['Age']

In [107]:
# Get the average of each column within the GroupBy object
avg_pprice = purch_age[["Price"]].mean()
avg_pprice
avg_p = list(avg_pprice)
avg_p

['Price']

In [108]:
# Get the total purchase value of each column within the GroupBy object
total_pvalue = purch_age[["Price"]].sum()
total_pvalue
total_pv = list(avg_pprice)
total_pv

['Price']

In [43]:
#Find the average price per person and clean duplicates
total_value = purchase_data.drop_duplicates(subset='SN', keep='first')
group3 = total_value.groupby(["Age"])

In [99]:
avg_ppp3 = (purch_age["Price"].sum() / group3["Age"].count())
avg_ppp3
avg3 = list(avg_ppp3)
avg3

[4.537058823529412,
 3.7627272727272723,
 3.85878504672897,
 4.3180620155038785,
 3.805194805194803,
 4.115384615384615,
 4.763548387096773,
 3.186666666666667]

In [112]:
#Display the summary in a data frame and format
# dict_sum1 = {
#     'Purchase Count': age_counts,
#     'Average Purchase Price': avg_pprice,
#     'Total Purchase Value': total_pvalue,
#     'Avg Total Purchase per Person': avg_ppp3}

dict_sum1 = {
    'Purchase Count': age_cnt,
    'Average Purchase Price': avg_p,
    'Total Purchase Value': total_pv,
    'Avg Total Purchase per Person': avg3}

summary_df5 = pd.DataFrame.from_dict(dict_sum1)
                                     #, orient='index')
summary_df5 = summary_df5.transpose()
#                            columns=['Purchase Count', 'Average Purchase Price', 'Total Purchase Value', 'Avg Total Purchase per Person'])
summary_df5.head()

# #Apply formatting
# summary_df5['Average Purchase Price'] = summary_df5['Average Purchase Price'].map('${:.3}'.format)
# summary_df5['Total Purchase Value'] = summary_df5['Total Purchase Value'].map('${:,.2f}'.format)
# summary_df5['Avg Total Purchase per Person'] = summary_df5['Avg Total Purchase per Person'].map('${:.3}'.format)
# # summary_df5 = summary_df5["Purchase Count", "Average Purchase Price", "Total Purchase Value", "Avg Total Purchase per Person"]
# summary_df5.head()


Unnamed: 0,Purchase Count,Average Purchase Price,Total Purchase Value,Avg Total Purchase per Person
0,Age,Price,Price,4.537059
1,,,,3.762727
2,,,,3.858785
3,,,,4.318062
4,,,,3.805195


Part 7 - Top Spenders: Run basic calculations to obtain the resutls of purchase count, average purchase price, and total purchase value by top spenders. SORT the total purchase value column in descending order. Display a summary data frame and format. 

In [54]:
sn_grouped = purchase_data.groupby(['SN'])
sn_grouped.head()

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price,Age Count
0,0,Lisim78,20-24,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53,20-24
1,1,Lisovynya38,40+,Male,143,Frenzied Scimitar,1.56,40+
2,2,Ithergue48,20-24,Male,92,Final Critic,4.88,20-24
3,3,Chamassasya86,20-24,Male,100,Blindscythe,3.27,20-24
4,4,Iskosia90,20-24,Male,131,Fury,1.44,20-24
...,...,...,...,...,...,...,...,...
775,775,Aethedru70,20-24,Female,60,Wolf,3.54,20-24
776,776,Iral74,20-24,Male,164,Exiled Doomblade,1.63,20-24
777,777,Yathecal72,20-24,Male,67,"Celeste, Incarnation of the Corrupted",3.46,20-24
778,778,Sisur91,<10,Male,92,Final Critic,4.19,<10


In [57]:
print(sn_grouped["SN", "Price"].max())

                          SN  Price
SN                                 
Adairialis76    Adairialis76   2.28
Adastirin33      Adastirin33   4.48
Aeda94                Aeda94   4.91
Aela59                Aela59   4.32
Aelaria33          Aelaria33   1.79
...                      ...    ...
Yathecal82        Yathecal82   2.42
Yathedeu43        Yathedeu43   3.75
Yoishirrala98  Yoishirrala98   4.58
Zhisrisu83        Zhisrisu83   4.35
Zontibe81          Zontibe81   3.79

[576 rows x 2 columns]


  print(sn_grouped["SN", "Price"].max())


In [52]:
sn_count = sn_grouped["SN"].count()
sn_count

SN
Adairialis76     1
Adastirin33      1
Aeda94           1
Aela59           1
Aelaria33        1
                ..
Yathecal82       3
Yathedeu43       2
Yoishirrala98    1
Zhisrisu83       2
Zontibe81        3
Name: SN, Length: 576, dtype: int64

In [53]:
sn_price = sn_grouped["Price"].mean()
sn_price

SN
Adairialis76     2.280000
Adastirin33      4.480000
Aeda94           4.910000
Aela59           4.320000
Aelaria33        1.790000
                   ...   
Yathecal82       2.073333
Yathedeu43       3.010000
Yoishirrala98    4.580000
Zhisrisu83       3.945000
Zontibe81        2.676667
Name: Price, Length: 576, dtype: float64