In [1]:
import pandas as pd
# Path of the purchase CSV, relative to the notebook's working directory;
# it is passed to pd.read_csv when the data is loaded.
file_to_load = "purchase_data.csv"

In [2]:
# Read the purchase records into a DataFrame, one row per purchase.
purchase_data = pd.read_csv(file_to_load)
# A bare name on the last line of a cell renders the frame as the cell output.
purchase_data

Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44
...,...,...,...,...,...,...,...
775,775,Aethedru70,21,Female,60,Wolf,3.54
776,776,Iral74,21,Male,164,Exiled Doomblade,1.63
777,777,Yathecal72,20,Male,67,"Celeste, Incarnation of the Corrupted",3.46
778,778,Sisur91,7,Male,92,Final Critic,4.19


In [3]:
#re-index the purchases by their "Purchase ID" column; set_index returns a new
#frame, so purchase_data itself is left untouched
index_purchase_data = purchase_data.set_index("Purchase ID")

#purchases per screen name (SN); value_counts yields one entry per distinct SN,
#so its length is the number of unique players
player_count = index_purchase_data["SN"].value_counts()
amount = len(player_count)

#single-row dataframe that displays the unique player total
player_count_df = pd.DataFrame({"Player_Count":amount},index=[0])
player_count_df

Unnamed: 0,Player_Count
0,576


In [4]:
#headline purchasing figures computed over every purchase row

#purchases per item name; one entry per distinct name, so the length of this
#Series is the number of unique items
unique_item_name = index_purchase_data["Item Name"].value_counts()
amount_item = len(unique_item_name)

#average price and total revenue both come from the same Price column
price_column = index_purchase_data["Price"]
average_price = price_column.mean()
sum_of_revenue = price_column.sum()

#number of purchases = number of non-null "Item Name" entries
number_of_purchases = index_purchase_data["Item Name"].count()

#one record holding the four summary figures, shown as a single row (index 0)
summary_record = {
    "Number_of_Unique_Items": amount_item,
    "Average_Price": average_price,
    "Number_of_Purchases": number_of_purchases,
    "Total_Revenue": sum_of_revenue,
}
purchasing_analysis_total_df = pd.DataFrame(summary_record, index=[0])

#render the two money columns as dollar strings with two decimals
as_dollars = "${:.2f}".format
for money_column in ("Average_Price", "Total_Revenue"):
    purchasing_analysis_total_df[money_column] = purchasing_analysis_total_df[money_column].map(as_dollars)
purchasing_analysis_total_df

Unnamed: 0,Number_of_Unique_Items,Average_Price,Number_of_Purchases,Total_Revenue
0,179,$3.05,780,$2379.77


In [116]:
#gender demographics
#distinct Gender labels, kept so misspellings or unexpected variants can be
#inspected (evaluate `unique` in its own cell to see them)
unique = index_purchase_data["Gender"].unique()

#purchase rows per gender -- this counts purchases, not players
gender_count = index_purchase_data["Gender"].value_counts()

#some players bought more than once, so purchase rows outnumber players;
#collect the distinct screen names (SN) per gender before counting
user = index_purchase_data.groupby('Gender')['SN'].unique()

#look each group up by its label: positional access such as user[0] on a
#string-labelled Series is deprecated in recent pandas and also depends on the
#alphabetical order in which groupby happens to emit the groups
female = len(user['Female'])
male = len(user['Male'])
other = len(user['Other / Non-Disclosed'])

#share of all unique players (`amount` is the unique player total computed
#in the player-count cell)
female_percent = (female/amount)*100
male_percent = (male/amount)*100
other_percent = (other/amount)*100

#rows are listed Male, Female, Other to match the index labels below
data = [[male],[female],[other]]
gender_demographic_df = pd.DataFrame(data, columns = ['Total_Count'], index = ['Male', 'Female', 'Other / Non-Disclosed'])
gender_demographic_df['Percentage_of_Players'] = gender_demographic_df['Total_Count'] / amount * 100

#format the percentage column as text with two decimals and a % sign
gender_demographic_df["Percentage_of_Players"] = gender_demographic_df["Percentage_of_Players"].map("{:.2f}%".format)
gender_demographic_df


Unnamed: 0,Total_Count,Percentage_of_Players
Male,484,84.03%
Female,81,14.06%
Other / Non-Disclosed,11,1.91%


In [27]:
#purchasing analysis by gender
#group once and reuse the grouping for every per-gender aggregate
purchases_by_gender = index_purchase_data.groupby('Gender')

purchase_count = purchases_by_gender['Item Name'].count()
average_purchase_price = purchases_by_gender['Price'].mean()
total_purchase_value = purchases_by_gender['Price'].sum()

#distinct players (SN) per gender, computed here so this cell does not rely on
#counts left behind by another cell; nunique ignores missing SN values
players_per_gender = purchases_by_gender['SN'].nunique()

#both Series are indexed by Gender, so the division lines up by label instead
#of by an assumed Female/Male/Other position
average_total_per_person_by_gender = total_purchase_value / players_per_gender

#label lookups replace positional access such as total_purchase_value[0],
#which is deprecated in recent pandas for a string-labelled Series
female_average_total_per_person = average_total_per_person_by_gender['Female']
male_average_total_per_person = average_total_per_person_by_gender['Male']
other_average_total_per_person = average_total_per_person_by_gender['Other / Non-Disclosed']

average_total_per_person = [female_average_total_per_person, male_average_total_per_person, other_average_total_per_person]

#every column is a Gender-indexed Series, so rows are matched by label
gender_purchase_analysis_df = pd.DataFrame({'Purchase_Count': purchase_count,
                                            'Average_Purchase_Price': average_purchase_price,
                                            'Total_Purchase_Value': total_purchase_value,
                                            'Average_Total_Purchase_Per_Person': average_total_per_person_by_gender})

#formatting of columns: dollar strings with two decimals (thousands separator
#on the total)
gender_purchase_analysis_df['Average_Purchase_Price'] = gender_purchase_analysis_df['Average_Purchase_Price'].map("${:.2f}".format)
gender_purchase_analysis_df['Total_Purchase_Value'] = gender_purchase_analysis_df['Total_Purchase_Value'].map("${:,.2f}".format)
gender_purchase_analysis_df['Average_Total_Purchase_Per_Person'] = gender_purchase_analysis_df['Average_Total_Purchase_Per_Person'].map("${:.2f}".format)

gender_purchase_analysis_df

Unnamed: 0_level_0,Purchase_Count,Average_Purchase_Price,Total_Purchase_Value,Average_Total_Purchase_Per_Person
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,113,$3.20,$361.94,$4.47
Male,652,$3.02,"$1,967.64",$4.07
Other / Non-Disclosed,15,$3.35,$50.19,$4.56


In [114]:
#age demographics
#pd.cut intervals are right-inclusive, so the edges give (0, 9.9], (9.9, 14],
#... (39, 100]; include_lowest=True also admits an age of exactly 0.
#Ages outside 0-100 (or missing) get no bin and drop out of the counts below.
bins = [0,9.9,14,19,24,29.9,34.9,39,100]
group_names = ['<10','10-14','15-19','20-24','25-29','30-34','35-39','40+']
index_purchase_data["Age Demographics"] = pd.cut(index_purchase_data["Age"], bins, labels=group_names, include_lowest=True)

#some players purchased more than once, so collect the distinct screen names
#(SN) per age bin before counting; observed=True keeps only bins that have rows
#and makes the categorical-groupby behaviour explicit across pandas versions
total_counts_age = index_purchase_data.groupby('Age Demographics', observed=True)['SN'].unique()

#look each bin up by its label (positional access such as total_counts_age[0]
#is deprecated for a labelled Series); a bin with no players counts as 0
total_count_age_list = [len(total_counts_age.get(label, [])) for label in group_names]

#per-bin counts under their individual names, in group_names order
age_0, age_1, age_2, age_3, age_4, age_5, age_6, age_7 = total_count_age_list

age_demographic_df = pd.DataFrame({'Total_Count': total_count_age_list}, index = group_names)
#share of all unique players (`amount` is the unique player total computed
#in the player-count cell)
age_demographic_df['Percentage_of_Players'] = age_demographic_df['Total_Count'] / amount * 100

#format the percentage column as text with two decimals and a % sign
age_demographic_df['Percentage_of_Players'] = age_demographic_df['Percentage_of_Players'].map("{:.2f}%".format)
age_demographic_df

Unnamed: 0,Total_Count,Percentage_of_Players
<10,17,2.95%
10-14,22,3.82%
15-19,107,18.58%
20-24,258,44.79%
25-29,77,13.37%
30-34,52,9.03%
35-39,31,5.38%
40+,12,2.08%
