In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy as sc
import matplotlib.pyplot as plt 
import statistics as st
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.proportion import proportion_confint 

# Exploratory Data Analysis

In [2]:
original_df = pd.read_excel("/Users/mike/Library/Mobile Documents/com~apple~CloudDocs/My Files (iCloud)/Working/Working Directory/Georgian College/2nd Semester/MRP/Data/Import to Python/Original Postal Codes Data.xlsx", header = 0)

In [3]:
original_df

Unnamed: 0,Camp Kitchi,Day Camp,Child Care,HFA,Philanthropy
0,L4N 4M3,P2A 2R7,L9Y 0C5,3056,L0L 1T0
1,M4J 4V3,L0L 1T0,L4M 2H9,3206,L0L 1W0
2,P0E 1N0,P2A 2X4,L4M 6R8,14216,L9Z 2T2
3,L4R 1X9,L4R 4Y4,L0G 1A0,14513,L9Y 2N2
4,L9W 0A5,L3Z 3C1,L9Y 5L2,15090,L4M 6C5
...,...,...,...,...,...
12555,,,,M6E 3N4,
12556,,,,M6M 5E2,
12557,,,,M9N 1R1,
12558,,,,MC6 3W4,


In [4]:
original_df.describe()

Unnamed: 0,Camp Kitchi,Day Camp,Child Care,HFA,Philanthropy
count,510,560,1094,12559,6330
unique,510,560,1094,12559,6330
top,L0G 1A0,L9Z 2L3,L4M 2Z2,P1P 1K3,P1P 1K3
freq,1,1,1,1,1


In [5]:
market_basket_df = pd.read_excel("/Users/mike/Library/Mobile Documents/com~apple~CloudDocs/My Files (iCloud)/Working/Working Directory/Georgian College/2nd Semester/MRP/Data/Import to Python/Market Basket Analysis Data.xlsx", header = 0)

In [6]:
market_basket_df

Unnamed: 0,Postal Code,Camp Kitchi,Day Camp,Child Care,HFA,Philanthropy,Total Product Used
0,P0E 1N0,1,1,1,1,1,5
1,L9Z 1M1,1,1,1,1,1,5
2,L9Y 5H2,1,1,1,1,1,5
3,L9Y 0Y9,1,1,1,1,1,5
4,L9Y 0E5,1,1,1,1,1,5
...,...,...,...,...,...,...,...
2157,14513,0,0,0,1,0,1
2158,14216,0,0,0,1,0,1
2159,1328KH,0,0,0,1,0,1
2160,11768,0,0,0,0,1,1


## Descriptive Statistics

#### 1. Proportion of clients in different channels who are also donors

In [7]:
### Define count function to get number of clients who also donors
def count_matching(product):
    output = market_basket_df[(market_basket_df[product] == 1) & (market_basket_df['Philanthropy'] == 1)]['Postal Code'].count()
    return output

In [8]:
camp_kitchi_clients_also_donors = round((count_matching('Camp Kitchi') / original_df['Camp Kitchi'].count() * 100), 2)
camp_kitchi_clients_also_donors

34.31

In [9]:
day_camp_clients_also_donors = round((count_matching('Day Camp') / original_df['Day Camp'].count() * 100), 2)
day_camp_clients_also_donors

53.93

In [10]:
child_care_clients_also_donors = round((count_matching('Child Care') / original_df['Child Care'].count() * 100), 2)
child_care_clients_also_donors

37.02

In [11]:
hfa_clients_also_donors = round((count_matching('HFA') / original_df['HFA'].count() * 100), 2)
hfa_clients_also_donors

7.33

Observation: 
- Notice that the the biggest proportion of clients who also are donors is from Day Camp (53.93%)
- The second biggest proportion is from Child Care (37.02%)
#### --> Conduct Inferential Analysis to verify if this difference is significant

## Inferential Statistics

### 1. Hypothesis Testing
- Null Hypothesis: P(Child Care Donors) = P(Day Camp Donors)
- Alternative Hypothesis: P(Child Care Donors) # P(Day Camp Donors)
- Significance Level alpha = 0.05

In [12]:
number_of_successes = np.array([child_care_clients_also_donors, day_camp_clients_also_donors])
total_sample_sizes = np.array([(original_df['Child Care'].count()), (original_df['Day Camp'].count())])
(test_stat, p_value) = proportions_ztest(number_of_successes,total_sample_sizes, alternative = 'two-sided')
print("The computed z-statistics = ", test_stat)
print("p-value = ", p_value)

The computed z-statistics =  -5.273701604976851
p-value =  1.336992298414438e-07


#### p-value << Significance Level --> Reject Null Hypothesis: P(Child Care Donors) # P(Day Camp Donor)

#### Conduct Right-tailed test to check if P(Child Care Donor) < P(Day Camp Donors)
- Null Hypothesis: P(Child Care Donors) >= P(Day Camp Donors)
- Alternative Hypothesis: P(Child Care Donors) < P(Day Camp Donors)
- Significance Level alpha = 0.05

In [13]:
number_of_successes = np.array([child_care_clients_also_donors, day_camp_clients_also_donors])
total_sample_sizes = np.array([(original_df['Child Care'].count()), (original_df['Day Camp'].count())])
(test_stat, p_value) = proportions_ztest(number_of_successes,total_sample_sizes, alternative = 'smaller')
print("The computed z-statistics = ", test_stat)
print("p-value = ", p_value)

The computed z-statistics =  -5.273701604976851
p-value =  6.68496149207219e-08


#### p-value << Significance Level --> Reject Null Hypothesis --> P(Child Care Donors) < P(Day Camp Donor)

### 2. Estimation

#### Calculate Confidence Interval of the true Proportion of Day Camp customers who are also donors

In [14]:
proportion_confint(count = count_matching('Day Camp'),    # Number of successes
                   nobs = original_df['Day Camp'].count(),    # Number of trials
                   alpha = (1 - 0.95))

(0.4980019418973268, 0.5805694866741017)

#### Confidence Interval (True Proportion of Day Camp customers who are also donors) = 49.8% - 58% with Confidence Level of 95%

## Recommendations

- YMCA should spend resources in converting Day Camp customers into Donors since this channel has the biggest conversion rate
- YMCA has roughly 50% - 58% chance to convert a Day Camp customer into a Donor