In [19]:
import pandas as pd
import numpy as np
import warnings

from statsmodels.stats.proportion import proportion_confint
from scipy.stats import chi2_contingency, fisher_exact, binom_test, norm
from itertools import combinations

In [2]:
data = pd.read_csv('churn_analysis.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,state,account_length,area_code,intl_plan,vmail_plan,vmail_message,day_mins,day_calls,day_charge,...,night_mins,night_calls,night_charge,intl_mins,intl_calls,intl_charge,custserv_calls,treatment,mes_estim,churn
0,0,KS,128,415,no,yes,25,265.1,110,45.07,...,244.7,91,11.01,10.0,3,2.7,1,1,0.65,False.
1,1,OH,107,415,no,yes,26,161.6,123,27.47,...,254.4,103,11.45,13.7,3,3.7,1,0,0.55,False.
2,2,NJ,137,415,no,no,0,243.4,114,41.38,...,162.6,104,7.32,12.2,5,3.29,0,0,0.72,False.
3,3,OH,84,408,yes,no,0,299.4,71,50.9,...,196.9,89,8.86,6.6,7,1.78,2,1,0.28,False.
4,4,OK,75,415,yes,no,0,166.7,113,28.34,...,186.9,121,8.41,10.1,3,2.73,3,2,0.45,False.


In [3]:
filtered_data = data[data.treatment == 1]
subtable = pd.crosstab(filtered_data[['state']].values.T[0], [filtered_data[['churn']].values.T[0]])
pair_states = list(combinations(data.state.unique(), 2))

In [4]:
temp = [chi2_contingency(subtable.loc[[t[0], t[1]], :], correction=False)[1] for t in pair_states]
len([t for t in temp if t < 0.05])

34

In [7]:
temp = [chi2_contingency(subtable.loc[[t[0], t[1]], :], correction=True)[1] for t in pair_states]
len([t for t in temp if t < 0.05])

0

In [8]:
temp = [fisher_exact(subtable.loc[[t[0], t[1]], :])[1] for t in pair_states]
len([t for t in temp if t < 0.05])

10

In [16]:
data[['day_calls', 'mes_estim']].corr()['day_calls']['mes_estim']

-0.051794350587572605

In [17]:
data[['day_calls', 'mes_estim']].corr(method='spearman')['day_calls']['mes_estim']

0.043349880533927444

In [12]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):    
    z = norm.ppf(1 - alpha / 2.)
    
    p1 = float(sum(sample1)) / len(sample1)
    p2 = float(sum(sample2)) / len(sample2)
    
    left_boundary = (p1 - p2) - z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    right_boundary = (p1 - p2) + z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    
    return (left_boundary, right_boundary)

In [13]:
def proportions_diff_z_stat_ind(sample1, sample2):
    n1 = len(sample1)
    n2 = len(sample2)
    
    p1 = float(sum(sample1)) / n1
    p2 = float(sum(sample2)) / n2 
    P = float(p1*n1 + p2*n2) / (n1 + n2)
    
    return (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))

In [14]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")
    
    if alternative == 'two-sided':
        return 2 * (1 - norm.cdf(np.abs(z_stat)))
    
    if alternative == 'less':
        return norm.cdf(z_stat)

    if alternative == 'greater':
        return 1 - norm.cdf(z_stat)

In [9]:
treatment_0 = data[data['treatment'] == 0][data['churn'] == 'True.'].shape[0]
treatment_1 = data[data['treatment'] == 1][data['churn'] == 'True.'].shape[0]
treatment_2 = data[data['treatment'] == 2][data['churn'] == 'True.'].shape[0]

treatment_0_all = data[data['treatment'] == 0][data['churn'] == 'False.'].shape[0]
treatment_1_all = data[data['treatment'] == 1][data['churn'] == 'False.'].shape[0]
treatment_2_all = data[data['treatment'] == 2][data['churn'] == 'False.'].shape[0]

# treatment_0_data = [1] * treatment_0 + [0] * treatment_0_all
# treatment_1_data = [1] * treatment_1 + [0] * treatment_1_all
# treatment_2_data = [1] * treatment_2 + [0] * treatment_2_all

treatment_0_data = [0] * treatment_0 + [1] * treatment_0_all
treatment_1_data = [0] * treatment_1 + [1] * treatment_1_all
treatment_2_data = [0] * treatment_2 + [1] * treatment_2_all

  if __name__ == '__main__':
  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [15]:
print("95%% confidence interval for a difference between proportions: [%f, %f]" % 
      proportions_diff_confint_ind(treatment_0_data, treatment_1_data))
print("95%% confidence interval for a difference between proportions: [%f, %f]" % 
      proportions_diff_confint_ind(treatment_2_data, treatment_1_data))

95% confidence interval for a difference between proportions: [-0.011583, 0.048489]
95% confidence interval for a difference between proportions: [0.009619, 0.068322]


In [17]:
print("p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_ind(treatment_0_data, treatment_1_data)))
print("p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_ind(treatment_2_data, treatment_1_data)))
print("p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_ind(treatment_1_data, treatment_0_data)))
print("p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_ind(treatment_1_data, treatment_2_data)))

p-value: 0.228331
p-value: 0.009348
p-value: 0.228331
p-value: 0.009348


In [128]:
temp_data = data[data['treatment'] == 1][data['churn'] == 'True.'].groupby('state')['churn'].count().values #[['state', 'churn']].values
g, p, dof, expctd = chi2_contingency(temp_data)

np.sqrt(g / (np.sum(temp_data) * 2))



0.0

In [21]:
warnings.filterwarnings('ignore')

treatment_0 = data[data['treatment'] == 0][data['churn'] == 'True.'].shape[0]
treatment_1 = data[data['treatment'] == 1][data['churn'] == 'True.'].shape[0]
treatment_2 = data[data['treatment'] == 2][data['churn'] == 'True.'].shape[0]

treatment_0_all = data[data['treatment'] == 0].shape[0]
treatment_1_all = data[data['treatment'] == 1].shape[0]
treatment_2_all = data[data['treatment'] == 2].shape[0]

# print(proportion_confint(treatment_0, treatment_0_all, method = 'wilson'))
# print(proportion_confint(treatment_1, treatment_1_all, method = 'wilson'))
# print(proportion_confint(treatment_2, treatment_2_all, method = 'wilson'))

print(binom_test(treatment_0, treatment_1, 0.5, alternative = 'two-sided'))
print(binom_test(treatment_2, treatment_1, 0.5, alternative = 'two-sided'))

4.06084801562e-33
3.84417031076e-13
