In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
plt.ioff()

import seaborn as sns
sns.set()

import numpy as np
import math

import analysis_tools as tools

# Data Preparation

## Initialization of Values

In [2]:
M_cnt = 13
F_cnt = 10
q_cnt = 10

### Results Gathered

In [3]:
M_D = np.array([[0, 2, 5, 4, 2],
                [1, 2, 4, 3, 3],
                [0, 2, 5, 6, 0],
                [0, 2, 6, 3, 2],
                [0, 3, 2, 3, 5],
                [0, 4, 6, 1, 2],
                [2, 4, 5, 1, 1],
                [1, 4, 3, 3, 2],
                [0, 4, 3, 4, 2],
                [1, 5, 7, 0, 0]])

In [4]:
F_D = np.array([[0, 0, 9, 1, 0],
                [1, 3, 5, 1, 0],
                [0, 0, 4, 3, 3],
                [0, 3, 3, 4, 0],
                [1, 3, 4, 1, 1],
                [0, 8, 0, 1, 1],
                [1, 5, 2, 2, 0],
                [1, 1, 5, 1, 2],
                [0, 3, 3, 2, 2],
                [0, 2, 4, 4, 0]])

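#### Sha's Results

Note: the cells below reassign `M_cnt`, `F_cnt`, `q_cnt`, `M_D` and `F_D`, so everything from "Preprocessing of the Results" onward is computed from Sha's results only.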

In [5]:
M_cnt = 14
F_cnt = 11
q_cnt = 10

In [6]:
M_D = np.array([[1, 3, 7, 3, 0],
                [4, 6, 2, 1, 1],
                [0, 2, 5, 6, 1],
                [2, 2, 2, 7, 1],
                [0, 1, 4, 4, 5],
                [0, 8, 4, 1, 1],
                [1, 6, 2, 3, 2],
                [3, 2, 3, 2, 4],
                [2, 1, 2, 5, 4],
                [4, 3, 4, 2, 1]])

In [7]:
F_D = np.array([[0, 2, 4, 2, 3],
                [2, 1, 4, 3, 1],
                [1, 4, 2, 2, 2],
                [0, 1, 6, 2, 2],
                [0, 2, 3, 3, 3],
                [0, 4, 6, 1, 0],
                [0, 4, 4, 3, 0],
                [0, 1, 5, 3, 2],
                [0, 1, 3, 5, 2],
                [1, 2, 5, 3, 0]])

### Preprocessing of the Results

In [8]:
def expand_dist(X):
    """Expand per-question answer counts into individual coded responses.

    Each row of ``X`` holds, for one question, how many respondents picked
    each answer category: column ``k`` is the count for category code ``k``.
    The row is expanded into a list in which code ``k`` appears ``row[k]``
    times, in ascending code order.

    The number of categories is taken from the row itself, so tables with
    more or fewer than five answer options are handled as well.

    Parameters
    ----------
    X : 2-D sequence of non-negative ints
        Count table, one row per question (e.g. ``M_D`` or ``F_D``).

    Returns
    -------
    numpy.ndarray
        One row of coded responses per row of ``X``. The result is a regular
        2-D array when every row of ``X`` has the same total.
    """
    return np.array([[code
                      for code, count in enumerate(row)
                      for _ in range(count)]
                     for row in X])

In [9]:
M = expand_dist(M_D)
M

array([[0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3],
       [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4],
       [1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
       [0, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4],
       [1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4],
       [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 4],
       [0, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4],
       [0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4],
       [0, 0, 1, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4],
       [0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4]])

In [10]:
F = expand_dist(F_D)
F

array([[1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4],
       [0, 0, 1, 2, 2, 2, 2, 3, 3, 3, 4],
       [0, 1, 1, 1, 1, 2, 2, 3, 3, 4, 4],
       [1, 2, 2, 2, 2, 2, 2, 3, 3, 4, 4],
       [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
       [1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3],
       [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3],
       [1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4],
       [1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4],
       [0, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3]])

## Checking the Validity of the Arrays

In [11]:
def check_array(X, X_D, cnt):
    """Check that an expanded response array agrees with its count table.

    Every row of the count table ``X_D`` must sum to ``cnt`` and every row of
    the expanded array ``X`` must contain exactly ``cnt`` entries. All rows
    are inspected, however many there are, and the two inputs must have the
    same number of rows.

    Parameters
    ----------
    X : sequence of sequences
        Expanded responses, one row per question.
    X_D : sequence of sequences of ints
        Count table the expansion was built from.
    cnt : int
        Expected number of respondents per question.

    Returns
    -------
    bool
        ``True`` when every row passes both checks, ``False`` otherwise.
    """
    # zip() would silently stop at the shorter input, so compare lengths first.
    if len(X) != len(X_D):
        return False
    return all(sum(counts) == cnt and len(row) == cnt
               for row, counts in zip(X, X_D))

In [12]:
check_array(M, M_D, M_cnt)

True

In [13]:
check_array(F, F_D, F_cnt)

True

# Data Analysis

## The values themselves

### Means

In [14]:
M.mean()

2.0428571428571427

In [15]:
F.mean()

2.2454545454545456

### Standard Deviation

In [16]:
np.std(M)

1.2413784306823061

In [17]:
np.std(F)

1.0373599676483276

### Equations

Males: $ 2.18 \pm 1.08 $

Females: $ 2.02 \pm 1.00 $

#### Sha's

Males: $ 2.04 \pm 1.24 $

Females: $ 2.25 \pm 1.04 $

## Distribution of the Values

In [18]:
def plot_dist(X, ax, hist=True, rug=False, kde=True, shade=True, c="b", label=None, save_file=None):
    """Draw the distribution of coded answers ``X`` on ``ax``.

    The data are plotted with ``sns.distplot`` using unit-wide bins over
    0..5, the x axis is labelled with the answer wording ("always" ..
    "never"), the y ticks are hidden, and the figure can optionally be
    written to disk.

    Parameters
    ----------
    X : array-like
        Coded responses to plot.
    ax : matplotlib Axes
        Axes to draw on; all layout and saving is applied to this axes'
        own figure.
    hist, rug, kde : bool
        Passed straight to ``sns.distplot`` to toggle each layer.
    shade : bool
        Whether the KDE curve is filled (forwarded through ``kde_kws``).
    c : str
        Colour used for both the histogram bars and the curve.
    label : str or None
        Legend label forwarded to ``sns.distplot``.
    save_file : str or None
        When given, the figure is saved as ``"<save_file>.png"`` with a
        transparent background.
    """
    # NOTE(review): sns.distplot is deprecated in newer seaborn releases;
    # kept here so the rendered plots stay exactly as before.
    ax = sns.distplot(X,
                      bins=[0, 1, 2, 3, 4, 5],
                      color=c,
                      hist=hist,
                      rug=rug,
                      kde=kde,
                      hist_kws={"align":"left", "rwidth":1.0, "color":c},
                      kde_kws={"shade":shade, "bw":0.5},
                      ax=ax,
                      label=label,
                      norm_hist=True)
    # NOTE(review): the middle tick sits at 1.80 rather than 2 - presumably a
    # manual nudge of the "sometimes" label; confirm it is intentional.
    ax.set_xticks([0, 1, 1.80, 3, 4])
    ax.set_xticklabels(["always", "often", "sometimes", "rarely", "never"])
    ax.set_yticks([])

    # Work on the figure that owns `ax` instead of pyplot's "current" figure,
    # so the function behaves correctly even when `ax` is not the active axes.
    fig = ax.figure
    fig.tight_layout()
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(30)

    ax.set_xlim(-1, 5)

    if save_file is not None:
        fig.savefig("%s.png" % save_file, transparent=True)
        # Mirror plt.savefig, which schedules a redraw after saving.
        fig.canvas.draw_idle()

### Individual Habit

#### Plots

In [19]:
# One small figure per habit: male (blue) and female (red) KDE curves drawn on
# the same axes. The second plot_dist call also saves the finished figure.
# NOTE(review): assumes the plots/english/ directory already exists.
for lh in range(10):
    fig, ax = plt.subplots(1, 1, figsize=(3, 3))
    habit_stem = 'plots/english/males_v_females_habit_{}'.format(lh+1)
    curve_opts = dict(hist=False, kde=True, shade=False)
    plot_dist(M[lh], ax, c='b', label='males', **curve_opts)
    plot_dist(F[lh], ax, c='r', label='females', save_file=habit_stem, **curve_opts)
    plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [20]:
M.mean(axis=1)

array([ 1.85714286,  1.21428571,  2.42857143,  2.21428571,  2.92857143,
        1.64285714,  1.92857143,  2.14285714,  2.57142857,  1.5       ])

In [21]:
F.mean(axis=1)

array([ 2.54545455,  2.        ,  2.        ,  2.45454545,  2.63636364,
        1.72727273,  1.90909091,  2.54545455,  2.72727273,  1.90909091])

### Overall Distribution

In [22]:
M_D_S = M_D.T.sum(axis=1)
M_D_S

array([17, 34, 35, 34, 20])

In [23]:
F_D_S = F_D.T.sum(axis=1)
F_D_S

array([ 4, 22, 42, 27, 15])

#### Stats

In [24]:
M_D_S_stats = tools.get_stats(M_D_S)
M_D_S_stats

ArrayStats(min=17, ave=28.0, max=35, range=18)

In [25]:
F_D_S_stats = tools.get_stats(F_D_S)
F_D_S_stats

ArrayStats(min=4, ave=22.0, max=42, range=38)

#### Plot

In [26]:
# Overall distribution: all male answers vs. all female answers on one axes.
fig, ax = plt.subplots(1, 1, figsize=(3, 3))
plot_dist(M.flatten(), ax, hist=False, shade=False, c='b', label='males')
plot_dist(F.flatten(), ax, hist=False, shade=False, c='r', label='females', save_file='plots/english/males_v_females_overall')

# Reference curve: 1000 standard-normal draws shifted by 2.12.
# A locally seeded generator keeps the saved PNG identical across
# Restart & Run All without touching the global NumPy random state.
# NOTE(review): 2.12 looks like an approximate pooled mean - confirm its origin.
normal_ref = np.random.RandomState(0).randn(1000) + 2.12
plot_dist(normal_ref, ax, hist=False, shade=False, c='g', label='normal dist', save_file='plots/english/males_v_females_overall_w_random')
plt.show()

<IPython.core.display.Javascript object>

## Z-Test

claim: Males and Females have equally effective listening habits

H0 (claim): Males and Females have equally effective listening habits

H1: Males and Females have unequally effective listening habits

$\alpha$ = 0.10

critical value = $\pm 1.65$ (two-tailed)

In [27]:
def calc_z_value(X0, X1):
    """Return the two-sample z statistic for the difference of means.

    Computes ``(mean(X0) - mean(X1)) / sqrt(var(X0)/n0 + var(X1)/n1)``, where
    each variance is the square of ``np.std`` with its default settings and
    ``n`` is the array's ``size`` (total element count, so 2-D inputs are
    treated as one pooled sample).

    Parameters
    ----------
    X0, X1 : numpy.ndarray
        The two samples to compare.

    Returns
    -------
    float
        Negative when ``X0`` has the smaller mean, positive when larger.
    """
    mean_gap = X0.mean() - X1.mean()
    sq_err_0 = np.std(X0)**2/X0.size
    sq_err_1 = np.std(X1)**2/X1.size
    return mean_gap / math.sqrt(sq_err_0 + sq_err_1)

In [28]:
z = calc_z_value(M, F)
z

-1.4050926805036306

Do not reject the null hypothesis, since $-1.65 < -1.41 < 1.65$

In [29]:
# Critical value of the two-tailed z-test at alpha = 0.10, as stated in the
# Z-Test section; named once so both comparisons stay in sync.
Z_CRITICAL = 1.65

# Run the z-test habit by habit. Iterating over the paired rows means the loop
# follows however many habits M and F contain, not a fixed count.
for i, (m_row, f_row) in enumerate(zip(M, F)):
    z_i = calc_z_value(m_row, f_row)
    if z_i < -Z_CRITICAL:
        # Male mean score is significantly lower than the female mean.
        res = "Females are better"
    elif z_i > Z_CRITICAL:
        # Male mean score is significantly higher than the female mean.
        res = "Males are better"
    else:
        res = "Equally effective"
    print("Habit #{}: \"{}\" at z_value of {} ".format(i+1, res, round(z_i, 2)))

Habit #1: "Females are better" at z_value of -1.75 
Habit #2: "Females are better" at z_value of -1.65 
Habit #3: "Equally effective" at z_value of 0.97 
Habit #4: "Equally effective" at z_value of -0.57 
Habit #5: "Equally effective" at z_value of 0.71 
Habit #6: "Equally effective" at z_value of -0.28 
Habit #7: "Equally effective" at z_value of 0.05 
Habit #8: "Equally effective" at z_value of -0.83 
Habit #9: "Equally effective" at z_value of -0.35 
Habit #10: "Equally effective" at z_value of -0.96 
