In [22]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import chi2_contingency

# Codebook -- each column holds a yes/no answer; the statement behind each answer:
#   influence  yes = I have a natural talent for influencing people
#              no  = I am not good at influencing people
#   blend_in   yes = I prefer to blend in with the crowd
#              no  = I like to be the center of attention
#   special    yes = I think I am a special person
#              no  = I am no better or worse than most people
#   leader     yes = I see myself as a good leader
#              no  = I am not sure if I would make a good leader
#   authority  yes = I like to have authority over other people
#              no  = I don’t mind following orders

# Read the survey responses into a DataFrame.
npi = pd.read_csv(filepath_or_buffer='narcissistic_personality_inventory.csv')

# Preview the first five rows (five is head()'s default).
print(npi.head())


   index influence blend_in special leader authority
0      0        no      yes     yes    yes       yes
1      1        no      yes      no     no        no
2      2       yes       no     yes    yes       yes
3      3       yes       no      no    yes       yes
4      4       yes      yes      no    yes        no


### Is there an association between influence and leader?   

In [23]:
# Contingency table of raw counts: rows are the `influence` answers, columns are
# the `leader` answers, and each cell counts respondents giving that pair.
influence_leader_freq = pd.crosstab(index=npi['influence'], columns=npi['leader'])
print(influence_leader_freq)

leader       no   yes
influence            
no         1613   705
yes        1301  2380


In [24]:
# The previous table as joint proportions.
print(len(npi))
# Normalise by the table's own grand total rather than by len(npi). The two agree
# whenever every respondent answered both questions (as in the output shown here),
# but if any row were excluded from the crosstab -- presumably what happens to rows
# with a missing answer; confirm against the pandas docs -- dividing by len(npi)
# would yield proportions that no longer sum to 1.
influence_leader_prop = influence_leader_freq / influence_leader_freq.to_numpy().sum()
print(influence_leader_prop)

5999
leader           no       yes
influence                    
no         0.268878  0.117520
yes        0.216869  0.396733


## Marginal proportions

Marginal Proportion -> The proportion of respondents in each category of a single question.

In [25]:
# Summing down the rows collapses `influence`, leaving the marginal
# proportions of the `leader` answers.
leader_marginals = influence_leader_prop.sum(axis='index')
print(leader_marginals)

print()

# Summing across the columns collapses `leader`, leaving the marginal
# proportions of the `influence` answers:
#   no:  0.268878 + 0.117520 = 0.386398
#   yes: 0.216869 + 0.396733 = 0.613602
influence_marginals = influence_leader_prop.sum(axis='columns')
print(influence_marginals)

# Summary
# Respondents are roughly evenly split on whether they see themselves as a leader,
# but more of them believe they have a talent for influencing people than not.

leader
no     0.485748
yes    0.514252
dtype: float64

influence
no     0.386398
yes    0.613602
dtype: float64


### Expected Contingency Tables

This one's complicated. We use the marginal proportions to create a contingency table of the *expected proportions* if there were *no association* between these variables.

This is how it's explained at Codecademy.

<img src="05c_contingency_table.png" height="500" />

In [26]:
# chi2_contingency returns the test statistic, the p-value, the degrees of freedom
# and the table of counts expected under no association; this cell shows the latter.
chi2, pval, dof, expected = chi2_contingency(observed=influence_leader_freq)
print(expected.round())
# Codecademy's own output matched its screenshot:
#   [[2087, 2221]
#    [3288, 3501]]
# ...but our results are different, probably because our dataset differs from theirs.

# From Codecademy:
# The more that the expected and observed tables differ, the more sure we can be that the variables are associated. In this example, we see some pretty big differences (eg., 3015 in the observed table compared to 2087 in the expected table). This provides additional evidence that these variables are associated.
#
# The same holds for our tables: e.g. 1613 observed vs. 1126 expected for
# influence = no / leader = no.

[[1126. 1192.]
 [1788. 1893.]]


### Chi-Square Statistic

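$$\chi^2 = \sum \frac{(\text{observed} - \text{expected})^2}{\text{expected}}$$

The sum runs over every cell of the contingency table. Note that for a 2×2 table `chi2_contingency` applies Yates' continuity correction by default, so the statistic it reports is slightly smaller than the value this formula gives.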

In [21]:
# The first value returned by chi2_contingency is the chi-square statistic.
# NOTE: with its default `correction=True`, scipy applies Yates' continuity
# correction when the table has one degree of freedom (as a 2x2 table does), so the
# value printed here is slightly smaller than the plain sum of (O - E)^2 / E.
chi2, pval, dof, expected = chi2_contingency(observed=influence_leader_freq)
print(chi2)

666.2609777674471
