# Examining Relationships Between Categorical Variables

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from scipy.stats import chi2_contingency

# Load the survey responses from data.csv and preview the first five rows.
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,score,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,...,Q34,Q35,Q36,Q37,Q38,Q39,Q40,elapse,gender,age
0,18,2,2,2,2,1,2,1,2,2,...,1,1,2,2,2,1,2,211,1,50
1,6,2,2,2,1,2,2,1,2,1,...,2,1,2,2,2,2,1,149,1,40
2,27,1,2,2,1,2,1,2,1,2,...,1,2,1,1,2,1,2,168,1,28
3,29,1,1,2,2,2,1,2,1,1,...,1,2,1,2,2,1,1,230,1,37
4,6,1,2,1,1,1,2,1,2,1,...,2,1,2,2,2,0,1,389,1,50


The dataset contains 40 questions, so it helps to consult its documentation and narrow the analysis to a handful of them. The four questions examined here are:

Q1: Influencing people (1 = yes, 2 = no)\
Q2: Modesty (1 = no, 2 = yes)\
Q7: Blend in (1 = yes, 2 = no)\
Q12: Authority (1 = yes, 2 = no)

In [47]:
# List every column name to see which questions and respondent fields are available.
df.columns

Index(['score', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10',
       'Q11', 'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20',
       'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26', 'Q27', 'Q28', 'Q29', 'Q30',
       'Q31', 'Q32', 'Q33', 'Q34', 'Q35', 'Q36', 'Q37', 'Q38', 'Q39', 'Q40',
       'elapse', 'gender', 'age'],
      dtype='object')

In [48]:
# Keep only the four questionnaire items described above.
selected_questions = ['Q1', 'Q2', 'Q7', 'Q12']
npi = df.loc[:, selected_questions]
npi.head()

Unnamed: 0,Q1,Q2,Q7,Q12
0,2,2,1,1
1,2,2,1,2
2,1,2,2,1
3,1,1,2,1
4,1,2,1,2


In [49]:
# Give the selected items readable names instead of question numbers.
column_mapper = {
    'Q1': 'influence',
    'Q2': 'modesty',
    'Q7': 'blend_in',
    'Q12': 'authority',
}
npi = npi.rename(columns = column_mapper)
npi.head()

Unnamed: 0,influence,modesty,blend_in,authority
0,2,2,1,1
1,2,2,1,2
2,1,2,2,1
3,1,1,2,1
4,1,2,1,2


In [50]:
# Check the column dtypes and non-null counts before recoding.
npi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11243 entries, 0 to 11242
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   influence  11243 non-null  int64
 1   modesty    11243 non-null  int64
 2   blend_in   11243 non-null  int64
 3   authority  11243 non-null  int64
dtypes: int64(4)
memory usage: 351.5 KB


In [51]:
# Convert the numeric codes to strings so they can be relabelled as text answers.
npi = npi.astype('str')
npi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11243 entries, 0 to 11242
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   influence  11243 non-null  object
 1   modesty    11243 non-null  object
 2   blend_in   11243 non-null  object
 3   authority  11243 non-null  object
dtypes: object(4)
memory usage: 351.5+ KB


In [52]:
# Translate the string codes into yes/no labels, one column at a time.
# 'modesty' is coded the other way round from the other three items;
# any code not listed in a mapping (e.g. '0') is left as it is.
agree_first = {'1': 'yes', '2': 'no'}
disagree_first = {'1': 'no', '2': 'yes'}
answer_labels = {
    'influence': agree_first,
    'modesty': disagree_first,
    'blend_in': agree_first,
    'authority': agree_first,
}
for column, labels in answer_labels.items():
    npi[column] = npi[column].replace(labels)
npi.head()

Unnamed: 0,influence,modesty,blend_in,authority
0,no,yes,yes,yes
1,no,yes,yes,no
2,yes,yes,no,yes
3,yes,no,no,yes
4,yes,yes,yes,no


In [53]:
# Count each answer to 'influence' to spot codes other than the documented 1/2.
npi.influence.value_counts()

yes    6872
no     4354
0        17
Name: influence, dtype: int64

In [54]:
# Drop every respondent with a '0' code in any of the four items.
has_valid_answers = npi.ne('0').all(axis = 1)
npi = npi[has_valid_answers]

### Let's see if 'Modesty' and 'Blending In' are associated.

In [55]:
# Cross-tabulate the observed counts: modesty in rows, blend_in in columns.
modesty_blend_freq = pd.crosstab(index = npi['modesty'], columns = npi['blend_in'])
print(modesty_blend_freq)

blend_in    no   yes
modesty             
no        1301  1065
yes       2107  6672


In [56]:
# Turn the observed counts into proportions of all remaining respondents.
n_respondents = len(npi)
modesty_blend_prop = modesty_blend_freq.div(n_respondents)
print(modesty_blend_prop)

blend_in        no       yes
modesty                     
no        0.116734  0.095559
yes       0.189053  0.598654


In [60]:
# Marginal proportions: summing down the rows gives the blend_in marginals,
# summing across the columns gives the modesty marginals.
blend_marginals = modesty_blend_prop.sum(axis = 'index')
modesty_marginals = modesty_blend_prop.sum(axis = 'columns')
print(blend_marginals, modesty_marginals, sep = '\n\n')

blend_in
no     0.305787
yes    0.694213
dtype: float64

modesty
no     0.212293
yes    0.787707
dtype: float64


In [57]:
# Chi-squared test on the observed modesty / blend_in counts; 'expected' is
# the table of expected frequencies returned by the test.
test_result = chi2_contingency(modesty_blend_freq)
chi2, pval, dof, expected = test_result
print(expected.round())

[[ 723. 1643.]
 [2685. 6094.]]


In [58]:
# Chi-squared statistic from the test run on modesty_blend_freq above.
print(chi2)

841.5319523703167


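For a 2×2 table the test has 1 degree of freedom, where the critical value at the 0.05 significance level is about 3.84. A Chi-Squared statistic of about 841.5 is far above that, which strongly suggests an association between the 'Modesty' and 'Blending In' attributes.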

### Now let's see if 'Blending In' and 'Authority' are associated.

In [62]:
# Cross-tabulate the observed counts: blend_in in rows, authority in columns.
blend_authority_freq = pd.crosstab(index = npi['blend_in'], columns = npi['authority'])
print(blend_authority_freq)

authority    no   yes
blend_in             
no         1310  2098
yes        5015  2722


In [64]:
# Turn the observed counts into proportions of all remaining respondents.
n_respondents = len(npi)
blend_authority_prop = blend_authority_freq.div(n_respondents)
print(blend_authority_prop)

authority        no       yes
blend_in                     
no         0.117541  0.188246
yes        0.449978  0.244235


In [66]:
# Marginal proportions: summing down the rows gives the authority marginals,
# summing across the columns gives the blend_in marginals.
authority_marginals = blend_authority_prop.sum(axis = 'index')
blend_marginals = blend_authority_prop.sum(axis = 'columns')
print(authority_marginals, blend_marginals, sep = '\n\n')

authority
no     0.567519
yes    0.432481
dtype: float64

blend_in
no     0.305787
yes    0.694213
dtype: float64


In [68]:
# Chi-squared test on the observed blend_in / authority counts; 'expected' is
# the table of expected frequencies returned by the test.
test_result = chi2_contingency(blend_authority_freq)
chi2, pval, dof, expected = test_result
print(expected.round())

[[1934. 1474.]
 [4391. 3346.]]


In [69]:
# Chi-squared statistic from the test run on blend_authority_freq above.
print(chi2)

669.6989478223078


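For a 2×2 table the test has 1 degree of freedom, where the critical value at the 0.05 significance level is about 3.84. A Chi-Squared statistic of about 669.7 is far above that, which strongly suggests an association between the 'Blending In' and 'Authority' attributes.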