# Imports

In [57]:
import numpy as np
import pandas as pd
import scipy
from scipy.stats import chi2
import math

# Data Cleaning

In [58]:
part1 = pd.read_csv('final_part1 - Form Responses 1.csv')
part2 = pd.read_csv('final_part1 - Form Responses 2.csv')

In [59]:
didnt_know_cat_mapping = {'Registration data (name, email, number ...)': 'reg',
                         'Profile info (education, work, skills, photo, city ...)': 'profile',
                         'Posts and Uploads (anything you type in job quick apply content, salary survey, resume upload ...)':'uploads',
                         'Contents and News (articles, posts, comments mentioning you)':'contents',
                         'Contact and Calendar Info (LinkedIn messages, invites ...)':'contacts',
                         'Info Provided by Partner Companies (Microsoft and other LinkedIn branches)': 'partener_info',
                         'Service Use (clicking on ads, searches you performed, etc)':'service',
                         'Cookie Info (tracking device ID)':'cookie'}
data_awareness_categories = list(didnt_know_cat_mapping.keys())

In [60]:
part1.drop(columns={'Timestamp'}, inplace=True)
part1 = part1.rename(columns={'What is your age in years?':'age',
                     'Are you currently an undergraduate student?':'undergrad',
                     "Are you satisfied by LinkedIn's default privacy settings?":'current_concerned',
                     "How often do you access LinkedIn?":'access_frequency',
                    'Why do you go on LinkedIn?':'access_reason',
                     'Please come up with a unique nickname (how good are your jokes?), and write it down on a post-it! You will need this nickname for part 2 of the survey.':'nickname',
                     'Select if you did not know Linkedin collects:':'didnt_know_cat',
                     'If you left some of the checkboxes above blank, were you more concerned about your privacy after reading the full list of information LinkedIn collects from you?\n2 - Indifferent or you did not leave any boxes blank.':'after_concerned'})

# 1 if undergrad
# 0 otherwise
part1['undergrad'] = part1['undergrad'].apply(lambda x: 1 if x== 'Yes' else 0)

# 1 if access at least once every week
# 0 otherwise
part1['access_frequency'] = part1['access_frequency'].apply(lambda x: 1 if (x=='Multiple times each day' 
                                                                            or x=='Every day' 
                                                                            or x=='Every 2-3 days' 
                                                                            or x=='Every week') else 0)

part1['access_reason'] = part1['access_reason'].apply(lambda x: 'Other' 
                                                      if x 
                                                      not in ['To apply for jobs', 'To make new connections'] 
                                                      else x)

after_concerned_mapping = {1: 'more_concerned', 2: 'indifferent', 3: 'less_concerned'}
part1['after_concerned'] = part1['after_concerned'].map(after_concerned_mapping)

part1 = pd.get_dummies(part1,columns=['current_concerned','after_concerned','access_reason'])

null_list = part1['didnt_know_cat'].isnull()
part1['didnt_know_cat'][null_list] = 0
part1['didnt_know_cat'][~null_list] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  part1['didnt_know_cat'][null_list] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  part1['didnt_know_cat'][~null_list] = 1


In [61]:
part2.drop(columns={'Timestamp'}, inplace=True)
part2 = part2.rename(columns={'What was your nickname from part 1?':'nickname',
                              'Were you more concerned about your privacy after looking at the data LinkedIn collects from you?':'after_view_concerned'})
part2['after_view_concerned'] = part2['after_view_concerned'].map(after_concerned_mapping)

part2 = pd.get_dummies(part2,columns=['after_view_concerned'])

In [62]:
part1['nickname'] = part1['nickname'].str.replace(' ', '').astype(str)
part1['nickname'] = part1['nickname'].apply(lambda x: x.lower())
part2['nickname'] = part2['nickname'].str.replace(' ', '').astype(str)
part2['nickname'] = part2['nickname'].apply(lambda x: x.lower())

In [63]:
n_part1 = len(part1)
n_part2 = len(part2)
df = part1.merge(part2, on='nickname', how='left')
df

Unnamed: 0,age,undergrad,access_frequency,nickname,didnt_know_cat,current_concerned_I don't know enough about LinkedIn's privacy settings to say something,current_concerned_Not satisfied,current_concerned_Satisfied,current_concerned_Somewhat satisfied,after_concerned_indifferent,...,after_concerned_more_concerned,access_reason_Other,access_reason_To apply for jobs,access_reason_To make new connections,"If you were more concerned, which one of the files concerned you?","If you were less concerned, why?","If you were indifferent after seeing your data, why?",after_view_concerned_indifferent,after_view_concerned_less_concerned,after_view_concerned_more_concerned
0,20,1,0,mcscoober,1,1,0,0,0,1,...,0,0,1,0,,,,,,
1,20,1,1,itsame,1,1,0,0,0,0,...,1,0,0,1,messages,,,0.0,0.0,1.0
2,20,1,0,anglejangle,1,0,1,0,0,0,...,1,0,1,0,"Inferences_about_you, Job Applications",,,0.0,0.0,1.0
3,20,1,0,mc^2,1,1,0,0,0,0,...,1,1,0,0,,,,,,
4,23,1,1,jl,1,0,0,1,0,1,...,0,0,0,1,"Inferences_about_you, messages",,,1.0,0.0,0.0
5,21,1,1,zuzu,1,1,0,0,0,1,...,0,0,0,1,,,,,,
6,52,0,1,sunny,1,1,0,0,0,1,...,0,0,0,1,Inferences_about_you,Linked in job seeking site. Information disclo...,"To get a better job, information sharing is in...",1.0,0.0,0.0
7,20,1,0,helloworld!,1,0,0,0,1,1,...,0,0,0,1,,,,,,
8,21,1,1,blueberry,1,1,0,0,0,0,...,1,0,1,0,Inferences_about_you,,,0.0,0.0,1.0
9,20,1,1,kyber,1,0,0,0,1,0,...,1,0,0,1,,,,,,


In [64]:
df.columns

Index(['age', 'undergrad', 'access_frequency', 'nickname', 'didnt_know_cat',
       'current_concerned_I don't know enough about LinkedIn's privacy settings to say something',
       'current_concerned_Not satisfied', 'current_concerned_Satisfied',
       'current_concerned_Somewhat satisfied', 'after_concerned_indifferent',
       'after_concerned_less_concerned', 'after_concerned_more_concerned',
       'access_reason_Other', 'access_reason_To apply for jobs',
       'access_reason_To make new connections',
       'If you were more concerned, which one of the files concerned you?',
       'If you were less concerned, why?',
       'If you were indifferent after seeing your data, why?',
       'after_view_concerned_indifferent',
       'after_view_concerned_less_concerned',
       'after_view_concerned_more_concerned'],
      dtype='object')

# Resources
https://stats.libretexts.org/Bookshelves/Introductory_Statistics/Book%3A_OpenIntro_Statistics_(Diez_et_al)./06%3A_Inference_for_Categorical_Data
https://chaminade.edu/wp-content/uploads/2019/05/SAMPLE-IMPLIED-CONSENT-FOR-ONLINE-SURVEYS.pdf

# 1. Are users comfortable with the service's privacy defaults? i.e., do Company X users complain about the privacy terms?

In [65]:
for i in ['didnt_know_cat',
       "current_concerned_I don't know enough about LinkedIn's privacy settings to say something",
       'current_concerned_Not satisfied', 
          'current_concerned_Satisfied',
       'current_concerned_Somewhat satisfied']:
    print(df[i].sum())

30
17
3
5
10


In [70]:
print('Fraction not satisfied')
frac_notsat = round(np.sum(df['current_concerned_Not satisfied'])/n_part1, 3)
print(frac_notsat)
print('')
print('Fraction somewhat satisfied')
frac_somesat = round(np.sum(df['current_concerned_Somewhat satisfied'])/n_part1, 3)
print(frac_somesat)
print('')
print('Fraction satisfied')
frac_sat = round(np.sum(df['current_concerned_Satisfied'])/n_part1, 3)
print(frac_sat)
print('')
print('Fraction that is unsure')
frac_unsuresat = round(np.sum(df["current_concerned_I don't know enough about LinkedIn's privacy settings to say something"])/n_part1,3)
print(frac_unsuresat)

Fraction not satisfied
0.086

Fraction somewhat satisfied
0.286

Fraction satisfied
0.143

Fraction that is unsure
0.486


In [71]:
# Percentage calculation within 1-0.514 of the full sample
print(round(frac_notsat/(1-frac_unsuresat),3)*100)
print(round(frac_somesat/(1-frac_unsuresat),3)*100)
print(round(frac_sat/(1-frac_unsuresat),3)*100)

16.7
55.60000000000001
27.800000000000004


In [73]:
# Observed counts
observed_counts = {
    'current_concerned_Not satisfied': 3,
    'current_concerned_Satisfied': 5,
    'current_concerned_Somewhat satisfied': 10
}

# Expected counts
expected_counts = {
    'current_concerned_Not satisfied': 6,
    'current_concerned_Satisfied': 6,
    'current_concerned_Somewhat satisfied': 6
}

# Calculate the chi-square statistic
chi_square = sum([(observed_counts[key] - expected_counts[key])**2 / expected_counts[key] for key in observed_counts])

print("Chi-square value:", chi_square)

# Set the significance level (alpha) and degrees of freedom (df)
alpha = 0.05
degf = 2

# Calculate the critical value using the inverse CDF (percent point function)
critical_value = chi2.ppf(1 - alpha, degf)

print("Critical value for alpha =", alpha, "and degf =", degf, ":", critical_value)

Chi-square value: 4.333333333333333
Critical value for alpha = 0.05 and degf = 2 : 5.991464547107979


* Expected value at least 5 for each of the 3 categories
* Simple random sample within my friends and family group
* Each survey result is independent of each other so we can apply Chi-Square Goodness of Fit test. 

*Chi-squared goodness-of-fit test(non-parametric test): This test can be used to test whether the observed frequency distribution in one group follows a hypothesized distribution.

* H0: There is no bias in what one thinks of LinkedIn privacy defaults, and the survey results are chosen at random with a equal probability. Any deviations are fluctuations from sampling.
* HA: There is bias in survey choices.

The test statistic 𝑋2 should follow a chi-square distribution with k - 1 = 2 degrees of freedom if H0 is true.
 
Since the chi-square value (4.33) is less than the critical value (5.99) at a significance level of alpha = 0.05 and df = 2, we can to reject the null hypothesis. This suggests that there is significant difference between the observed and expected counts, and the data is not consistent with the assumption of no preference among the categories.

### Answer to part 1: Likely satisfied or somewhat satisfied
48.6% of the surveyed users were unsure about their current satisfaction regarding LinkedIn's privacy defaults. 8.6% were not satisfied, 28.6% were somewhat satisfied, and 14.3% were satisfied.

A Chi-Square goodness-of-fit test performed on the three categories (excluding the "unsure" category) suggests that the observed data is not consistent with the assumption of no preference among the categories. While the Chi-Square Goodness-of-Fit test does not indicate the direction in which the distribution is skewed, a qualitative observation leads me to believe that most users were somewhat satisfied with the privacy defaults, some users were satisfied, and finally few users were unsatisfied.

# 2. Are users aware of the data Company X collects about them?

In [66]:
for i in ['didnt_know_cat',
       "current_concerned_I don't know enough about LinkedIn's privacy settings to say something"]:
    print(df[i].sum())

30
17


In [83]:
# Independent and success-failure
se = np.sqrt((1/2*1/2)/n_part1)
point_estimate = 30/n_part1
null_value = 1/2
z_score = (point_estimate - null_value)/se
z_score
#The P-Value is .000012.
#The result is significant at p < .05.

4.225771273642582

In [84]:
# Independent and success-failure
se = np.sqrt((1/2*1/2)/n_part1)
point_estimate = 17/n_part1
null_value = 1/2
z_score = (point_estimate - null_value)/se
z_score
# The P-Value is .432898.
# The result is not significant at p < .05.

-0.1690308509457034

In [74]:
np.sum(df['didnt_know_cat'])/n_part1

0.8571428571428571

In [75]:
data_awareness_categories

['Registration data (name, email, number ...)',
 'Profile info (education, work, skills, photo, city ...)',
 'Posts and Uploads (anything you type in job quick apply content, salary survey, resume upload ...)',
 'Contents and News (articles, posts, comments mentioning you)',
 'Contact and Calendar Info (LinkedIn messages, invites ...)',
 'Info Provided by Partner Companies (Microsoft and other LinkedIn branches)',
 'Service Use (clicking on ads, searches you performed, etc)',
 'Cookie Info (tracking device ID)']

* the sample observations are independent and
* expected to see at least 10 successes and 10 failures in sample
* independence condition: simple random sample (each person in my friends/family group has equal chance of being chosen) and consist of less than 10% of the population (more than 350 people in my friends/family group)

So the sampling distribution of 𝑝̂ is nearly normal with mean p and standard error np.sqrt(𝑝(1−𝑝)𝑛)

### Answer to part 2: Users unaware about at least 1 of 8 data collection categories, but same cannot be said for general scope of LinkedIn privacy defaults
Of the 35 surveyed users, 30 did not know at least 1 of the 8 data collection categories. The success-failure condition, random condition, and independence condition are met, so I can make an inference for a normal distribution. Because the corresponding p-value(.000012) is smaller than 0.05, we can reject the null hypothesis, and we find convincing evidence for a lack of awareness among the surveyed users regarding at least one of the eight data collection categories.

Of the 35, 17 replied they don't know enough about LinkedIn's privacy policy to offer an opinion about LinkedIn privacy defaults. The corresponding p-value(.432898) is larger than 0.05, we fail to reject the null hypothesis, and we don't find convincing evidence for lack of awareness among the surveyed users regarding the general scope of LinkedIn privacy defaults.

# 3. Suppose the users did not know the data Company X collects about them. Would their privacy concerns be more serious after explaining to them that Company X does, indeed, collect data about them?

In [80]:
n_af = df['didnt_know_cat'].sum()
n_af

30

In [68]:
df[df['didnt_know_cat']==1][['after_concerned_less_concerned','after_concerned_indifferent','after_concerned_more_concerned']].sum()

after_concerned_less_concerned     5
after_concerned_indifferent       12
after_concerned_more_concerned    13
dtype: int64

In [79]:
# If data come from a simple random sample and consist of less than 10% of the population, 
# then the independence assumption is reasonable.
# For each hypothesis, success-failure condition is met because sample size = 30 and 
# proportion claimed in the null hypothesis (1/3) gives expected success 10 and failure 20
# With the conditions met, we are assured that the sampling distribution of 𝑝̂ is nearly normal.

In [81]:
se = np.sqrt((1/3*2/3)/n_af)
point_estimate = 13/30
null_value = 10/30
z_score = (point_estimate - null_value)/se

In [82]:
z_score

1.1618950038622256

p-value corresponding to significance level 0.05, one-tailed, and z_score 1.16 is 0.123024. We fail to reject the null hypothesis. Therefore, based on this analysis, we do not have sufficient evidence to conclude that the proportion has changed from the claimed proportion of 1/3.

* the sample observations are independent and
* expected to see at least 10 successes and 10 failures in sample
* independence condition: simple random sample (each person in my friends/family group has equal chance of being chosen) and consist of less than 10% of the population (more than 350 people in my friends/family group)

So the sampling distribution of 𝑝̂ is nearly normal with mean p and standard error np.sqrt(𝑝(1−𝑝)𝑛)

### Answer to part 3: no concrete evidence for increased concern
Of the 30 that did not know at least 1 of the 8 data collection categories, 13 replied that they were more concerned after reading about LinkedIn's data collection.

p-value corresponding to significance level 0.05, one-tailed, and z_score 1.16 is 0.123024. We fail to reject the null hypothesis. Therefore, we don't find convincing evidence for increased concern after reading about LinkedIn's data collection.

# 4. Will users' privacy concerns grow if they see the data Company X collects about each of them?

In [69]:
df[['after_view_concerned_less_concerned','after_view_concerned_indifferent','after_view_concerned_more_concerned']].sum()

after_view_concerned_less_concerned     2.0
after_view_concerned_indifferent       11.0
after_view_concerned_more_concerned     6.0
dtype: float64

In [85]:
# Observed counts
observed_counts = {
    'after_view_concerned_less_concerned': 2,
    'after_view_concerned_indifferent': 11,
    'after_view_concerned_more_concerned': 6
}

# Expected counts
expected_counts = {
    'after_view_concerned_less_concerned': 19/3,
    'after_view_concerned_indifferent': 19/3,
    'after_view_concerned_more_concerned': 19/3
}

# Calculate the chi-square statistic
chi_square = sum([(observed_counts[key] - expected_counts[key])**2 / expected_counts[key] for key in observed_counts])

print("Chi-square value:", chi_square)

# Set the significance level (alpha) and degrees of freedom (df)
alpha = 0.05
degf = 2

# Calculate the critical value using the inverse CDF (percent point function)
critical_value = chi2.ppf(1 - alpha, degf)

print("Critical value for alpha =", alpha, "and degf =", degf, ":", critical_value)

Chi-square value: 6.421052631578948
Critical value for alpha = 0.05 and degf = 2 : 5.991464547107979


* Expected value at least 5 for each of the 3 categories
* Simple random sample within my friends and family group
* Each survey result is independent of each other so we can apply Chi-Square Goodness of Fit test. 

* H0: There is no bias in how concerned one is after looking at their data, and the survey results are chosen at random with a equal probability. Any deviations are fluctuations from sampling.
* HA: There is bias in survey choices.

The test statistic 𝑋2 should follow a chi-square distribution with k - 1 = 2 degrees of freedom if H0 is true.

Since the chi-square value (6.42) is greater than the critical value (5.99) at a significance level of alpha = 0.05 and df = 2, we fail to reject the null hypothesis. This suggests that there is no significant difference between the observed data and expected counts assuming equal probability for each category.

### Answer to part 4: No concrete evidence for growing concern
Of 19 survey respondents that saw their LinkedIn data, 6 were more concerned, 11 were indifferent, and 2 were less concerned.

A Chi-Square goodness-of-fit test performed on the three categories suggests that the observed data is consistent with the assumption of no preference among the categories.