In [1]:
import glob
import dask.dataframe as dd
import pandas as pd
from pandarallel import pandarallel
import matplotlib.pyplot as plt
from datetime import datetime, timezone
import ast
import numpy as np
from operator import itemgetter
from stargazer.stargazer import Stargazer
import os
import datetime
from dateutil.rrule import rrule, MONTHLY, YEARLY, WEEKLY
from dateutil.relativedelta import relativedelta
from linearmodels.panel import PanelOLS
import multiprocessing
import statsmodels.formula.api as smf
import re
from itertools import product
import pytz
import smtplib 
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import time

In [2]:
# Start pandarallel's worker pool with the progress bar turned off.
# NOTE(review): no parallel_apply/parallel_map call appears in the visible cells — confirm this is still needed.
pandarallel.initialize(progress_bar = False)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
def readGlobDf(filename):
    """Read every parquet file matching ``filename``'s stem into one DataFrame.

    The last ``.parquet`` in ``filename`` is turned into ``*.parquet`` and the
    resulting pattern is globbed, so ``a/b.parquet`` loads ``a/b.parquet``,
    ``a/b_0.parquet``, ``a/b_1.parquet`` and so on.

    Parameters
    ----------
    filename : str
        Path whose stem identifies the group of parquet files to load.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all matching files, in sorted path order.

    Raises
    ------
    ValueError
        If no file matches the derived pattern.
    """
    # Wildcard only the final ".parquet": a directory that happens to contain
    # ".parquet" in its name must not be turned into a wildcard as well.
    stem, suffix, remainder = filename.rpartition(".parquet")
    pattern = f"{stem}*{suffix}{remainder}" if suffix else filename
    # glob.glob returns matches in arbitrary order; sorting keeps the row order
    # of the concatenated frame stable across runs and machines.
    paths = sorted(glob.glob(pattern))
    if not paths:
        raise ValueError(f"No parquet files match pattern {pattern!r}")
    df = pd.concat([pd.read_parquet(path, engine = 'pyarrow') for path in paths])
    return df
# Load every parquet file matching data/merged_data/imputed_ranks/actor_login_id*.parquet.
# Later cells read the actor_id and actor_login columns from this frame.
df_actor = readGlobDf('data/merged_data/imputed_ranks/actor_login_id.parquet')


In [4]:
# Frame from which the three survey cohorts are drawn. The cohort cell reads its repo_id,
# permissions, Free_Copilot, created_at, stars_copilotrelease and permissions_ranked columns.
population = pd.read_parquet('data/intermediaries/population_copilot.parquet')

In [5]:
#population = population[population['stars_copilotrelease'] & population['stars_copilotrelease']<=3]

## Emails

In [6]:
def generateEmails(permission, project, identifier, link):
    """Render the plain-text survey invitation for one recipient.

    Parameters
    ----------
    permission : access level quoted back to the recipient.
    project : project name referred to by survey question Q2.
    identifier : value shown as the Q1 identifier; converted with ``int``.
    link : URL of the survey form.

    Returns
    -------
    str
        The message body, beginning and ending with a newline.
    """
    survey_id = int(identifier)
    # Each entry is one line of the message. Trailing spaces are written
    # explicitly so the rendered text stays stable.
    body_lines = [
        "",
        "Hi!",
        "",
        "First, I apologize if you have received this message multiple times. The SMTP server I have been using was experiencing bugs. ",
        "    ",
        "My name is Chris and I am a student at UChicago writing an economics thesis about open source software. I have a few short questions to ask about your experience developing open source software, and the survey will not take more than a few minutes. If you do not recall the exact answer, I would still appreciate hearing your response, to the best of your ability. Participants will be entered in a raffle to win $10 gift cards.",
        "",
        f"1) Please enter this number ({survey_id}) as your identifier in Q1. ",
        f"2) The project referred to in Q2 is {project}. Q2 asks you about your level of access in {project} in June 2021. If it's helpful/jogs your memory, my data indicates that you had {permission} access, but this may not be correct.",
        f"The survey link is {link}. ",
        "",
        "If you would also like to share a few thoughts about your experience using Github Copilot (if applicable) in open source development, it would be great to hear them. If you would like to chat more about your experience using Github Copilot, let me know and we can set up a time.",
        "Thank you for your time!",
        "",
        "Best,",
        "",
        "Chris Liao",
        "https://www.linkedin.com/in/chris-liao-8865b219a",
        "",
    ]
    return "\n".join(body_lines)

In [7]:
# Read the star listing, expose its `id` column as `repo_id`, then total watch_count per repo.
top_python = (
    pd.read_csv('data/inputs/top_python_stars.csv')
    .rename(columns = {'id': 'repo_id'})
)
watch_totals = top_python.groupby('repo_id')['watch_count'].sum()
top_python_grouped = watch_totals.reset_index()

In [8]:
# repo_ids of the 1000 rows with the largest summed watch_count (ascending sort, then tail).
top_os_python_repos = top_python_grouped.sort_values('watch_count').tail(1000)['repo_id'].tolist()

In [9]:
# Columns carried into every survey cohort.
COHORT_COLUMNS = ['actor_id', 'Actual Name', 'emails', 'repo_id', 'permissions', 'repo_name']
# Only rows with created_at strictly before this instant are eligible.
CREATED_BEFORE = datetime.datetime(2021, 6, 23, tzinfo = pytz.UTC)
# Fixed seed: later cells depend on the shuffled row order (positional slices and the
# resume counter), so the shuffle has to be the same on every run.
COHORT_SHUFFLE_SEED = 20210623


def _build_cohort(mask):
    """Return the shuffled, de-duplicated cohort of `population` rows selected by `mask`.

    Parameters
    ----------
    mask : boolean Series aligned with `population`; cohort-specific filter.

    Returns
    -------
    DataFrame with COHORT_COLUMNS, one row per actor_id (the first row after sorting
    by stars_copilotrelease and permissions_ranked), shuffled with a fixed seed,
    with rows whose `emails` is missing removed.
    """
    cohort = (
        population[mask & (population['created_at'] < CREATED_BEFORE)]
        .sort_values(['stars_copilotrelease', 'permissions_ranked'])[COHORT_COLUMNS]
        .drop_duplicates(['actor_id'])
        .dropna(thresh = 1)
        .sample(frac = 1, random_state = COHORT_SHUFFLE_SEED)
    )
    return cohort[~cohort['emails'].isna()]


in_top_repos = population['repo_id'].isin(top_os_python_repos)
has_write_or_owner = population['permissions'].isin(['write','owner'])

# people who should have copilot
should_have_copilot = _build_cohort(
    in_top_repos & has_write_or_owner & (population['Free_Copilot'] == 1))

# people who do not have copilot - permissions
no_copilot_perms = _build_cohort(
    in_top_repos
    & ~population['actor_id'].isin(should_have_copilot['actor_id'].tolist())
    & (population['Free_Copilot'] == 0)
    & population['permissions'].isin(['triage']))

# people who do not have copilot - project
no_copilot_project = _build_cohort(
    ~in_top_repos
    & ~population['actor_id'].isin(should_have_copilot['actor_id'].tolist())
    & (population['Free_Copilot'] == 0)
    & ~population['actor_id'].isin(no_copilot_perms['actor_id'].tolist())
    & has_write_or_owner)

In [10]:
def _render_survey_email(row):
    """Build the invitation text for one cohort row."""
    return generateEmails(row['permissions'], row['repo_name'],
                          int(row['actor_id']), "https://forms.gle/DT3fycAQgz6HfGu19")


# Attach the rendered message to each cohort as an `email` column.
for cohort_frame in (should_have_copilot, no_copilot_perms, no_copilot_project):
    cohort_frame['email'] = cohort_frame.apply(_render_survey_email, axis = 1)

In [11]:
# Snapshot two of the three cohorts (including their shuffled order) to the working directory.
# NOTE(review): no_copilot_perms is not written here although the export cell uses it — confirm intended.
should_have_copilot.to_parquet('should_have_copilot.parquet')
no_copilot_project.to_parquet('no_copilot_project.parquet')

In [12]:
# Reload the cohort snapshots from the working directory.
# NOTE(review): no_copilot_perms has no snapshot, so it must still be in memory from the cohort cell.
should_have_copilot = pd.read_parquet('should_have_copilot.parquet')
no_copilot_project = pd.read_parquet('no_copilot_project.parquet')

In [13]:
# Logins to leave out of the should_have_copilot mailing; matched case-insensitively below.
exclude = [
    'methane', 'ilyan', 'Zac-HD', 'jeffkaufman', 'joshfriend',
    'heyman', 'cyberw', 'hhatto', 'liZe', 'adiroiban',
    'amoffat', 'deckar01', 'mwaskom', 'oscarbenjamin', 'bwoodsend',
    'wRAR', 'jeremiedbb', 'danyeaw', 'gaborbernat', 'stefanv',
    'mhils', 'BoboTiG', 'oscargus', 'aclark4life', 'kingosticks',
    'coleifer', 'deckar01', 'sloria', 'PatMyron',
]
# Lower-cased copy, same order and length as `exclude`.
exclude_lwr = list(map(str.lower, exclude))

In [14]:
# Lower-case logins with the vectorised string accessor: unlike a per-row
# `x.lower()` call, it leaves a missing login as NaN instead of raising.
df_actor['actor_login_lwr'] = df_actor['actor_login'].str.lower()
# actor_ids whose lower-cased login appears in the exclusion list.
exclude_ids = df_actor[df_actor['actor_login_lwr'].isin(exclude_lwr)].drop_duplicates()['actor_id'].tolist()

In [15]:
# Skip the first 60 rows of the shuffled should_have_copilot cohort. The slice is
# positional (`iloc`), so it stays a plain "from row 60 on" even if the index
# restored from parquet contains repeated labels.
df_export = should_have_copilot.iloc[60:]
# The exclusion list is applied to the should_have_copilot cohort only.
df_export = df_export[~df_export['actor_id'].isin(exclude_ids)]
df_export = pd.concat([df_export[['emails', 'email']], no_copilot_perms[['emails', 'email']],
                       no_copilot_project[['emails', 'email']]])
# `emails` holds a sequence of addresses per row; flatten it to one comma-separated string.
df_export['addresses'] = df_export['emails'].apply(lambda x: ", ".join(x))
df_export[['addresses','email']].to_csv('survey_emails.csv')

In [16]:
# Subject line; the same text is set as message["Subject"] in the disabled send loops below.
print("Inquiry about Participation as Open Source Developer")

Inquiry about Participation as Open Source Developer


In [17]:
# Disabled snippet: wrapped in a string literal, so running the cell only echoes the text.
# If re-enabled, it would advance the global counter `i` and print the addresses and
# rendered email for one should_have_copilot row, unless its actor_id is in exclude_ids.
"""ind = should_have_copilot.index[i]
i+=1
print(i)
if should_have_copilot.loc[ind, 'actor_id'] not in exclude_ids:
    print(", ".join(should_have_copilot.loc[ind, 'emails']))
    print(generateEmails(should_have_copilot.loc[ind, 'permissions'], 
                         should_have_copilot.loc[ind, 'repo_name'], 
                         should_have_copilot.loc[ind, 'actor_id'], "https://forms.gle/DT3fycAQgz6HfGu19"))"""

'ind = should_have_copilot.index[i]\ni+=1\nprint(i)\nif should_have_copilot.loc[ind, \'actor_id\'] not in exclude_ids:\n    print(", ".join(should_have_copilot.loc[ind, \'emails\']))\n    print(generateEmails(should_have_copilot.loc[ind, \'permissions\'], \n                         should_have_copilot.loc[ind, \'repo_name\'], \n                         should_have_copilot.loc[ind, \'actor_id\'], "https://forms.gle/DT3fycAQgz6HfGu19"))'

## Controls

In [18]:
# Row position at which the (disabled) should_have_copilot send loop below would start.
# NOTE(review): 105 is presumably where an earlier run stopped — confirm before re-enabling.
i=105

In [19]:
# Logs of [index label, server response] pairs appended by the disabled send loops below,
# one list per cohort.
server_response_should_have_copilot = []
project_no_copilot = []

In [20]:
# SMTP endpoint passed to smtplib.SMTP_SSL in the disabled send loops below.
# NOTE(review): despite the `gmail_` prefix, the configured host is smtp.cs.uchicago.edu.
gmail_server='smtp.cs.uchicago.edu'
gmail_port=465

In [21]:
#my_email='chrisliao@uchicago.edu'
#password_key=#

In [22]:
# Disabled send loop for the should_have_copilot cohort: wrapped in a string literal, so
# running the cell only echoes the text.
# If re-enabled, it would resume from the global counter `i`, drop addresses containing
# "noreply.github", skip rows with zero or seven-plus remaining addresses or an actor_id in
# exclude_ids, send the pre-rendered `email` text, and append [index label, server response]
# to server_response_should_have_copilot, sleeping one second after each send.
# NOTE(review): it needs my_email and password_key, which are commented out above — supply
# them from the environment or getpass rather than hardcoding.
"""with smtplib.SMTP_SSL(gmail_server, gmail_port) as server:
    server.ehlo()
    server.login(my_email, password_key)

    while i < len(should_have_copilot.index):
        ind = should_have_copilot.index[i]
        i+=1
        og_email_list = should_have_copilot.loc[ind, 'emails']
        email_list = [email for email in og_email_list if "noreply.github" not in email]
        if len(email_list)<7 and len(email_list)>0 and should_have_copilot.loc[ind, 'actor_id'] not in exclude_ids:
            message = MIMEMultipart("alternative")
            message["Subject"] = "Inquiry about Participation as Open Source Developer"
            message["From"] = "chrisliao@uchicago.edu"
            message["To"] = ", ".join(email_list)
            
            # Create the plain-text and HTML version of your message
            text = should_have_copilot.loc[ind, 'email']
            # Turn these into plain/html MIMEText objects
            part1 = MIMEText(text, "plain")
            
            # Add HTML/plain-text parts to MIMEMultipart message
            # The email client will try to render the last part first
            message.attach(part1)
            server_response = server.send_message(message)
            server_response_should_have_copilot.append([ind, server_response])
            print(server_response, email_list)
            time.sleep(1)
        else:
            print(f"no sending because we have {len(email_list)} emails")"""

'with smtplib.SMTP_SSL(gmail_server, gmail_port) as server:\n    server.ehlo()\n    server.login(my_email, password_key)\n\n    while i < len(should_have_copilot.index):\n        ind = should_have_copilot.index[i]\n        i+=1\n        og_email_list = should_have_copilot.loc[ind, \'emails\']\n        email_list = [email for email in og_email_list if "noreply.github" not in email]\n        if len(email_list)<7 and len(email_list)>0 and should_have_copilot.loc[ind, \'actor_id\'] not in exclude_ids:\n            message = MIMEMultipart("alternative")\n            message["Subject"] = "Inquiry about Participation as Open Source Developer"\n            message["From"] = "chrisliao@uchicago.edu"\n            message["To"] = ", ".join(email_list)\n            \n            # Create the plain-text and HTML version of your message\n            text = should_have_copilot.loc[ind, \'email\']\n            # Turn these into plain/html MIMEText objects\n            part1 = MIMEText(text, "plain")\n

In [23]:
# Reset the shared resume counter so the no_copilot_project send loop below starts at the first row.
i=0

In [24]:
# Display the current value of the resume counter.
i

0

In [25]:
# Disabled send loop for the no_copilot_project cohort: wrapped in a string literal, so
# running the cell only echoes the text.
# If re-enabled, it would resume from the global counter `i`, drop addresses containing
# "noreply.github", skip rows with zero or seven-plus remaining addresses, send the
# pre-rendered `email` text, and append [index label, server response] to project_no_copilot.
# Unlike the should_have_copilot loop, it does not consult exclude_ids.
# NOTE(review): it needs my_email and password_key, which are commented out above — supply
# them from the environment or getpass rather than hardcoding.
"""with smtplib.SMTP_SSL(gmail_server, gmail_port) as server:
    server.ehlo()
    server.login(my_email, password_key)
    while i < len(no_copilot_project.index):
        ind = no_copilot_project.index[i]
        i+=1
        og_email_list = no_copilot_project.loc[ind, 'emails']
        email_list = [email for email in og_email_list if "noreply.github" not in email]
        if len(email_list)<7 and len(email_list)>0:
            message = MIMEMultipart("alternative")
            message["Subject"] = "Inquiry about Participation as Open Source Developer"
            message["From"] = "chrisliao@uchicago.edu"
            message["To"] = ", ".join(email_list)
            # Create the plain-text and HTML version of your message
            text = no_copilot_project.loc[ind, 'email']
            # Turn these into plain/html MIMEText objects
            part1 = MIMEText(text, "plain")
            # Add HTML/plain-text parts to MIMEMultipart message
            # The email client will try to render the last part first
            message.attach(part1)
            server_response = server.send_message(message)
            project_no_copilot.append([ind, server_response])
            print(server_response, email_list)
            time.sleep(1)
        else:
            print(f"no sending because we have {len(email_list)} emails")"""

'with smtplib.SMTP_SSL(gmail_server, gmail_port) as server:\n    server.ehlo()\n    server.login(my_email, password_key)\n    while i < len(no_copilot_project.index):\n        ind = no_copilot_project.index[i]\n        i+=1\n        og_email_list = no_copilot_project.loc[ind, \'emails\']\n        email_list = [email for email in og_email_list if "noreply.github" not in email]\n        if len(email_list)<7 and len(email_list)>0:\n            message = MIMEMultipart("alternative")\n            message["Subject"] = "Inquiry about Participation as Open Source Developer"\n            message["From"] = "chrisliao@uchicago.edu"\n            message["To"] = ", ".join(email_list)\n            # Create the plain-text and HTML version of your message\n            text = no_copilot_project.loc[ind, \'email\']\n            # Turn these into plain/html MIMEText objects\n            part1 = MIMEText(text, "plain")\n            # Add HTML/plain-text parts to MIMEMultipart message\n            # The em

In [26]:
# Lookup from actor_login to actor_id, built from the distinct (actor_id, actor_login) pairs.
login_id_pairs = df_actor[['actor_id','actor_login']].drop_duplicates()
actor_dict = login_id_pairs.set_index('actor_login')['actor_id'].to_dict()

In [27]:
survey_response = pd.read_csv("Open Source Copilot Survey (Responses) - Form Responses 1.csv")
# Positional rename: the export must have exactly these five columns, in this order.
survey_response.columns = ['timestamp','actor_id','permission','copilot_access','copilot_usage']
# Keep only the text before the first "/" of a typed identifier. Blank cells (NaN) and
# values read_csv already parsed as numbers are passed through instead of raising.
survey_response['actor_id'] = survey_response['actor_id'].map(
    lambda x: x.split("/")[0] if isinstance(x, str) else x)
# An identifier that matches a known login is translated to its actor_id.
survey_response['actor_id'] = survey_response['actor_id'].map(lambda x: actor_dict.get(x, x))
# Anything still non-numeric becomes NaN.
survey_response['actor_id'] = pd.to_numeric(survey_response['actor_id'], errors = 'coerce')

In [28]:
# copilot_access answers kept in the cohort result tables below.
include_cats = [
    'Received as student/teacher.',
    'Access through work',
    'I have access to Github Copilot because I work for Microsoft',
    'University Staff',
    'yes,  but not from the project refererred to in the email',
    'Received earlier access from a friend on the team at the time',
    'Yes',
    'No',
]

In [29]:
# Join the should_have_copilot cohort to the survey answers on their shared column(s),
# keep answers listed in include_cats, then tabulate (access, usage) counts and shares.
have_copilot_results = should_have_copilot.merge(survey_response)
have_copilot_answer_kept = have_copilot_results['copilot_access'].isin(include_cats)
have_copilot_results_yn = have_copilot_results[have_copilot_answer_kept]
have_copilot_pairs = have_copilot_results_yn[['copilot_access', 'copilot_usage']]
pd.concat(
    [
        have_copilot_pairs.value_counts().sort_index(),
        have_copilot_pairs.value_counts(normalize = True).sort_index(),
    ],
    axis = 1,
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,proportion
copilot_access,copilot_usage,Unnamed: 2_level_1,Unnamed: 3_level_1
No,No,15,0.28
No,Yes,4,0.08
Received as student/teacher.,No,1,0.02
Received as student/teacher.,Yes,1,0.02
Yes,No,21,0.4
Yes,Yes,11,0.21


In [30]:
# Join the no_copilot_project cohort to the survey answers on their shared column(s),
# keep answers listed in include_cats, then tabulate (access, usage) counts and shares.
no_copilot_results = no_copilot_project.merge(survey_response)
no_copilot_answer_kept = no_copilot_results['copilot_access'].isin(include_cats)
no_copilot_results_yn = no_copilot_results[no_copilot_answer_kept]
no_copilot_pairs = no_copilot_results_yn[['copilot_access', 'copilot_usage']]
pd.concat(
    [
        no_copilot_pairs.value_counts().sort_index(),
        no_copilot_pairs.value_counts(normalize = True).sort_index(),
    ],
    axis = 1,
).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,proportion
copilot_access,copilot_usage,Unnamed: 2_level_1,Unnamed: 3_level_1
Access through work,Yes,1,0.01
I have access to Github Copilot because I work for Microsoft,Yes,1,0.01
No,No,47,0.44
No,Yes,23,0.22
Received as student/teacher.,Yes,5,0.05
University Staff,Yes,1,0.01
Yes,No,16,0.15
Yes,Yes,11,0.1
"yes, but not from the project refererred to in the email",No,1,0.01


In [31]:
# Share of each (copilot_access, copilot_usage) pair among respondents answering
# No / Yes / Unsure to the access question. Kept as a summary table only; the
# rates printed below are computed from raw counts, not from these rounded shares.
t = survey_response[survey_response['copilot_access'].isin(['No','Yes','Unsure'])][
    ['copilot_access','copilot_usage']].value_counts(normalize = True).sort_index().round(3)


def _takeup_rate(access):
    """Fraction answering usage 'Yes' among respondents with this access answer.

    Only rows whose copilot_usage is exactly 'Yes' or 'No' are counted. Returns
    NaN when there are none, rather than raising on a missing combination.
    """
    usage = survey_response.loc[survey_response['copilot_access'] == access, 'copilot_usage']
    used = (usage == 'Yes').sum()
    unused = (usage == 'No').sum()
    answered = used + unused
    return used / answered if answered else float('nan')


print("True Treatment takeup rate: {:.2f}".format(_takeup_rate('Yes')))
print("True Control takeup rate: {:.2f}".format(_takeup_rate('No')))

True Treatment takeup rate: 0.39
True Control takeup rate: 0.32


In [32]:
def _share_yes(frame, column):
    """Mean of the boolean `frame[column] == 'Yes'`, i.e. the fraction of 'Yes' rows."""
    return frame[column].eq('Yes').mean()


# First value in each line comes from the should_have_copilot results, second from no_copilot_project.
treated_line = "Treatment Group Treated: {:.2f}, Control Group Treated: {:.2f}".format(
    _share_yes(have_copilot_results_yn, 'copilot_access'),
    _share_yes(no_copilot_results_yn, 'copilot_access'))
print(treated_line)
takeup_line = "Treatment Group Takeup: {:.2f}, Control Group Takeup: {:.2f}".format(
    _share_yes(have_copilot_results_yn, 'copilot_usage'),
    _share_yes(no_copilot_results_yn, 'copilot_usage'))
print(takeup_line)

Treatment Group Treated: 0.60, Control Group Treated: 0.25
Treatment Group Takeup: 0.30, Control Group Takeup: 0.40
