In [None]:
import requests
import csv
import time

# GitHub API token.
# SECURITY: the token is read from the GITHUB_TOKEN environment variable so the
# secret never lives in the notebook. The token that used to be hardcoded here
# has been exposed and must be revoked.
import os

GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
# With no token configured, no Authorization header is sent at all (instead of
# sending a malformed "token " header).
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}

# Helper function to clean up company names
def clean_company_name(company):
    """Normalize a company string: trim whitespace, drop leading '@', upper-case.

    Falsy values (``None`` or an empty string) are returned unchanged.
    """
    if not company:
        return company
    trimmed = company.strip()
    without_at = trimmed.lstrip('@')
    return without_at.upper()

# Function to fetch users from the GitHub API
def fetch_users(city="Zurich", min_followers=50):
    """Search GitHub for users located in ``city`` with more than ``min_followers`` followers.

    Pages through the user-search endpoint (100 hits per page) and, for every
    hit, requests the full profile from the hit's ``url``.

    Args:
        city: Value for the ``location:`` search qualifier. Quoted automatically
            when it contains whitespace.
        min_followers: Lower bound (exclusive) for the ``followers:>`` qualifier.

    Returns:
        A list of dicts with the keys ``login``, ``name``, ``company`` (passed
        through ``clean_company_name``), ``location``, ``email``, ``hireable``,
        ``bio``, ``public_repos``, ``followers``, ``following`` and ``created_at``.

    Raises:
        Whatever ``response.raise_for_status()`` raises for a failed request.
        Previously an error payload (e.g. a rate-limit message) had no 'items'
        key and was silently treated as "no more results".
    """
    users = []
    page = 1

    # Quote multi-word locations so the whole phrase stays inside the qualifier.
    location = f'"{city}"' if any(ch.isspace() for ch in city) else city
    query = f"location:{location} followers:>{min_followers}"

    while True:
        # Passing the query via params lets requests handle the URL encoding.
        response = requests.get(
            "https://api.github.com/search/users",
            headers=HEADERS,
            params={"q": query, "page": page, "per_page": 100},
            timeout=30,
        )

        # The search API only serves a capped number of results; paging past the
        # cap is answered with 422. On page 1 a 422 is a genuine query error,
        # so it is only treated as "end of results" on later pages.
        if response.status_code == 422 and page > 1:
            break
        response.raise_for_status()

        # Break if no more results
        items = response.json().get('items')
        if not items:
            break

        for user in items:
            # Get full user info (the search hit only carries a summary)
            user_response = requests.get(user['url'], headers=HEADERS, timeout=30)
            user_response.raise_for_status()
            user_data = user_response.json()

            # Extract required fields
            users.append({
                'login': user_data['login'],
                'name': user_data['name'],
                'company': clean_company_name(user_data['company']),
                'location': user_data['location'],
                'email': user_data['email'],
                'hireable': user_data['hireable'],
                'bio': user_data['bio'],
                'public_repos': user_data['public_repos'],
                'followers': user_data['followers'],
                'following': user_data['following'],
                'created_at': user_data['created_at'],
            })
        page += 1
        time.sleep(1)  # Avoid hitting API rate limits

    return users

# Function to fetch repositories for a user
def fetch_repositories(user_login):
    """Fetch every repository listed for ``user_login``, 100 per page.

    Args:
        user_login: GitHub login whose ``/users/{login}/repos`` listing is read.

    Returns:
        A list of dicts with the keys ``login``, ``full_name``, ``created_at``,
        ``stargazers_count``, ``watchers_count``, ``language``, ``has_projects``,
        ``has_wiki`` and ``license_name`` (the license ``key``, or ``None`` when
        the repository has no license).

    Raises:
        Whatever ``response.raise_for_status()`` raises for a failed request.
        Previously an error payload (a dict) was iterated key by key and failed
        with an unrelated TypeError.
    """
    repositories = []
    page = 1

    while True:
        response = requests.get(
            f"https://api.github.com/users/{user_login}/repos",
            headers=HEADERS,
            params={"per_page": 100, "page": page},
            timeout=30,
        )
        response.raise_for_status()
        repo_data = response.json()

        # Break if no more repositories
        if not repo_data:
            break

        for repo in repo_data:
            license_info = repo.get('license')
            repositories.append({
                'login': user_login,
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'],
                'has_projects': repo['has_projects'],
                'has_wiki': repo['has_wiki'],
                'license_name': license_info['key'] if license_info else None,
            })

        # If fewer than 100 repositories are returned, it means we're on the last page
        if len(repo_data) < 100:
            break

        page += 1  # Move to the next page
        time.sleep(1)  # Avoid hitting API rate limits

    return repositories

# Save users to CSV
def save_users_to_csv(users, filename="users.csv"):
    """Write ``users`` (a list of dicts) to ``filename`` as UTF-8 CSV.

    The header is taken from the keys of the first record. An empty list
    produces an empty file instead of raising IndexError on ``users[0]``.
    """
    fieldnames = list(users[0].keys()) if users else []
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        if not fieldnames:
            return  # nothing to write; the file is left empty
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(users)

# Save repositories to CSV
def save_repositories_to_csv(repositories, filename="repositories.csv"):
    """Write ``repositories`` (a list of dicts) to ``filename`` as UTF-8 CSV.

    The header is taken from the keys of the first record. An empty list
    produces an empty file instead of raising IndexError on ``repositories[0]``.
    """
    fieldnames = list(repositories[0].keys()) if repositories else []
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        if not fieldnames:
            return  # nothing to write; the file is left empty
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(repositories)

def main():
    """Fetch the users, save them, then fetch and save every user's repositories."""
    print("Fetching users...")
    user_records = fetch_users()
    save_users_to_csv(user_records)
    print(f"Saved {len(user_records)} users to users.csv")

    print("Fetching repositories...")
    repo_records = []
    for record in user_records:
        login = record["login"]
        fetched = fetch_repositories(login)
        repo_records += fetched
        print(f"Fetched {len(fetched)} repositories for user {login}")

    save_repositories_to_csv(repo_records)
    print(f"Saved {len(repo_records)} repositories to repositories.csv")

# Entry point when this code is executed directly.
if __name__ == "__main__":
    main()

Fetching users...
Saved 475 users to users.csv
Fetching repositories...
Fetched 61 repositories for user IDouble
Fetched 39 repositories for user TheOfficialFloW
Fetched 259 repositories for user Seldaek
Fetched 58 repositories for user riscv
Fetched 238 repositories for user JonnyBurger
Fetched 127 repositories for user bpasero
Fetched 84 repositories for user egamma
Fetched 448 repositories for user ethz-asl
Fetched 189 repositories for user sahildua2305
Fetched 160 repositories for user joaomoreno
Fetched 28 repositories for user klaudiosinani
Fetched 28 repositories for user sbrannen
Fetched 29 repositories for user Juriy
Fetched 38 repositories for user sarlinpe
Fetched 24 repositories for user sustrik
Fetched 28 repositories for user LorenzMeier
Fetched 64 repositories for user jwagner
Fetched 151 repositories for user jaspervdj
Fetched 177 repositories for user lsmith77
Fetched 114 repositories for user videlalvaro
Fetched 44 repositories for user cvg
Fetched 224 repositories fo

**Q1. Who are the top 5 users in Zurich with the highest number of followers? List their login in order, comma-separated.**

In [None]:
import csv

# Gather (login, followers) for every user whose location mentions "zurich"
with open('users.csv', 'r', encoding='utf-8') as file:
    zurich_followers = [
        (row['login'], int(row['followers']))
        for row in csv.DictReader(file)
        if 'zurich' in row['location'].strip().lower()
    ]

# Rank by follower count, largest first (the sort is stable, so ties keep file order)
ranked = sorted(zurich_followers, key=lambda pair: pair[1], reverse=True)

# Print the five leading logins as a comma-separated list
print(','.join(login for login, _ in ranked[:5]))

IDouble,TheOfficialFloW,Seldaek,riscv,JonnyBurger


**Q2. Who are the 5 earliest registered GitHub users in Zurich? List their login in ascending order of created_at, comma-separated.**

In [None]:
import csv
from datetime import datetime

# (registration time, login) for every user whose location mentions "zurich"
signups = []

with open('users.csv', 'r', encoding='utf-8') as file:
    for record in csv.DictReader(file):
        if 'zurich' not in record['location'].strip().lower():
            continue
        joined = datetime.strptime(record['created_at'], '%Y-%m-%dT%H:%M:%SZ')
        signups.append((joined, record['login']))

# Oldest accounts first; sort on the timestamp only so ties keep file order
signups.sort(key=lambda entry: entry[0])

# Print the five earliest logins as a comma-separated list
print(','.join(login for _, login in signups[:5]))


lejoe,uwolfer,matthiask,oscardelben,panterch


**Q3. What are the 3 most popular licenses among these users? Ignore missing licenses. List the license_name in order, comma-separated.**

In [None]:
import csv
from collections import Counter

# Tally every non-empty license name found in the repository dump
license_counts = Counter()

with open('repositories.csv', 'r', encoding='utf-8') as file:
    for row in csv.DictReader(file):
        name = row['license_name']
        if name:
            license_counts[name] += 1

# The three most frequent licenses, most common first
top_three = [name for name, _ in license_counts.most_common(3)]

# Print the result as a comma-separated list
print(','.join(top_three))

mit,other,apache-2.0


**Q4. Which company do the majority of these developers work at? (cleaned up as explained above)**

In [None]:
import csv

# company name -> number of users who list it (blank companies are skipped)
company_counts = {}

with open('users.csv', 'r', encoding='utf-8') as file:
    for row in csv.DictReader(file):
        employer = row['company']
        if not employer:
            continue
        company_counts[employer] = company_counts.get(employer, 0) + 1

# The company listed by the most users (the first one seen wins a tie)
top_company = max(company_counts, key=company_counts.get)

# Print the result
print(top_company)


GOOGLE


**Q5. Which programming language is most popular among these users?**

In [None]:
import csv
from collections import Counter

# Count how many repositories use each language, skipping repos with no language
with open('repositories.csv', 'r', encoding='utf-8') as file:
    language_counts = Counter(
        row['language'] for row in csv.DictReader(file) if row['language']
    )

# The single most frequent language
top_entry = language_counts.most_common(1)[0]

# Print the result
print(top_entry[0])


Python


**Q6. Which programming language is the second most popular among users who joined after 2020?**

In [None]:
import csv
from collections import Counter
from datetime import datetime

# The question is about when the *user* joined, so the date filter has to be
# applied to users.csv. The created_at column in repositories.csv is the
# repository's creation time, which is what the previous version filtered on.
recent_logins = set()
with open('users.csv', 'r', encoding='utf-8') as file:
    for user in csv.DictReader(file):
        joined = user['created_at']
        # "after 2020" is read as a join year of 2021 or later; rows without a
        # timestamp are skipped rather than reusing a stale value.
        if joined and datetime.strptime(joined, '%Y-%m-%dT%H:%M:%SZ').year > 2020:
            recent_logins.add(user['login'])

# Count the languages of repositories owned by those users only
language_counts = Counter()
with open('repositories.csv', 'r', encoding='utf-8') as file:
    for repo in csv.DictReader(file):
        programming_language = repo['language']
        if programming_language and repo['login'] in recent_logins:
            language_counts[programming_language] += 1

# Find the programming language with the second highest frequency
ranked_languages = language_counts.most_common(2)

# Print the result (guarding against fewer than two languages being present)
if len(ranked_languages) < 2:
    print("Fewer than two languages found for users who joined after 2020")
else:
    second_most_popular_language = ranked_languages[1][0]
    print(second_most_popular_language)

TypeScript


**Q7. Which language has the highest average number of stars per repository?**

In [None]:
import csv
from collections import defaultdict

# language -> [total stars, number of repositories]
star_totals = defaultdict(lambda: [0, 0])

with open('repositories.csv', 'r', encoding='utf-8') as file:
    for row in csv.DictReader(file):
        lang = row['language']
        stars = int(row['stargazers_count'])
        if not lang:
            continue
        tally = star_totals[lang]
        tally[0] += stars
        tally[1] += 1

# Average stars per repository for each language
mean_stars = {lang: total / count for lang, (total, count) in star_totals.items()}

# Language with the highest average
best_language = max(mean_stars, key=mean_stars.get)

# Print the result
print(best_language)

BitBake


**Q8. Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.**

In [None]:
import csv

# leader_strength = followers / (1 + following), computed for every user
with open('users.csv', 'r', encoding='utf-8') as file:
    scored = [
        (row.get('login'), int(row['followers']) / (1 + int(row['following'])))
        for row in csv.DictReader(file)
    ]

# Strongest leaders first (stable sort, so ties keep file order)
ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

# Print the top five logins as a comma-separated list
print(','.join(login for login, _ in ranked[:5]))



riscv,bpasero,Seldaek,egamma,ethz-asl


**Q9. What is the correlation between the number of followers and the number of public repositories among users in Zurich?**

In [None]:
import csv
import numpy as np

# (followers, public_repos) for every user whose location mentions "zurich"
pairs = []
with open('users.csv', 'r', encoding='utf-8') as file:
    for row in csv.DictReader(file):
        if 'zurich' not in row['location'].strip().lower():
            continue
        pairs.append((int(row['followers']), int(row['public_repos'])))

followers = [follower_count for follower_count, _ in pairs]
public_repos = [repo_count for _, repo_count in pairs]

# Pearson correlation: the off-diagonal entry of the 2x2 correlation matrix
correlation_matrix = np.corrcoef(followers, public_repos)
correlation = correlation_matrix[0][1]

# Print the result
print(f"{correlation:.3f}")




0.066


**Q10. Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository**

In [None]:
import csv

# numpy is imported here explicitly: this cell used `np` but previously relied
# on an earlier cell having imported it, so it failed when run on its own.
import numpy as np
from sklearn.linear_model import LinearRegression

# followers and public_repos hold the follower and public-repository counts per user
followers = []
public_repos = []

# Read the CSV file with UTF-8 encoding
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for item in reader:
        followers.append(int(item['followers']))
        public_repos.append(int(item['public_repos']))

# Perform linear regression: followers ~ public_repos
X = np.array(public_repos).reshape(-1, 1)  # sklearn expects a 2-D feature matrix
y = np.array(followers)
model = LinearRegression()
model.fit(X, y)

# The slope is the estimated number of additional followers per additional public repo
coefficient = model.coef_[0]

# Print the result
print(f"{coefficient:.3f}")

1.473


**Q11. Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?**

In [None]:
import pandas as pd

# Load the repository dump
repositories_df = pd.read_csv('repositories.csv')

# Turn both flags into 0/1 so a Pearson correlation can be computed on them
projects_enabled = repositories_df['has_projects'].astype(int)
wiki_enabled = repositories_df['has_wiki'].astype(int)

# Correlation between having projects enabled and having the wiki enabled
correlation = projects_enabled.corr(wiki_enabled)

# Print the result
print(f"{correlation:.3f}")

0.310


**Q12. Do hireable users follow more people than those who are not hireable?
Average of following per user for hireable=true minus the average following for the rest**

In [None]:
import pandas as pd

def analyze_following_difference(users_csv_path='users.csv'):
    """Compare average ``following`` of hireable users against everyone else.

    Reads the CSV at ``users_csv_path``, prints group sizes and group averages,
    and returns (hireable average - other average) rounded to 3 decimals.
    Rows whose ``hireable`` is anything other than True count as "the rest".
    """
    users = pd.read_csv(users_csv_path)

    # Split the frame once into the two groups being compared
    hireable_rows = users[users['hireable'] == True]
    other_rows = users[users['hireable'] != True]

    hireable_following = hireable_rows['following'].mean()
    non_hireable_following = other_rows['following'].mean()

    gap = round(hireable_following - non_hireable_following, 3)

    # Debug summary of both groups
    print(f"Number of hireable users: {len(hireable_rows)}")
    print(f"Number of non-hireable users: {len(other_rows)}")
    print(f"Average following for hireable users: {hireable_following:.3f}")
    print(f"Average following for non-hireable users: {non_hireable_following:.3f}")

    return gap

# Run the comparison on the default 'users.csv' and report the
# hireable-minus-rest difference in average following to three decimals.
result = analyze_following_difference()
print(f"\nDifference in average following: {result:.3f}")

Number of hireable users: 103
Number of non-hireable users: 372
Average following for hireable users: 75.728
Average following for non-hireable users: 909.801

Difference in average following: -834.073


**Q13. Some developers write long bios. Does that help them get more followers? What's the correlation of the length of their bio (in Unicode words, split by whitespace) with followers? (Ignore people without bios)
Regression slope of followers on bio word count**

In [None]:
import pandas as pd
import statsmodels.api as sm

# Load the users data from the CSV file
users_df = pd.read_csv('users.csv')

# Keep only users with a bio and add their whitespace-separated word count.
# .assign() builds a new DataFrame, so nothing is written into a slice of
# users_df (the previous in-place column assignment triggered pandas'
# SettingWithCopyWarning).
users_with_bios = users_df[users_df['bio'].notna()].assign(
    bio_word_count=lambda frame: frame['bio'].str.split().str.len()
)

# Prepare the data for regression
X = users_with_bios['bio_word_count']  # Independent variable
y = users_with_bios['followers']       # Dependent variable

# Add a constant so the model fits an intercept as well as the slope
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the regression slope (coefficient for bio_word_count)
slope = model.params['bio_word_count']

# Print the slope rounded to three decimal places
print(f'Regression slope of followers on bio word count: {slope:.3f}')

Regression slope of followers on bio word count: 40.518


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))


**Q14. Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated**

In [None]:
import csv
from datetime import datetime
from collections import Counter

# login -> number of repositories created on a Saturday or Sunday
weekend_repo_counts = Counter()

with open('repositories.csv', 'r', encoding='utf-8') as file:
    for row in csv.DictReader(file):
        stamp = row.get('created_at', '')
        if not stamp:
            continue
        weekday = datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%SZ').weekday()
        # weekday() numbers Monday as 0, so 5 and 6 are Saturday and Sunday
        if weekday in (5, 6):
            weekend_repo_counts[row.get('login', '')] += 1

# Print the five users with the most weekend repositories, comma-separated
print(','.join(login for login, _ in weekend_repo_counts.most_common(5)))


syzer,JonnyBurger,kynan,nicnocquee,shuhei


**Q15. Do people who are hireable share their email addresses more often? [fraction of users with email when hireable=true] minus [fraction of users with email for the rest]**

In [None]:
import pandas as pd

def analyze_email_sharing(users_csv_path='users.csv'):
    """Compare how often hireable users expose an email versus everyone else.

    Reads the CSV at ``users_csv_path``, prints a per-group summary, and returns
    (hireable fraction with email - other fraction with email) rounded to
    3 decimals. A group with no members contributes a fraction of 0.
    """
    users = pd.read_csv(users_csv_path)

    # An email counts as present when it is neither NaN nor an empty string
    users['has_email'] = users['email'].notna() & (users['email'] != '')

    hireable_mask = users['hireable'] == True
    non_hireable_mask = users['hireable'] != True

    def email_fraction(mask):
        # Share of the selected users that have an email; 0 when nobody is selected
        return users[mask]['has_email'].mean() if mask.any() else 0

    hireable_email_fraction = email_fraction(hireable_mask)
    non_hireable_email_fraction = email_fraction(non_hireable_mask)

    gap = round(hireable_email_fraction - non_hireable_email_fraction, 3)

    # Debug summary of both groups
    print(f"Total users: {len(users)}")
    print(f"Hireable users with email: {users[hireable_mask]['has_email'].sum()}/{hireable_mask.sum()}")
    print(f"Non-hireable users with email: {users[non_hireable_mask]['has_email'].sum()}/{non_hireable_mask.sum()}")
    print(f"Hireable fraction: {hireable_email_fraction:.3f}")
    print(f"Non-hireable fraction: {non_hireable_email_fraction:.3f}")

    return gap

# Run the email-sharing comparison on the default 'users.csv' and report the
# hireable-minus-rest difference in email fractions to three decimals.
result = analyze_email_sharing()
print(f"\nFinal result: {result:.3f}")

Total users: 475
Hireable users with email: 55/103
Non-hireable users with email: 174/372
Hireable fraction: 0.534
Non-hireable fraction: 0.468

Final result: 0.066


**Q16. Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)**

In [None]:
import csv
from collections import Counter

# Read every user's name, trimmed of surrounding whitespace
with open('users.csv', 'r', encoding='utf-8') as file:
    cleaned_names = [row.get('name', '').strip() for row in csv.DictReader(file)]

# The surname is the last whitespace-separated word; blank names are ignored
surname_counter = Counter(name.split()[-1] for name in cleaned_names if name)

if not surname_counter:
    print("No names found.")
else:
    # Every surname that reaches the highest count, in alphabetical order
    max_count = max(surname_counter.values())
    tied_surnames = sorted(
        surname for surname, count in surname_counter.items() if count == max_count
    )
    print(f"{','.join(tied_surnames)}: {max_count}")

Li,Wang: 4
