In [2]:
import pandas as pd
users_df = pd.read_csv('users.csv')
repos_df = pd.read_csv('repositories.csv')

In [3]:
# 1. Top 5 users in Sydney with the highest number of followers
top_5_followers = users_df.nlargest(5, 'followers')['login'].tolist()
print('Top 5 users with highest followers:', ', '.join(top_5_followers))

Top 5 users with highest followers: nicknochnack, brendangregg, cornflourblue, 0vm, davecheney


In [4]:
# 2. Earliest 5 users
earliest_5_users = users_df.sort_values(by='created_at').head(5)['login'].tolist()
print('Earliest 5 users:', ', '.join(earliest_5_users))

Earliest 5 users: dylanegan, cjheath, freshtonic, dhowden, mikel


In [5]:
# 3. 3 most popular licenses among users (ignoring missing licenses)
licenses_count = repos_df['license_name'].dropna().value_counts().head(3).index.tolist()
print('Top 3 licenses:', ', '.join(licenses_count))

Top 3 licenses: mit, other, apache-2.0


In [6]:
# 4. Company where the majority of these developers work
most_common_company = users_df['company'].mode()[0] if not users_df['company'].isna().all() else 'None'
print('Most common company:', most_common_company)

Most common company: ATLASSIAN


In [7]:
# 5. Most popular programming language
most_popular_language = repos_df['language'].mode()[0]
print('Most popular language:', most_popular_language)

Most popular language: JavaScript


In [8]:
# 6. Second most popular language among users who joined after 2020
users_after_2020 = users_df[users_df['created_at'] > '2020-01-01']
repos_after_2020 = repos_df[repos_df['login'].isin(users_after_2020['login'])]
second_popular_language = repos_after_2020['language'].value_counts().index[1]
print('Second most popular language (after 2020):', second_popular_language)

Second most popular language (after 2020): JavaScript


In [9]:
# 7. Language with the highest average stars per repository
language_stars_avg = repos_df.groupby('language')['stargazers_count'].mean().sort_values(ascending=False)
highest_avg_stars_language = language_stars_avg.idxmax()
print('Language with highest average stars:', highest_avg_stars_language)

Language with highest average stars: Mermaid


In [10]:
# 8. Top 5 users by leader_strength
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
top_5_leader_strength = users_df.nlargest(5, 'leader_strength')['login'].tolist()
print('Top 5 leader strength users:', ', '.join(top_5_leader_strength))

Top 5 leader strength users: brendangregg, cornflourblue, Canva, nicknochnack, 0vm


In [11]:
# 9. Correlation between followers and public repositories
followers_repos_corr = users_df[['followers', 'public_repos']].corr().iloc[0, 1]
print('Correlation between followers and public repos:', round(followers_repos_corr, 3))

Correlation between followers and public repos: 0.033


In [12]:
from sklearn.linear_model import LinearRegression

# 10. Regression: additional followers per public repository
X_repos = users_df[['public_repos']].values
y_followers = users_df['followers'].values
reg_model = LinearRegression().fit(X_repos, y_followers)
followers_per_repo_slope = reg_model.coef_[0]
print('Followers per repo slope:', round(followers_per_repo_slope, 3))

Followers per repo slope: 0.063


In [13]:
# 11. Correlation between having projects and having wiki enabled
projects_wiki_corr = repos_df[['has_projects', 'has_wiki']].corr().iloc[0, 1]
print('Correlation between projects and wiki:', round(projects_wiki_corr, 3))

Correlation between projects and wiki: 0.262


In [14]:
# 12. Difference in average following between hireable and non-hireable users
avg_following_hireable = users_df[users_df['hireable'] == True]['following'].mean()
avg_following_non_hireable = users_df[users_df['hireable'] == False]['following'].mean()
following_diff = avg_following_hireable - avg_following_non_hireable
print('Following difference between hireable and non-hireable:', round(following_diff, 3))

Following difference between hireable and non-hireable: 52.602


In [15]:
# 13. Regression of followers on bio word count (ignore empty bios)
users_df['bio_word_count'] = users_df['bio'].fillna('').apply(lambda x: len(x.split()))
users_with_bio = users_df[users_df['bio_word_count'] > 0]
X_bio = users_with_bio[['bio_word_count']].values
y_bio_followers = users_with_bio['followers'].values
bio_reg_model = LinearRegression().fit(X_bio, y_bio_followers)
followers_per_bio_word_slope = bio_reg_model.coef_[0]
print('Followers per bio word slope:', round(followers_per_bio_word_slope, 3))

Followers per bio word slope: -9.724


In [16]:
# 14. Users with the most repositories created on weekends (UTC)
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])
repos_df['created_weekend'] = repos_df['created_at'].dt.weekday >= 5
weekend_repos_count = repos_df[repos_df['created_weekend']].groupby('login').size().nlargest(5).index.tolist()
print('Top 5 users with most weekend repos:', ', '.join(weekend_repos_count))

Top 5 users with most weekend repos: timgates42, pinkforest, mikeyhodl, ozbillwang, XertroV


In [17]:
# 15. Fraction of users with email when hireable vs non-hireable
hireable_with_email = users_df[users_df['hireable'] == True]['email'].notna().mean()
non_hireable_with_email = users_df[users_df['hireable'] == False]['email'].notna().mean()
email_fraction_diff = hireable_with_email - non_hireable_with_email
print('Email share difference (hireable vs non-hireable):', round(email_fraction_diff, 3))

Email share difference (hireable vs non-hireable): 0.046


In [18]:
# 16. Most common surname (based on the last word in the name)
users_df['surname'] = users_df['name'].fillna('').apply(lambda x: x.strip().split()[-1] if x else '')
surname_counts = users_df['surname'].value_counts()
most_common_surnames = surname_counts[surname_counts == surname_counts.max()].index.tolist()
print('Most common surname(s):', ', '.join(sorted(most_common_surnames)))

Most common surname(s): 


In [19]:
# Improved code to find the most common surname
import pandas as pd

# Load users.csv
users_df = pd.read_csv('users.csv')

# Clean the 'name' column and extract the surname
users_df['name'] = users_df['name'].fillna('').str.strip()  # Remove leading/trailing spaces and handle NaN
users_df['surname'] = users_df['name'].apply(lambda x: x.split()[-1] if len(x.split()) > 1 else '')

# Filter out rows where surname is still empty
users_with_surnames = users_df[users_df['surname'] != '']

# Get the most common surname(s)
surname_counts = users_with_surnames['surname'].value_counts()
most_common_surnames = surname_counts[surname_counts == surname_counts.max()].index.tolist()

# Output the result
print('Most common surname(s):', ', '.join(sorted(most_common_surnames)))

Most common surname(s): Wu, Zhang
