In [53]:
import pandas as pd
import numpy as np
from datetime import datetime

# Load data
users_df = pd.read_csv('users.csv')
repos_df = pd.read_csv('repositories.csv')


In [54]:


top_5_followers = users_df.nlargest(5, 'followers')['login']
print("Top 5 users in Dublin with the highest number of followers")
print(top_5_followers)


Top 5 users in Dublin with the highest number of followers
0                    orta
1           jeromeetienne
2              jonataslaw
3    steventroughtonsmith
4                    axic
Name: login, dtype: object


In [55]:


earliest_users = users_df.sort_values('created_at').head(5)['login']
print("5 earliest registered GitHub users in Dublin")
print(earliest_users)



5 earliest registered GitHub users in Dublin
75         paulca
443        adrian
85     GavinJoyce
74           amir
437     ciaranlee
Name: login, dtype: object


In [56]:

top_3_licenses = repos_df['license_name'].dropna().value_counts().head(3).index.tolist()
print("3 most popular licenses among these users")
print(top_3_licenses)



3 most popular licenses among these users
['mit', 'apache-2.0', 'other']


In [57]:

most_common_company = users_df['company'].mode()[0]
print("Company with the majority of developers")
print(most_common_company)



Company with the majority of developers
MICROSOFT


In [58]:

most_popular_language = repos_df['language'].mode()[0]
print("Most popular programming language")
print(most_popular_language)



Most popular programming language
JavaScript


In [59]:

users_after_2020 = users_df[users_df['created_at'] > '2020-01-01']
second_most_popular_language = repos_df[repos_df['login'].isin(users_after_2020['login'])]['language'].value_counts().index[1]
print("Second most popular language among users who joined after 2020")
print(second_most_popular_language)



Second most popular language among users who joined after 2020
JavaScript


In [60]:

avg_stars_by_language = repos_df.groupby('language')['stargazers_count'].mean().idxmax()
print("Language with the highest average stars per repository")
print(avg_stars_by_language)



Language with the highest average stars per repository
MDX


In [61]:

users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
top_5_leader_strength = users_df.nlargest(5, 'leader_strength')['login']
print("Top 5 users by leader_strength")
print(top_5_leader_strength)



Top 5 users by leader_strength
6     flaviohenriquealmeida
12                  zalando
15               AnikSarker
19                      wix
28           CardinalHealth
Name: login, dtype: object


In [62]:

correlation_followers_repos = users_df['followers'].corr(users_df['public_repos'])
print("Correlation between followers and public repositories")
print(f"{correlation_followers_repos:.3f}")



Correlation between followers and public repositories
0.555


In [63]:

from scipy.stats import linregress
slope, intercept, r_value, p_value, std_err = linregress(users_df['public_repos'], users_df['followers'])
print("Regression slope of followers on repos")
print(f"{slope:.3f}")



Regression slope of followers on repos
2.828


In [64]:

projects_wiki_correlation = repos_df['has_projects'].astype(bool).corr(repos_df['has_wiki'].astype(bool))
print("Correlation between having projects and wiki enabled")
print(f"{projects_wiki_correlation:.3f}")


Correlation between having projects and wiki enabled
0.318


In [65]:

hireable_following_diff = users_df[users_df['hireable'] == True]['following'].mean() - users_df[users_df['hireable'] == False]['following'].mean()
print("Difference in following between hireable and non-hireable users")
print(f"{hireable_following_diff:.3f}")



Difference in following between hireable and non-hireable users
nan


In [66]:

users_with_bio = users_df.dropna(subset=['bio'])
users_with_bio['bio_length'] = users_with_bio['bio'].apply(lambda x: len(x.split()))
slope_bio_followers, _, _, _, _ = linregress(users_with_bio['bio_length'], users_with_bio['followers'])
print("Correlation of bio length with followers")
print(f"{slope_bio_followers:.3f}")



Correlation of bio length with followers
7.183


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bio['bio_length'] = users_with_bio['bio'].apply(lambda x: len(x.split()))


In [67]:

repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])
repos_df['created_weekend'] = repos_df['created_at'].dt.dayofweek >= 5
weekend_repo_counts = repos_df[repos_df['created_weekend']].groupby('login').size()
top_5_weekend_creators = weekend_repo_counts.nlargest(5).index.tolist()
print("User with most repositories created on weekends")
print(top_5_weekend_creators)



User with most repositories created on weekends
['orta', 'joshuacassidy', 'No9', 'wafuwafu13', 'lmammino']


In [68]:

hireable_email_fraction = users_df[users_df['hireable'] == True]['email'].notnull().mean()
non_hireable_email_fraction = users_df[users_df['hireable'] == False]['email'].notnull().mean()
email_fraction_diff = hireable_email_fraction - non_hireable_email_fraction
print("Difference in email-sharing fraction between hireable and non-hireable users")
print(f"{email_fraction_diff:.3f}")



Difference in email-sharing fraction between hireable and non-hireable users
nan


In [69]:

users_with_names = users_df.dropna(subset=['name'])
users_with_names['surname'] = users_with_names['name'].apply(lambda x: x.strip().split()[-1])
most_common_surname = users_with_names['surname'].value_counts().nlargest(1).index.tolist()
print("Most common surname")
print(sorted(most_common_surname))


Most common surname
["O'Sullivan"]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_names['surname'] = users_with_names['name'].apply(lambda x: x.strip().split()[-1])
