In [22]:
import pandas as pd
import numpy as np
import csv
import statsmodels.api as sm

from collections import Counter
from datetime import datetime

#### Q1.  Who are the top 5 users in Mumbai with the highest number of followers? List their login in order, comma-separated.

In [3]:
users = pd.read_csv('users.csv')

In [4]:
top5 = users.sort_values(by='followers', ascending=False).head()
print(','.join(top5['login'].tolist()))

ValentineFernandes,kovidgoyal,slidenerd,aryashah2k,coding-parrot


#### Q2. Who are the 5 earliest registered GitHub users in Mumbai? List their login in ascending order of created_at, comma-separated.

In [5]:
users['created_at'] = pd.to_datetime(users['created_at'])
top_earliest = users.sort_values(by='created_at').head()
print(','.join(top_earliest['login'].tolist()))

ivank,sandeepshetty,svs,nitinhayaran,nischal


#### Q3. What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.

In [6]:
repos = pd.read_csv('repositories.csv')
print(','.join(repos['license_name'].value_counts().head(3).keys()))

mit,apache-2.0,other


#### Q4. Which company do the majority of these developers work at?

In [8]:
users['company'].value_counts().head(1).keys()[0]

'MASAI SCHOOL'

#### Q5. Which programming language is most popular among these users?

In [9]:
repos['language'].value_counts().head(1).keys()[0]

'JavaScript'

#### Q6. Which programming language is the second most popular among users who joined after 2020?

In [12]:
users_after_2020 = users[users['created_at'] > '2020-01-01']
repos_2020 = repos[repos['login'].isin(users_after_2020['login'].tolist())]
repos_2020['language'].value_counts().head(2).keys()[-1]

'HTML'

#### Q7. Which language has the highest average number of stars per repository? 

In [13]:
avg_stars = repos.groupby('language')['stargazers_count'].mean()
top_lang = avg_stars.idxmax()
top_stars = avg_stars.max()
print(top_lang, top_stars)

TSQL 571.4615384615385


#### Q8. Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.

In [14]:
users['leader_strength'] = users['followers'] / (1 + users['following'])
top5_lead = users.sort_values(by='leader_strength', ascending=False).head()
print(','.join(top5_lead['login'].tolist()))

kovidgoyal,coding-parrot,gkcs,slidenerd,dmalvia


#### Q9. What is the correlation between the number of followers and the number of public repositories among users in Basel?

In [36]:
correlation = users['followers'].corr(users['public_repos'])
f"{correlation:.3f}"

'0.035'

#### Q10. Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.

In [16]:
followers = []
public_repos = []

with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        followers.append(int(row['followers']))
        public_repos.append(int(row['public_repos']))
    
if len(followers) > 1 and len(public_repos) > 1:
    slope, intercept = np.polyfit(public_repos, followers, 1)
    print(f"{slope:.3f}")
else:
    print("Error")

0.102


#### Q11. Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?

In [17]:
correlation = repos['has_projects'].corr(repos['has_wiki'])
f"{correlation:.3f}"

'0.160'

#### Q12. Do hireable users follow more people than those who are not hireable?

In [18]:
hireable_avg_following = users[users['hireable'] == True]['following'].mean()
non_hireable_avg_following = users[users['hireable'] == False]['following'].mean()
difference = hireable_avg_following - non_hireable_avg_following
f"{difference:.3f}"

'8.663'

#### Q13. Some developers write long bios. Does that help them get more followers? What's the correlation of the length of their bio (in Unicode characters) with followers? (Ignore people without bios)

In [21]:
users_with_bios = users[users['bio'].notna()]
users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))

X = sm.add_constant(users_with_bios['bio_word_count'])
model = sm.OLS(users_with_bios['followers'], X).fit()
slope = model.params['bio_word_count']

print(f'Regression slope of followers on bio word count: {slope:.3f}')

Regression slope of followers on bio word count: -0.486


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))


#### Q14. Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

In [23]:
weekend_repo_counts = Counter()

with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    
    for row in reader:
        created_at = row.get('created_at', '')
        if created_at:
            created_date = datetime.fromisoformat(created_at[:-1])  
            if created_date.weekday() in [5, 6]:
                user_login = row['login']
                weekend_repo_counts[user_login] += 1  

top_users = weekend_repo_counts.most_common(5)
print(','.join([user[0] for user in top_users]))

Kushal334,alokproc,vinod1988,patilswapnilv,rajeshpillai


#### Q15. Do people who are hireable share their email addresses more often?

In [24]:
fraction_hierable = users[users['hireable'] == True]['email'].notna().mean()
fraction_non_hierable = users[users['hireable'] == False]['email'].notna().mean()
diff = fraction_hierable - fraction_non_hierable
f"{diff:.3f}"

'0.223'

#### Q16. Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)

In [25]:
new_users = users[users['name'].notna()].copy()
new_users['surname'] = new_users['name'].str.split().str[-1].str.strip()
surname_counts = new_users['surname'].value_counts()
common_surnames = surname_counts[surname_counts == surname_counts.max()].index.tolist()
common_surnames.sort()
print(','.join(common_surnames))

Singh
