##Fetching##

In [None]:
# Install required libraries (requests)
!pip install requests

# Import necessary libraries
import requests
import csv
from google.colab import files

# Prompt for the GitHub Personal Access Token (PAT) with getpass so the secret
# is not echoed back into the notebook's cell output, as input() would do.
from getpass import getpass

# Strip surrounding whitespace that tends to sneak in when a token is pasted.
GITHUB_TOKEN = getpass("Please enter your GitHub Personal Access Token: ").strip()

# Authorization header sent with every GitHub API request made in this notebook.
headers = {'Authorization': f'token {GITHUB_TOKEN}'}

# Function to fetch GitHub users from Singapore with over 100 followers
def fetch_users_in_singapore():
    """Return all search hits for users located in Singapore with >100 followers.

    Requests the GitHub user-search endpoint 100 results per page and keeps
    following the ``next`` relation of the ``Link`` response header until no
    further page is advertised.

    Returns:
        list: The ``items`` entries of every page, concatenated in page order.

    Raises:
        requests.HTTPError: If a page is answered with an error status (for
            example a rejected token or an exhausted rate limit). Without this
            check the error payload would only surface as a ``KeyError`` on
            the missing ``items`` key.
        requests.Timeout: If the API does not answer within 30 seconds.
    """
    # NOTE(review): GitHub documents a ceiling of 1,000 results per search
    # query; confirm the result set for this query stays below that.
    users = []
    url = "https://api.github.com/search/users?q=location:singapore+followers:>100&per_page=100"

    while url:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()  # fail loudly instead of parsing an error body
        users.extend(response.json()['items'])

        # requests parses the Link header into response.links; no 'next'
        # entry means this was the last page.
        url = response.links.get('next', {}).get('url')

    return users

# Function to fetch detailed user information
def fetch_user_details(username):
    """Return the decoded JSON profile of a single GitHub user.

    Args:
        username: Login of the account to look up.

    Returns:
        The JSON body of ``GET /users/{username}``, decoded into Python objects.

    Raises:
        requests.HTTPError: If the API answers with an error status, so an
            error payload is never handed back as if it were a user profile.
        requests.Timeout: If the API does not answer within 30 seconds.
    """
    url = f"https://api.github.com/users/{username}"
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    return response.json()

# Function to fetch repositories for a user (up to 500 repositories)
def fetch_user_repos(username, max_repos=500):
    """Return the repositories of ``username``, capped at ``max_repos``.

    Pages through ``GET /users/{username}/repos`` 100 entries at a time by
    following the ``next`` relation of the ``Link`` response header, and stops
    requesting pages as soon as the cap has been reached.

    Args:
        username: Login of the account whose repositories are listed.
        max_repos: Maximum number of repositories to return. Defaults to 500,
            the limit stated for this helper; pass ``None`` for no limit.

    Returns:
        list: Repository dicts in the order the API returned them.

    Raises:
        requests.HTTPError: If a page is answered with an error status.
            Without this check an error payload (a dict) would be merged into
            the result as a list of its keys.
        requests.Timeout: If the API does not answer within 30 seconds.
    """
    repos = []
    url = f"https://api.github.com/users/{username}/repos?per_page=100"

    while url and (max_repos is None or len(repos) < max_repos):
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        repos.extend(response.json())

        # No 'next' relation in the Link header means the listing is complete.
        url = response.links.get('next', {}).get('url')

    # The last page fetched may overshoot the cap, so trim the tail.
    return repos if max_repos is None else repos[:max_repos]

# Helper function to clean company names
def clean_company_name(company):
    """Normalise a company string so equal employers compare equal.

    Surrounding whitespace is trimmed, leading ``@`` characters are removed,
    whitespace exposed by removing the ``@`` is trimmed as well, and the
    result is upper-cased.

    Args:
        company: Raw company value; may be ``None`` or empty.

    Returns:
        str: The cleaned, upper-cased name, or ``''`` for a falsy input.
    """
    if not company:
        return ''
    # The second strip() covers inputs such as "@ Shopee", where removing the
    # "@" would otherwise leave a leading space in the cleaned value.
    return company.strip().lstrip('@').strip().upper()

# Write users data to CSV
def write_users_csv(users):
    """Write one row per user to ``users.csv`` in the current directory.

    Args:
        users: Iterable of user dicts. ``login``, ``name``, ``location``,
            ``public_repos``, ``followers``, ``following`` and ``created_at``
            must be present; ``company``, ``email``, ``hireable`` and ``bio``
            are optional.

    Raises:
        KeyError: If a user dict lacks one of the required keys.
    """
    columns = ["login", "name", "company", "location", "email", "hireable",
               "bio", "public_repos", "followers", "following", "created_at"]

    # UTF-8 is set explicitly: names and bios can contain non-ASCII text, and
    # the platform default encoding is not guaranteed to be able to encode it.
    with open('users.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(columns)

        for user in users:
            writer.writerow([
                user['login'],
                user['name'],
                clean_company_name(user.get('company', '')),
                user['location'],
                user.get('email', ''),
                # The False default only applies when the key is absent. The
                # csv module writes None as an empty field and booleans as
                # "True"/"False".
                user.get('hireable', False),
                user.get('bio', ''),
                user['public_repos'],
                user['followers'],
                user['following'],
                user['created_at']
            ])

# Write repositories data to CSV
def write_repos_csv(repos):
    """Dump repository records to ``repositories.csv`` in the current directory.

    Each row holds the owner's login, seven fields copied straight from the
    repository dict, and the licence ``key`` (stored in the ``license_name``
    column), which is left empty when the repository has no licence entry.

    Args:
        repos: Iterable of repository dicts carrying an ``owner`` dict with a
            ``login`` plus the copied fields; ``license`` may be missing or
            ``None``.
    """
    # Fields taken from each repository dict unchanged, in column order.
    copied_fields = ("full_name", "created_at", "stargazers_count",
                     "watchers_count", "language", "has_projects", "has_wiki")

    def to_row(entry):
        # Owner login first, licence key last, copied fields in between.
        row = [entry['owner']['login']]
        row.extend(entry[field] for field in copied_fields)
        row.append(entry['license']['key'] if entry.get('license') else '')
        return row

    with open('repositories.csv', mode='w', newline='') as out:
        sheet = csv.writer(out)
        sheet.writerow(["login", *copied_fields, "license_name"])
        for entry in repos:
            sheet.writerow(to_row(entry))

# Write README.md file
def write_readme():
    """Create ``README.md`` in the current directory describing the two CSV files."""
    readme_lines = (
        "# GitHub Users from Singapore\n",
        "This repository contains data about GitHub users from Singapore with over 100 followers and their public repositories.\n",
        "The following files are included:\n",
        "- `users.csv`: Contains user information.\n",
        "- `repositories.csv`: Contains user repository information.\n",
    )
    with open('README.md', 'w') as readme:
        readme.writelines(readme_lines)

# Main function to orchestrate the process
def main():
    """Run the full scrape: search users, fetch profiles and repos, write files.

    Writes ``users.csv``, ``repositories.csv`` and ``README.md`` through the
    writer helpers defined in this notebook.
    """
    # Search hits only carry summary data, so each login is looked up again
    # for its full profile.
    search_hits = fetch_users_in_singapore()
    detailed_users = [fetch_user_details(hit['login']) for hit in search_hits]

    # Flatten every user's repository listing into a single list.
    repos = [
        repo
        for profile in detailed_users
        for repo in fetch_user_repos(profile['login'])
    ]

    # Persist the results.
    write_users_csv(detailed_users)
    write_repos_csv(repos)
    write_readme()

# Run the scrape end to end
main()

print("Data scraping complete! CSV files and README.md generated.")

# Offer each generated file as a download to the local machine
for generated_file in ('users.csv', 'repositories.csv', 'README.md'):
    files.download(generated_file)

##Q1##

In [None]:
import pandas as pd

# Read the scraped users table
users_df = pd.read_csv('/content/users.csv')

# Keep rows whose location mentions "Singapore" in any letter case; rows
# without a location are excluded
in_singapore = users_df['location'].str.contains('Singapore', case=False, na=False)
singapore_users = users_df[in_singapore]

# Rank by follower count, highest first, and keep the five leaders
ranked_by_followers = singapore_users.sort_values(by='followers', ascending=False)
top_5_singapore_users = ranked_by_followers.head(5)

# Show their logins as a single comma-separated line
top_5_logins = top_5_singapore_users['login'].tolist()
top_5_logins_str = ', '.join(top_5_logins)
print(top_5_logins_str)


yyx990803, halfrost, DIYgod, yangshun, bytedance


##Q2##

In [None]:
import pandas as pd

# Load the users.csv file into a DataFrame
users_df = pd.read_csv('/content/users.csv')

# Filter users based in Singapore ('location' contains 'Singapore', any case).
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of users_df and trigger SettingWithCopyWarning.
in_singapore = users_df['location'].str.contains('Singapore', case=False, na=False)
singapore_users = users_df[in_singapore].copy()

# Convert 'created_at' to datetime so sorting is chronological, not textual
singapore_users['created_at'] = pd.to_datetime(singapore_users['created_at'])

# Sort the users by 'created_at' in ascending order and keep the first five
earliest_users = singapore_users.sort_values(by='created_at').head(5)

# Get the logins of the 5 earliest registered users
earliest_logins = earliest_users['login'].tolist()

# Join the logins into a comma-separated string
earliest_logins_str = ', '.join(earliest_logins)

# Print the result
print(earliest_logins_str)


chuyeow, choonkeat, winston, cheeaun, nowa


##Q3##

In [None]:
import pandas as pd

# Read the scraped repositories table
repositories_df = pd.read_csv('/content/repositories.csv')

# Keep only repositories with a licence value that is neither missing nor blank
license_column = repositories_df['license_name']
has_license = license_column.notna() & (license_column != '')
repositories_filtered = repositories_df[has_license]

# Tally the licences (value_counts orders them most frequent first) and keep
# the leading three
license_counts = repositories_filtered['license_name'].value_counts()
top_3_licenses = license_counts.head(3)

# Show the three licence values as a single comma-separated line
top_3_licenses_str = ', '.join(top_3_licenses.index)
print(top_3_licenses_str)


mit, apache-2.0, other


##Q4##

In [None]:
import pandas as pd

# Read the scraped users table
users_df = pd.read_csv('/content/users.csv')

# Drop users without a company and trim whitespace around the remaining values
known_companies = users_df['company'].dropna()
cleaned_companies = known_companies.str.strip()

# Tally how many users list each company
company_counts = cleaned_companies.value_counts()

# The label with the highest tally is the most common company
most_common_company = company_counts.idxmax()
print(most_common_company)


NATIONAL UNIVERSITY OF SINGAPORE


##Q5##

In [None]:
import pandas as pd

# Read the scraped repositories table
repositories_df = pd.read_csv('/content/repositories.csv')

# Drop repositories without a language and trim whitespace around the rest
known_languages = repositories_df['language'].dropna()
cleaned_languages = known_languages.str.strip()

# Tally how many repositories use each language
language_counts = cleaned_languages.value_counts()

# The label with the highest tally is the most popular language
most_popular_language = language_counts.idxmax()
print(most_popular_language)


JavaScript


##Q6##

In [None]:
import pandas as pd

# Read both scraped tables
repositories_df = pd.read_csv('/content/repositories.csv')
users_df = pd.read_csv('/content/users.csv')

# Parse the sign-up timestamps so they can be compared as dates
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Keep users created after the cut-off.
# NOTE(review): the cut-off is "later than 2020-01-01", so accounts created
# during 2020 are included - confirm that matches "joined after 2020".
joined_after_cutoff = users_df['created_at'] > '2020-01-01'
users_after_2020 = users_df[joined_after_cutoff]

# Restrict the repositories to those owned by the selected users
owned_by_recent_user = repositories_df['login'].isin(users_after_2020['login'])
filtered_repositories_df = repositories_df[owned_by_recent_user]

# Drop repositories without a language and trim whitespace around the rest
cleaned_languages = filtered_repositories_df['language'].dropna().str.strip()

# Tally the languages; value_counts orders them most frequent first
language_counts = cleaned_languages.value_counts()

# The runner-up sits at position 1; None when fewer than two languages exist
second_most_popular_language = language_counts.index[1] if len(language_counts) > 1 else None

print(second_most_popular_language)


Python


##Q7##

In [None]:
import pandas as pd

# Read the scraped repositories table
repositories_df = pd.read_csv('/content/repositories.csv')

# Discard repositories whose language is missing, then those whose language
# is blank once whitespace is trimmed
cleaned_repositories_df = repositories_df.dropna(subset=['language'])
language_not_blank = cleaned_repositories_df['language'].str.strip() != ''
cleaned_repositories_df = cleaned_repositories_df[language_not_blank]

# Mean star count per language
stars_by_language = cleaned_repositories_df.groupby('language')['stargazers_count']
language_avg_stars = stars_by_language.mean()

# The language with the highest mean star count
most_popular_language = language_avg_stars.idxmax()
print(most_popular_language)


Inno Setup


##Q8##

In [None]:
import pandas as pd

# Read the scraped users table
users_df = pd.read_csv('/content/users.csv')

# leader_strength = followers / (1 + following); the added 1 keeps the
# denominator non-zero for users who follow nobody
following_plus_one = users_df['following'].add(1)
users_df['leader_strength'] = users_df['followers'].div(following_plus_one)

# Rank by leader_strength, strongest first, and keep the top five
ranked_by_strength = users_df.sort_values(by='leader_strength', ascending=False)
top_5_leader_strength_users = ranked_by_strength.head(5)

# Show their logins, in rank order, as a single comma-separated line
top_5_logins = top_5_leader_strength_users['login'].tolist()
print(", ".join(top_5_logins))


bytedance, Jinjiang, cloudflare, JamesNK, Shib-Chain


##Q9##

In [None]:
import pandas as pd
from scipy.stats import pearsonr

# Read the scraped users table
df = pd.read_csv('/content/users.csv')

# Coerce both columns to numbers; anything unparseable becomes NaN
for numeric_column in ('followers', 'public_repos'):
    df[numeric_column] = pd.to_numeric(df[numeric_column], errors='coerce')

# Rows with a gap in either column cannot take part in the correlation
df_clean = df.dropna(subset=['followers', 'public_repos'])

# Pearson correlation coefficient; the p-value is not used
correlation, _p_value = pearsonr(df_clean['followers'], df_clean['public_repos'])

# Report the coefficient to three decimal places
print(f"Correlation between followers and public repositories: {correlation:.3f}")


Correlation between followers and public repositories: 0.046


##Q10##

In [None]:
# Import pandas in this cell, like every other cell does, so it also runs in
# a fresh kernel instead of relying on an earlier cell having defined `pd`.
import pandas as pd
from sklearn.linear_model import LinearRegression

# Load the users.csv file into a DataFrame
df = pd.read_csv('/content/users.csv')

# Regress followers (target) on public_repos (single feature)
x = df['public_repos']
y = df['followers']
xx = pd.DataFrame(x)  # the feature is passed to fit() as a one-column DataFrame

model = LinearRegression()
model.fit(xx, y)

# Left as the final bare expression so the notebook displays the coefficient
model.coef_

array([1.44094965])

##Q11##

In [None]:
import pandas as pd

# Read the scraped repositories table
repositories_data = pd.read_csv('/content/repositories.csv')

# Both flags must be present for a repository to take part in the correlation
flag_columns = ['has_projects', 'has_wiki']
repositories_data = repositories_data.dropna(subset=flag_columns)

# Correlate the two flag columns with each other
projects_flag = repositories_data['has_projects']
wiki_flag = repositories_data['has_wiki']
correlation = projects_flag.corr(wiki_flag)

# Print the correlation value at full precision (no rounding is applied)
print(correlation)


0.2982612297054302


##Q12##

In [None]:
import pandas as pd

# Read the scraped users table
users_data = pd.read_csv('/content/users.csv')

# Split the users by their hireable flag.
# NOTE(review): rows whose 'hireable' is blank/NaN equal neither True nor
# False, so they fall into neither group - confirm that is intended.
is_hireable = users_data['hireable'] == True
is_not_hireable = users_data['hireable'] == False
hireable_users = users_data[is_hireable]
non_hireable_users = users_data[is_not_hireable]

# Mean number of accounts followed within each group
avg_following_hireable = hireable_users['following'].mean()
avg_following_non_hireable = non_hireable_users['following'].mean()

# Hireable mean minus non-hireable mean
following_difference = avg_following_hireable - avg_following_non_hireable

# Print the difference at full precision (no rounding is applied)
print(following_difference)


221.6741501027404


##Q13##

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

# Load the users.csv file
users_df = pd.read_csv('/content/users.csv')

# Keep only users that have a bio. .copy() gives an independent frame, so
# adding a column below does not write into a slice of users_df and trigger
# SettingWithCopyWarning.
users_with_bios = users_df[users_df['bio'].notnull()].copy()

# Length of each bio in whitespace-separated words
users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))

# Prepare the single feature (as a one-column frame) and the target variable
X = users_with_bios[['bio_word_count']]
y = users_with_bios['followers']

# Fit the linear regression model
model = LinearRegression()
model.fit(X, y)

# Get the regression slope (coefficient of bio_word_count)
slope = model.coef_[0]

# Print the slope at full precision (no rounding is applied)
print(slope)


37.43202541943021


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))


##Q14##

In [None]:
import pandas as pd

# Read the scraped repositories table
repos_data = pd.read_csv('/content/repositories.csv')

# Parse creation timestamps and derive the weekday number (Monday=0 ... Sunday=6)
repos_data['created_at'] = pd.to_datetime(repos_data['created_at'])
repos_data['day_of_week'] = repos_data['created_at'].dt.dayofweek

# Weekday numbers 5 and 6 are Saturday and Sunday
created_on_weekend = repos_data['day_of_week'] >= 5
weekend_repos = repos_data[created_on_weekend]

# Tally weekend-created repositories per owner, most prolific first
user_weekend_counts = weekend_repos['login'].value_counts()

# Keep the five owners with the most weekend repositories
top_5_users = user_weekend_counts.head(5)

# Show their logins as a single comma-separated line
top_5_logins = ', '.join(top_5_users.index)
print(f"Top 5 users by repositories created on weekends: {top_5_logins}")


##Q15##

In [None]:
import pandas as pd

# Load the users table in this cell; `users` is not defined as a DataFrame by
# any other cell shown, so the original raised NameError in a fresh kernel.
users = pd.read_csv('/content/users.csv')

# Share of users with an email on record, computed separately per hireable flag.
# NOTE(review): rows whose 'hireable' is blank/NaN equal neither True nor
# False, so they fall into neither group - confirm that is intended.
fraction_hierable = users[users['hireable'] == True]['email'].notna().mean()
fraction_non_hierable = users[users['hireable'] == False]['email'].notna().mean()

# Hireable share minus non-hireable share; left as the final bare expression
# so the notebook displays it
diff = fraction_hierable - fraction_non_hierable
diff

0.07977132626696237

##Q16##

In [None]:
import pandas as pd

# Read the scraped user records (expects users.csv under /content)
users_csv_path = '/content/users.csv'
users_data = pd.read_csv(users_csv_path)

# Function to extract surname (everything after the first space)
def extract_surname(name):
    """Return the part of ``name`` that follows its first space.

    The name is trimmed of surrounding whitespace first. Non-string values,
    blank strings and names without a space all yield ``''``.
    """
    if not isinstance(name, str):
        return ''  # e.g. NaN for a missing name

    trimmed = name.strip()
    if not trimmed:
        return ''

    # partition() splits on the first space only; the tail is '' when the
    # trimmed name contains no space at all.
    _first_word, _space, remainder = trimmed.partition(' ')
    return remainder

# Derive a surname for every user from the 'name' column
users_data['surname'] = users_data['name'].apply(extract_surname)

# Users for whom no surname could be derived are left out of the tally
has_surname = users_data['surname'] != ''
users_data = users_data[has_surname]

# Tally how many users share each surname
surname_counts = users_data['surname'].value_counts()

# Keep every surname tied for the highest tally
highest_tally = surname_counts.max()
most_common_surnames = surname_counts[surname_counts == highest_tally]

# Tied surnames are reported in alphabetical order
most_common_surnames_list = sorted(most_common_surnames.index)

# Number of users carrying the most common surname(s)
num_users_with_most_common_surname = most_common_surnames.max()

# Report the surname(s) and the tally
print(f"Most common surname(s): {', '.join(most_common_surnames_list)}")
print(f"Number of users with the most common surname: {num_users_with_most_common_surname}")
