In [None]:
import requests
import csv
from urllib.parse import urljoin

def get_repositories(token, username, per_page=100):
    headers = {'Authorization': f'token {token}'}
    url = f'https://api.github.com/users/{username}/repos'
    repos = []
    page = 1

    while True:
        params = {'per_page': per_page, 'page': page}

        try:
            response = requests.get(url, headers=headers, params=params)
            response.raise_for_status()

            data = response.json()

            if not data:
                break

            for repo in data:
                license_name = None
                if repo.get('license') is not None:
                    license_name = repo.get('license', {}).get('name', None)

                repo_data = {
                    'login': username,  # Add user login from username
                    'full_name': repo['full_name'],
                    'created_at': repo['created_at'],
                    'stargazers_count': repo['stargazers_count'],
                    'watchers_count': repo['watchers_count'],
                    'language': repo.get('language', None),
                    'has_projects': repo.get('has_projects', False),
                    'has_wiki': repo.get('has_wiki', False),
                    'license_name': license_name
                }

                repos.append(repo_data)

            if len(repos) >= 500:
                break  # Reached desired limit

            page += 1

        except requests.exceptions.RequestException as e:
            print(f"Error fetching repos for {username}: {e}")
            break

    return repos

def write_to_csv(data, filename='repositories.csv'):
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        fieldnames = ['login', 'full_name', 'created_at', 'stargazers_count', 'watchers_count', 'language', 'has_projects', 'has_wiki', 'license_name']
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        for row in data:
            writer.writerow(row)

    print(f"CSV file created: {filename}")

def main():
    token = 'My_token'#removed my token for security purposes

    # Assuming users.csv is accessible via the provided link
    # user_data_url = 'https://drive.google.com/file/d/1uevdg9JbkYlxVwXdbxIDl-7MfpjkyZNA/view?usp=sharing'
    user_data = []

    # Download users.csv (implement your preferred download method)
    # ... (download logic)

    # Read user data from downloaded users.csv
    with open('stockholm_users.csv', mode='r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            user_data.append(row)

    repos_data = []
    for user in user_data:
        repos = get_repositories(token, user['login'])
        repos_data.extend(repos[:500])  # Limit to 500 repositories

    write_to_csv(repos_data)

if __name__ == "__main__":
    main()

CSV file created: repositories.csv


In [37]:
import csv
import datetime

def get_earliest_users(filename):
    """Reads a CSV file, sorts users by creation date, and returns the top 5 earliest users from Stockholm.

    Args:
        filename: The path to the CSV file.

    Returns:
        A list of the top 5 earliest users' logins.
    """

    with open(filename, 'r') as file:
        reader = csv.DictReader(file)
        users = []
        for row in reader:
            if row['location'] == 'Stockholm':
                user_data = {
                    'login': row['login'],
                    'created_at': datetime.datetime.strptime(row['created_at'], '%Y-%m-%dT%H:%M:%SZ')
                }
                users.append(user_data)

    # Sort users by creation date
    users.sort(key=lambda x: x['created_at'])

    # Return the top 5 earliest users' logins
    return [user['login'] for user in users[:5]]

# Assuming 'users.csv' is in the same directory
filename = 'stockholm_users.csv'
earliest_users = get_earliest_users(filename)

print(earliest_users)

['kallepersson', 'pirelenito', 'dalen', 'torkelo', 'possan']


In [36]:
import csv
from collections import Counter

def get_top_licenses(filename):
    """Reads a CSV file, counts license occurrences, and returns the top 3 most popular licenses.

    Args:
        filename: The path to the CSV file.

    Returns:
        A list of the top 3 most popular license names.
    """

    license_counts = Counter()
    with open(filename, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            license_name = row['license_name']
            if license_name:
                license_counts[license_name] += 1

    # Get the top 3 most common licenses
    top_licenses = license_counts.most_common(3)
    return [license[0] for license in top_licenses]

# Assuming 'users.csv' is in the same directory
filename = 'repositories.csv'
top_licenses = get_top_licenses(filename)

print(top_licenses)

['MIT License', 'Apache License 2.0', 'Other']


In [38]:
import csv
import datetime

def get_earliest_users(filename):
    """Reads a CSV file, sorts users by creation date, and returns the top 5 earliest users from Stockholm.

    Args:
        filename: The path to the CSV file.

    Returns:
        A list of the top 5 earliest users' logins.
    """

    with open(filename, 'r') as file:
        reader = csv.DictReader(file)
        users = []
        for row in reader:
            if row['location'] == 'Stockholm':
                user_data = {
                    'login': row['login'],
                    'created_at': datetime.datetime.strptime(row['created_at'], '%Y-%m-%dT%H:%M:%SZ')
                }
                users.append(user_data)

    # Sort users by creation date
    users.sort(key=lambda x: x['created_at'])

    # Return the top 5 earliest users' logins
    return [user['login'] for user in users[:5]]

# Assuming 'stockholm_users.csv' is in the same directory
filename = 'stockholm_users.csv'
earliest_users = get_earliest_users(filename)

print(earliest_users)

['kallepersson', 'pirelenito', 'dalen', 'torkelo', 'possan']


In [41]:
import csv
from collections import Counter

def get_most_common_company(filename):
    """Reads a CSV file, counts company occurrences, and returns the most common company.

    Args:
        filename: The path to the CSV file.

    Returns:
        The most common company name.
    """

    company_counts = Counter()
    with open(filename, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            company = row['company']
            if company:
                company_counts[company] += 1

    # Get the most common company
    most_common_company, _ = company_counts.most_common(1)[0]
    return most_common_company

# Assuming 'stockholm_users.csv' is in the same directory
filename = 'users.csv'
most_common_company = get_most_common_company(filename)

print(most_common_company)

SPOTIFY


In [40]:
import csv
from collections import Counter

def get_most_popular_language(filename):
    """Reads a CSV file, counts language occurrences, and returns the most popular language.

    Args:
        filename: The path to the CSV file.

    Returns:
        The most popular language name.
    """

    language_counts = Counter()
    with open(filename, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            language = row['language']
            if language:
                language_counts[language] += 1

    # Get the most common language
    most_common_language, _ = language_counts.most_common(1)[0]
    return most_common_language

# Assuming 'repositories.csv' is in the same directory
filename = 'repositories.csv'
most_popular_language = get_most_popular_language(filename)

print(most_popular_language)

JavaScript


In [42]:
import csv

def get_language_with_highest_avg_stars(filename):
    """Reads a CSV file, calculates the average stars per language, and returns the language with the highest average.

    Args:
        filename: The path to the CSV file.

    Returns:
        The language with the highest average stars per repository.
    """

    language_stars = {}
    language_counts = {}

    with open(filename, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            language = row['language']
            stars = int(row['stargazers_count'])

            if language:
                language_stars[language] = language_stars.get(language, 0) + stars
                language_counts[language] = language_counts.get(language, 0) + 1

    # Calculate average stars per language
    language_avg_stars = {}
    for language, total_stars in language_stars.items():
        count = language_counts[language]
        avg_stars = total_stars / count
        language_avg_stars[language] = avg_stars

    # Find the language with the highest average stars
    max_avg_language = max(language_avg_stars, key=language_avg_stars.get)
    return max_avg_language

# Assuming 'repositories.csv' is in the same directory
filename = 'repositories.csv'
most_popular_language = get_language_with_highest_avg_stars(filename)

print(most_popular_language)

RAML


In [43]:
import csv
from collections import Counter
from datetime import datetime

def get_second_most_popular_language_after_2020(filename):
    """Reads a CSV file, counts language occurrences for users who joined after 2020, and returns the second most popular language.

    Args:
        filename: The path to the CSV file.

    Returns:
        The second most popular language name.
    """

    language_counts = Counter()
    with open(filename, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            created_at = datetime.strptime(row['created_at'], '%Y-%m-%dT%H:%M:%SZ')
            if created_at.year > 2020:
                language = row['language']
                if language:
                    language_counts[language] += 1

    # Get the top 2 most common languages
    top_languages = language_counts.most_common(2)

    # Return the second most common language
    return top_languages[1][0]

# Assuming 'repositories.csv' is in the same directory
filename = 'repositories.csv'
second_most_popular_language = get_second_most_popular_language_after_2020(filename)

print(second_most_popular_language)

TypeScript


In [44]:
import csv

def get_top_leaders(filename):
    """Reads a CSV file, calculates leader strength for each user, and returns the top 5 leaders.

    Args:
        filename: The path to the CSV file.

    Returns:
        A list of the top 5 leaders' logins.
    """

    users = []
    with open(filename, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            user_data = {
                'login': row['login'],
                'followers': int(row['followers']),
                'following': int(row['following']),
            }
            users.append(user_data)

    # Calculate leader strength for each user
    for user in users:
        user['leader_strength'] = user['followers'] / (1 + user['following'])

    # Sort users by leader strength in descending order
    users.sort(key=lambda x: x['leader_strength'], reverse=True)

    # Return the top 5 leaders' logins
    return [user['login'] for user in users[:5]]

# Assuming 'users.csv' is in the same directory
filename = 'users.csv'
top_leaders = get_top_leaders(filename)

print(top_leaders)

['spotify', 'Mojang', 'fornwall', 'joearms', 'EmbarkStudios']


In [45]:
import pandas as pd
import numpy as np

def calculate_correlation(filename):
    """Reads a CSV file, calculates the correlation between followers and public repositories, and prints the result.

    Args:
        filename: The path to the CSV file.
    """

    df = pd.read_csv(filename)

    # Filter for Stockholm users
    df = df[df['location'] == 'Stockholm']

    # Calculate the correlation coefficient
    correlation = df['followers'].corr(df['public_repos'])

    print("Correlation between followers and public repositories:", correlation)

# Assuming 'users.csv' is in the same directory
filename = 'users.csv'
calculate_correlation(filename)

Correlation between followers and public repositories: 0.150284414318553


In [46]:
import random

def calculate_correlation(followers, public_repos):
  """
  Calculates the Pearson correlation coefficient between two lists of data.

  Args:
      followers: A list of follower counts for each user.
      public_repos: A list of public repository counts for each user.

  Returns:
      The Pearson correlation coefficient between the two lists.
  """

  if len(followers) != len(public_repos):
    raise ValueError("Lists must be of the same length")

  n = len(followers)
  sum_followers = sum(followers)
  sum_repos = sum(public_repos)
  sum_followers_squared = sum([x**2 for x in followers])
  sum_repos_squared = sum([x**2 for x in public_repos])
  product_sum = sum([followers[i] * public_repos[i] for i in range(n)])

  # Calculate the numerator and denominator for the correlation coefficient
  numerator = product_sum - (sum_followers * sum_repos) / n
  denominator = (
      ((sum_followers_squared - (sum_followers**2) / n) * (sum_repos_squared - (sum_repos**2) / n))
      **0.5
  )

  # Handle the case where the denominator is close to zero (avoid division by zero)
  if abs(denominator) < 1e-10:
    return 0.0

  # Calculate the correlation coefficient
  correlation = numerator / denominator
  return correlation

# Simulate some data for users in Stockholm
followers = [random.randint(100, 10000) for _ in range(100)]
public_repos = [random.randint(1, 100) for _ in range(100)]

# Calculate the correlation coefficient
correlation = calculate_correlation(followers, public_repos)

print(f"Correlation between followers and public repositories: {correlation:.2f}")

Correlation between followers and public repositories: 0.30


In [53]:
import pandas as pd
import numpy as np

def calculate_correlation(filename):
    """Reads a CSV file, calculates the correlation between followers and public repositories, and prints the result.

    Args:
        filename: The path to the CSV file.
    """

    df = pd.read_csv(filename)

    # Filter for Stockholm users
    # df = df[df['location'] == 'Stockholm']

    # Calculate the correlation coefficient
    correlation = df['followers'].corr(df['public_repos'])

    print("Correlation between followers and public repositories:", correlation)

# Assuming 'users.csv' is in the same directory
filename = 'stockholm_users.csv'
calculate_correlation(filename)

Correlation between followers and public repositories: 0.03322019270996551


In [48]:
import pandas as pd

def calculate_correlation(filename):
    """Calculates the correlation between followers and public repositories for Stockholm users.

    Args:
        filename: The path to the CSV file.

    Returns:
        The correlation coefficient.
    """

    df = pd.read_csv(filename)

    # Filter for Stockholm users
    df_stockholm = df[df['location'] == 'Stockholm']

    # Calculate the correlation coefficient
    correlation = df_stockholm['followers'].corr(df_stockholm['public_repos'])

    return correlation

# Assuming 'users.csv' is in the same directory
filename = 'users.csv'
correlation_coefficient = calculate_correlation(filename)

print("Correlation between followers and public repositories:", round(correlation_coefficient, 3))

Correlation between followers and public repositories: 0.15


In [52]:
import pandas as pd
import statsmodels.api as sm

def calculate_regression_slope(filename):
    """Calculates the regression slope of followers on public repositories for all users.

    Args:
        filename: The path to the CSV file.

    Returns:
        The regression slope.
    """

    df = pd.read_csv(filename)

    # Perform linear regression on all users (no filtering)
    X = df[['public_repos']]  # Select 'public_repos' for all users
    y = df['followers']      # Select 'followers' for all users
    X = sm.add_constant(X)    # Add a constant term
    model = sm.OLS(y, X).fit() # Fit the regression model

    # Extract the slope coefficient
    slope = model.params['public_repos']

    return slope

# Assuming 'users.csv' is in the same directory
filename = 'users.csv'
slope = calculate_regression_slope(filename)

print("Regression slope of followers on repos (all users):", round(slope, 3))

Regression slope of followers on repos (all users): 0.217


In [54]:
import pandas as pd

def get_earliest_users(filename):
    """Gets the 5 earliest registered GitHub users.

    Args:
        filename: The path to the CSV file.

    Returns:
        A list of the top 5 earliest users' logins.
    """

    df = pd.read_csv(filename)

    # Sort the DataFrame by the 'created_at' column in ascending order
    df_sorted = df.sort_values(by='created_at')

    # Get the top 5 earliest users' logins
    top_5_earliest_users = df_sorted['login'].head(5).tolist()

    return top_5_earliest_users

# Assuming 'users.csv' is in the same directory
filename = 'users.csv'
earliest_users = get_earliest_users(filename)

print(earliest_users)

['Mange', 'kallepersson', 'fesplugas', 'etnt', 'pirelenito']


In [56]:
import pandas as pd
from scipy.stats import chi2_contingency

def analyze_projects_and_wikis(filename):
    """Calculates the correlation between projects and wikis.

    Args:
        filename: The path to the CSV file.

    Returns:
        The correlation coefficient.
    """

    df = pd.read_csv(filename)

    # Create a contingency table
    contingency_table = pd.crosstab(df['has_projects'], df['has_wiki'])

    # Calculate the chi-square statistic and p-value
    chi2, p, dof, expected = chi2_contingency(contingency_table)

    # Calculate the correlation using the chi-square statistic
    correlation = np.sqrt(chi2 / (len(df) - 1))

    return correlation

# Assuming 'repositories.csv' is in the same directory
filename = 'repositories.csv'
correlation = analyze_projects_and_wikis(filename)

print("Correlation between projects and wikis:", round(correlation, 3))

Correlation between projects and wikis: 0.374


In [57]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency # Import the missing function

def analyze_projects_and_wikis(filename):
    df = pd.read_csv(filename)

    # Create a contingency table
    contingency_table = pd.crosstab(df['has_projects'], df['has_wiki'])

    # Calculate the correlation using the chi-square test
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    correlation = np.sqrt(chi2 / (len(df) - 1))

    print("Correlation between projects and wikis:", correlation)

    # Analyze the contingency table
    print(contingency_table)

# Assuming 'repositories.csv' is in the same directory
filename = 'repositories.csv'
analyze_projects_and_wikis(filename)

Correlation between projects and wikis: 0.37394896700150726
has_wiki      False  True 
has_projects              
False          1072     49
True           5060  29186


In [64]:


import pandas as pd
import statsmodels.api as sm

def analyze_bio_length_and_followers(filename):
    """Analyzes the relationship between bio length and followers.

    Args:
        filename: The path to the CSV file.

    Returns:
        The regression slope of followers on bio word count.
    """

    df = pd.read_csv(filename)

    # Calculate the word count for each bio
    df['bio_word_count'] = df['bio'].apply(lambda x: len(str(x).split()))

    # Perform linear regression
    X = df[['bio_word_count']]
    y = df['followers']
    X = sm.add_constant(X)  # Add a constant term
    model = sm.OLS(y, X).fit()

    # Extract the slope coefficient
    slope = model.params['bio_word_count']

    return slope

# Assuming 'users.csv' is in the same directory
filename = 'users.csv'
slope = analyze_bio_length_and_followers(filename)

print("Regression slope of followers on bio word count:", round(slope, 3))

Regression slope of followers on bio word count: 2.079


In [62]:
import pandas as pd
import statsmodels.api as sm

def analyze_bio_length_and_followers(filename):
    """Analyzes the relationship between bio length and followers.

    Args:
        filename: The path to the CSV file.

    Returns:
        The regression slope of followers on bio word count.
    """

    df = pd.read_csv(filename)

    # Calculate the word count for each bio
    df['bio_word_count'] = df['bio'].apply(lambda x: len(str(x).split()))

    # Perform linear regression
    X = df[['bio_word_count']]
    y = df['followers']
    X = sm.add_constant(X)  # Add a constant term
    model = sm.OLS(y, X).fit()

    # Extract the slope coefficient
    slope = model.params['bio_word_count']

    return slope

# Assuming 'users.csv' is in the same directory
filename = 'users.csv'
slope = analyze_bio_length_and_followers(filename)

print("Regression slope of followers on bio word count:", round(slope, 3))

Regression slope of followers on bio word count: 2.079


In [65]:
import pandas as pd
import statsmodels.api as sm

def analyze_bio_length_and_followers(filename):
    """Analyzes the relationship between bio length and followers for users with bios.

    Args:
        filename: The path to the CSV file.

    Returns:
        The regression slope of followers on bio word count.
    """

    df = pd.read_csv(filename)

    # Filter out users without bios
    df = df[df['bio'].notnull()]

    # Calculate the word count for each bio
    df['bio_word_count'] = df['bio'].apply(lambda x: len(str(x).split()))

    # Perform linear regression
    X = df[['bio_word_count']]
    y = df['followers']
    X = sm.add_constant(X)  # Add a constant term
    model = sm.OLS(y, X).fit()

    # Extract the slope coefficient
    slope = model.params['bio_word_count']

    return slope

# Assuming 'users.csv' is in the same directory
filename = 'users.csv'
slope = analyze_bio_length_and_followers(filename)

print("Regression slope of followers on bio word count:", round(slope, 3))


Regression slope of followers on bio word count: 6.574


In [66]:
import pandas as pd
import datetime

def get_top_weekend_contributors(filename):
    """Identifies the top 5 users who created the most repositories on weekends.

    Args:
        filename: The path to the CSV file.

    Returns:
        A list of the top 5 users' logins.
    """

    df = pd.read_csv(filename)

    # Convert 'created_at' to datetime objects
    df['created_at'] = pd.to_datetime(df['created_at'])

    # Determine if the repository was created on a weekend
    df['is_weekend'] = df['created_at'].dt.weekday.isin([5, 6])

    # Group by user and count the number of weekend repositories
    weekend_repos = df[df['is_weekend']].groupby('login').size()

    # Sort the results and get the top 5
    top_contributors = weekend_repos.sort_values(ascending=False).head(5)

    return top_contributors.index.tolist()

# Assuming 'repositories.csv' is in the same directory
filename = 'repositories.csv'
top_weekend_contributors = get_top_weekend_contributors(filename)

print(top_weekend_contributors)

['HaraldNordgren', 'Nyholm', 'lydell', 'LinusU', 'leostera']


In [70]:
import pandas as pd

def compare_hireable_following(filename):
    """Compares the average 'following' for hireable and non-hireable users.

    Args:
        filename: The path to the CSV file.

    Returns:
        The difference in average 'following' between hireable and non-hireable users.
    """

    df = pd.read_csv(filename)

    # Assuming 'hireable' might have missing values, fill them with False
    df['hireable'] = df['hireable'].fillna(False)

    # Assuming 'hireable' is a boolean column
    hireable_avg_following = df[df['hireable']]['following'].mean()
    non_hireable_avg_following = df[~df['hireable']]['following'].mean()

    return round(hireable_avg_following - non_hireable_avg_following, 3)

# Assuming 'users.csv' is in the same directory
filename = 'users.csv'
following_difference = compare_hireable_following(filename)

print("Difference in average following between hireable and non-hireable users:", following_difference)

Difference in average following between hireable and non-hireable users: 48.589


  df['hireable'] = df['hireable'].fillna(False)


In [71]:
import pandas as pd
from collections import Counter

def get_most_common_surname(filename):
    """Gets the most common surname among users.

    Args:
        filename: The path to the CSV file.

    Returns:
        The most common surname(s).
    """

    df = pd.read_csv(filename)

    # Extract the last name from the 'name' column
    df['surname'] = df['name'].apply(lambda x: x.strip().split()[-1] if isinstance(x, str) else None)

    # Count the occurrences of each surname
    surname_counts = Counter(df['surname'].dropna())

    # Get the most common surnames
    most_common_surnames = [surname for surname, count in surname_counts.most_common() if count == surname_counts.most_common(1)[0][1]]

    return most_common_surnames

# Assuming 'users.csv' is in the same directory
filename = 'users.csv'
most_common_surnames = get_most_common_surname(filename)

print(most_common_surnames)

['Persson', 'Gustafsson']


In [72]:
import pandas as pd
from scipy.stats import chi2_contingency

def analyze_projects_and_wikis(filename):
    """Calculates the correlation between projects and wikis.

    Args:
        filename: The path to the CSV file.

    Returns:
        The correlation coefficient.
    """

    df = pd.read_csv(filename)

    # Create a contingency table
    contingency_table = pd.crosstab(df['has_projects'], df['has_wiki'])

    # Calculate the chi-square statistic and p-value
    chi2, p, dof, expected = chi2_contingency(contingency_table)

    # Calculate the correlation using the chi-square statistic
    correlation = np.sqrt(chi2 / (len(df) - 1))

    return correlation

# Assuming 'repositories.csv' is in the same directory
filename = 'repositories.csv'
correlation = analyze_projects_and_wikis(filename)

print("Correlation between projects and wikis:", round(correlation, 3))

Correlation between projects and wikis: 0.374


In [74]:
import pandas as pd

def compare_hireable_following(filename):
    """Compares the average 'following' for hireable and non-hireable users.

    Args:
        filename: The path to the CSV file.

    Returns:
        The difference in average 'following' between hireable and non-hireable users.
    """

    df = pd.read_csv(filename)

    # Assuming 'hireable' might have missing values, fill them with False
    df['hireable'] = df['hireable'].fillna(False)

    # Assuming 'hireable' is a boolean column
    hireable_avg_following = df[df['hireable']]['following'].mean()
    non_hireable_avg_following = df[~df['hireable']]['following'].mean()

    return round(hireable_avg_following - non_hireable_avg_following, 3)

# Assuming 'users.csv' is in the same directory
filename = 'users.csv'
following_difference = compare_hireable_following(filename)

print("Difference in average following between hireable and non-hireable users:", following_difference)

Difference in average following between hireable and non-hireable users: 48.589


  df['hireable'] = df['hireable'].fillna(False)
