In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

In [None]:
# Load the exam results once; the cells below all read from this frame.
csv_path = 'datasets/natural_science_students.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Number of missing entries in each column.
df.isnull().sum()

# Top Scorers

In [None]:
# Per-subject score columns, pulled out of the frame once for the plotting cells.
(math_score, literature_score, english_score,
 physics_score, chemistry_score, biology_score) = (
    df[column] for column in
    ('math', 'literature', 'english', 'physics', 'chemistry', 'biology')
)

# Combination code -> the three subjects whose scores are added together.
combinations = dict(
    A00=['math', 'physics', 'chemistry'],
    A01=['math', 'english', 'physics'],
    D01=['math', 'english', 'literature'],
    A02=['math', 'physics', 'biology'],
    B00=['math', 'biology', 'chemistry'],
)

In [None]:
def calculate_statistics(series):
    """Summarise one column of scores with basic descriptive statistics.

    Parameters
    ----------
    series : pd.Series
        Numeric values, e.g. the scores of a single subject.

    Returns
    -------
    pd.Series
        Indexed by 'Mean', 'Median', 'Mode', 'Variance', 'Std Dev', 'Range',
        'Highest' and 'Lowest'. Variance and standard deviation use the
        pandas defaults. 'Mode' is the first value returned by
        ``Series.mode``; it is NaN when the series has no non-missing
        values, like the other statistics.
    """
    modes = series.mode()
    highest = series.max()
    lowest = series.min()

    return pd.Series({
        'Mean': series.mean(),
        'Median': series.median(),
        # mode() is empty for an empty or all-NaN column; indexing it blindly
        # would raise IndexError and abort the whole summary table.
        'Mode': modes.iloc[0] if not modes.empty else float('nan'),
        'Variance': series.var(),
        'Std Dev': series.std(),
        'Range': highest - lowest,
        'Highest': highest,
        'Lowest': lowest
    })
    
# Build the summary table: one column of statistics per subject,
# then flip it so that each subject becomes a row when printed.
subjects = ['math', 'literature', 'english', 'physics', 'chemistry', 'biology']
stats_df = df.loc[:, subjects].apply(calculate_statistics)

print(stats_df.transpose())

In [None]:
def count_students_with_score(df, subject, score):
    """Report how many rows of ``df`` have exactly ``score`` in ``subject``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a column named ``subject``.
    subject : str
        Column to inspect; it is capitalised in the returned sentence.
    score : float
        Value compared against the column with ``==``.

    Returns
    -------
    str
        A ready-to-print sentence stating the count (not the bare number).
    """
    # A boolean mask sums to the number of matching rows.
    is_match = df[subject] == score
    n_students = int(is_match.sum())

    return f"There are {n_students} students with a score of {score} in {subject.capitalize()}."

# count_students_with_score returns a formatted sentence, not a number,
# so the result is named for what it holds.
math_score_summary = count_students_with_score(df, 'math', 9.8)
print(math_score_summary)

In [None]:
def find_highest_score(df, subject):
    """Return every row of ``df`` that holds the top score in ``subject``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a column named ``subject``.
    subject : str
        Column whose maximum is looked up.

    Returns
    -------
    pd.DataFrame
        All rows tied for the highest value, in their original row order.
        The result is empty when ``df`` has no rows or the column holds
        only missing values.
    """
    subject_scores = df[subject]

    # max() skips missing values and needs no sort of the whole frame; on an
    # empty column it yields NaN instead of raising like ``.iloc[0]`` would.
    highest_score = subject_scores.max()

    # NaN never compares equal, so an empty/all-NaN column selects no rows.
    return df[subject_scores == highest_score]

# The query is for 'math', so the result is named after that subject.
top_math_students = find_highest_score(df, 'math')
print(top_math_students)

In [None]:
def calculate_score(row, combination):
    """Add up the values stored in ``row`` under each key in ``combination``.

    ``row`` is anything indexable by subject name (a DataFrame row, a dict).
    An empty ``combination`` gives 0.
    """
    total = 0
    for subject_name in combination:
        total = total + row[subject_name]
    return total

def rank_students(df, combination):
    """Rank every student by their total score over ``combination``.

    The input frame is left untouched: totals and ranks are computed on
    temporary Series instead of being written back as new columns.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a 'student_id' column and one column per subject in
        ``combination``.
    combination : list of str
        Subject columns whose scores are added to form the total.

    Returns
    -------
    rankings : dict
        Maps each student_id to its rank as a float. Rank 1 is the highest
        total, and ties share the lowest rank of their group
        (``method='min'``). A student with a missing score in any subject
        gets a NaN rank.
    rank_count : collections.defaultdict
        Maps each rank to the number of students holding it; ranks that do
        not occur read as 0.

    Raises
    ------
    ValueError
        If any subject in ``combination`` is not a column of ``df``.
    """
    # Check if all required columns are present
    missing_columns = [col for col in combination if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing columns in DataFrame: {missing_columns}")

    # Add the subject columns one after another. This gives the same totals as
    # summing each row's scores in combination order (NaN propagates too),
    # but in whole-column operations rather than a Python call per row.
    scores = pd.Series(0.0, index=df.index)
    for subject in combination:
        scores = scores + df[subject]

    ranks = scores.rank(method='min', ascending=False)

    # Count students with the same rank
    rank_count = defaultdict(int)
    for rank in ranks:
        rank_count[rank] += 1

    rankings = dict(zip(df['student_id'].tolist(), ranks.tolist()))
    return rankings, rank_count

def get_student_rank(student_id, combination_code):
    """Describe a student's rank within one subject combination.

    Looks up ``combination_code`` in the notebook-level ``combinations``
    mapping, ranks the notebook-level ``df`` with ``rank_students`` and
    reports where ``student_id`` landed.

    Parameters
    ----------
    student_id : str or int
        Identifier to look up. It is converted to the type of the IDs
        returned by ``rank_students`` when possible, so a string such as
        ``'2005436'`` can match an integer ID.
    combination_code : str
        A key of ``combinations``, e.g. ``'A01'``.

    Returns
    -------
    str
        A human-readable message: the rank and how many students share it,
        or an explanation of why no rank can be reported. Problems are
        described in the returned text rather than raised.
    """
    # Check if the combination code is valid
    if combination_code not in combinations:
        return f"Invalid combination code: {combination_code}. Available codes are: {list(combinations.keys())}"

    combination = combinations[combination_code]

    try:
        rankings, rank_count = rank_students(df, combination)

        # With no students at all there is no ID type to convert to and
        # nothing to find.
        if not rankings:
            return f"Student ID {student_id} not found. Available IDs: {list(rankings.keys())[:5]}..."

        # Convert student_id to the same type as in the rankings
        id_type = type(next(iter(rankings)))
        try:
            converted_student_id = id_type(student_id)
        except (TypeError, ValueError):
            # An ID that cannot be converted (e.g. letters against integer
            # IDs) cannot match anyone, so it is reported as "not found" below.
            converted_student_id = student_id

        if converted_student_id not in rankings:
            return f"Student ID {student_id} not found. Available IDs: {list(rankings.keys())[:5]}..."

        raw_rank = rankings[converted_student_id]
        # A NaN rank cannot be turned into an int; say why instead of
        # surfacing the conversion error.
        if pd.isna(raw_rank):
            return f"Student ID {student_id} has no rank for combination {combination_code}: " \
                   f"the combined score could not be computed (missing subject score)."

        student_rank = int(raw_rank)
        students_with_same_rank = rank_count[student_rank]
        return f"Your rank for combination {combination_code} is: {student_rank}. " \
               f"There are {students_with_same_rank} student(s) with the same rank as you."
    except Exception as e:
        # Deliberate catch-all: this helper always answers with a message.
        return f"An error occurred: {str(e)}"
# Example lookup: one student's standing in the A01 combination.
print(get_student_rank(student_id='2005436', combination_code='A01'))

# Scores Breakdown by Subjects

In [None]:
def plot_subject_histogram(scores, subject):
    """Draw a histogram of one subject's scores with the count above each bar.

    Scores are snapped to the bin width (0.2 for math and English, 0.25 for
    any other subject) before binning, and bars are centred on the score
    they represent.

    Parameters
    ----------
    scores : pd.Series or np.ndarray
        Scores for one subject; the axis is fixed to the range 0-10.
    subject : str
        Subject name. It selects the bin width (case-insensitively) and is
        capitalised in the title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(12, 8))

    # Determine the interval based on the subject
    if subject.lower() in ['math', 'english']:
        interval = 0.2
    else:
        interval = 0.25

    # Round scores to nearest interval to address floating-point imprecision
    rounded_scores = np.round(scores / interval) * interval

    # Create bins with a slightly larger range to ensure all data points are included
    bins = np.arange(0, 10.01 + interval, interval)

    # Create histogram with custom bins
    counts, bins, patches = plt.hist(rounded_scores, bins=bins, edgecolor='black', align='left')

    # Scale the gap between bar and label with the data. A fixed offset in
    # count units only suits one dataset size and pushes the labels outside
    # the axes when the bars are short.
    tallest = counts.max()
    label_gap = 0.02 * tallest

    # Add count labels above each bar
    for left_edge, bar_count in zip(bins, counts):
        if bar_count > 0:
            plt.text(left_edge, bar_count + label_gap, f'{int(bar_count)}',
                     ha='center', va='bottom', rotation=90)

    plt.title(f'Histogram of {subject.capitalize()} Scores', fontsize=16)
    plt.xlabel('Score', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.xlim(0, 10)
    # Extend y-axis to make room for vertical labels. Skipped when nothing
    # was counted, because identical lower/upper limits are not a valid range.
    if tallest > 0:
        plt.ylim(0, tallest * 1.3)

    # Set x-axis ticks to show all intervals
    plt.xticks(np.arange(0, 10.01, interval), rotation=90, ha='center')

    # Adjust layout to prevent cutting off labels
    plt.tight_layout()

    # Add grid lines
    plt.grid(True, alpha=0.3)
    plt.grid(which='minor', alpha=0.2)

    plt.show()

In [None]:
# Math is binned in 0.2-point steps.
plot_subject_histogram(scores=math_score, subject='math')

In [None]:
# Literature is binned in 0.25-point steps.
plot_subject_histogram(scores=literature_score, subject='literature')

In [None]:
# English is binned in 0.2-point steps.
plot_subject_histogram(scores=english_score, subject='english')

In [None]:
# Physics is binned in 0.25-point steps.
plot_subject_histogram(scores=physics_score, subject='physics')

In [None]:
# Chemistry is binned in 0.25-point steps.
plot_subject_histogram(scores=chemistry_score, subject='chemistry')

In [None]:
# Biology is binned in 0.25-point steps.
plot_subject_histogram(scores=biology_score, subject='biology')

# Scores Breakdown by Combinations

In [None]:
def plot_combination_histogram(x1, x2, x3, subject1, subject2, subject3):
    """Draw a histogram of three subjects' summed scores, with bar counts.

    The three score collections are added element-wise, snapped to
    0.2-point steps and binned over 0-30 (10 points max per subject, 3
    subjects). Bars are centred on the total they represent.

    Parameters
    ----------
    x1, x2, x3 : pd.Series or np.ndarray
        Scores for the three subjects; they must be addable with ``+``.
    subject1, subject2, subject3 : str
        Subject names, capitalised in the title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(15, 10))

    # Calculate the sum of scores for each student
    combined_scores = x1 + x2 + x3

    # Set the interval for bars to 0.2
    bar_interval = 0.2

    # Round scores to nearest bar interval to address floating-point imprecision
    rounded_scores = np.round(combined_scores / bar_interval) * bar_interval

    max_possible_score = 30  # 10 points max per subject, 3 subjects

    # Build the edges as integer multiples of the bar width, with one edge
    # past the maximum. A float-stepped np.arange may or may not emit that
    # last edge depending on rounding; without it the final bin is
    # [29.8, 30.0] and totals of 29.8 and 30.0 are drawn as a single bar.
    n_bins = int(round(max_possible_score / bar_interval)) + 1
    bins = np.arange(n_bins + 1) * bar_interval

    # Create histogram with custom bins
    counts, bins, patches = plt.hist(rounded_scores, bins=bins, edgecolor='black', align='left')

    # Scale the gap between bar and label with the data. A fixed offset in
    # count units only suits one dataset size and pushes the labels outside
    # the axes when the bars are short.
    tallest = counts.max()
    label_gap = 0.02 * tallest

    # Add count labels above each bar, vertically
    for left_edge, bar_count in zip(bins, counts):
        if bar_count > 0:
            plt.text(left_edge, bar_count + label_gap, str(int(bar_count)),
                     ha='center', va='bottom', rotation=90)

    plt.title(f'Histogram of Combined {subject1.capitalize()}, {subject2.capitalize()}, and {subject3.capitalize()} Scores', fontsize=16)
    plt.xlabel('Combined Score', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.xlim(0, max_possible_score)
    # Extend y-axis to make room for vertical labels. Skipped when nothing
    # was counted, because identical lower/upper limits are not a valid range.
    if tallest > 0:
        plt.ylim(0, tallest * 1.3)

    # Set x-axis ticks to show intervals of 0.5
    label_interval = 0.5
    x_ticks = np.arange(0, max_possible_score + label_interval, label_interval)
    plt.xticks(x_ticks, rotation=45, ha='right')

    # Adjust layout to prevent cutting off labels
    plt.tight_layout()

    # Add grid lines
    plt.grid(True, alpha=0.3)
    plt.grid(which='minor', alpha=0.2)

    plt.show()

In [None]:
# A00: math + physics + chemistry. The plotting function only draws the
# figure and returns nothing, so its result is not stored.
plot_combination_histogram(math_score, physics_score, chemistry_score, 'math', 'physics', 'chemistry')

In [None]:
# A01: math + physics + english. The plotting function only draws the
# figure and returns nothing, so its result is not stored.
plot_combination_histogram(math_score, physics_score, english_score, 'math', 'physics', 'english')

In [None]:
# B00: math + biology + chemistry. The plotting function only draws the
# figure and returns nothing, so its result is not stored.
plot_combination_histogram(math_score, biology_score, chemistry_score, 'math', 'biology', 'chemistry')

In [None]:
# D01: math + literature + english. The plotting function only draws the
# figure and returns nothing, so its result is not stored.
plot_combination_histogram(math_score, literature_score, english_score, 'math', 'literature', 'english')

# That's the end!