In [1]:
# Importing required libraries
import numpy as np
import pandas as pd

# Step 1: Create synthetic data using NumPy
# Seed NumPy's legacy global random generator so every run draws the same scores.
np.random.seed(42)

# Row labels (students) and column labels (subjects) for the score matrix
names = ['Alice', 'Bob', 'Charlie', 'David', 'Eve']  # List of student names
subjects = ['Math', 'Science', 'English', 'Bangla']  # List of subjects

# Generating random scores for students in each subject
# randint's upper bound is exclusive, so every score is an integer from 50 to 99.
# The matrix shape is derived from the two lists (one row per student, one column
# per subject) so it cannot drift out of sync when a name or subject is added.
scores = np.random.randint(50, 100, size=(len(names), len(subjects)))

# Displaying the generated data
print('Names:', names)
print('Subjects:', subjects)
print('Scores (raw data):\n', scores)

Names: ['Alice', 'Bob', 'Charlie', 'David', 'Eve']
Subjects: ['Math', 'Science', 'English', 'Bangla']
Scores (raw data):
 [[88 78 64 92]
 [57 70 88 68]
 [72 60 60 73]
 [85 89 73 52]
 [71 51 73 93]]


In [2]:
# Step 2: Create a DataFrame with Pandas
# Wrap the raw score matrix in a labelled table:
# one row per student (index) and one column per subject.
df = pd.DataFrame(
    data=scores,
    columns=subjects,
    index=names,
)
print("\nInitial DataFrame:\n", df)


Initial DataFrame:
          Math  Science  English  Bangla
Alice      88       78       64      92
Bob        57       70       88      68
Charlie    72       60       60      73
David      85       89       73      52
Eve        71       51       73      93


In [3]:
# Step 3: Calculate statistics
# Calculate the average score for each student.
# Only the subject columns are averaged: df later gains derived columns
# ('Average', and the text column 'Performance'), and selecting the subjects
# explicitly keeps this cell correct if it is executed again afterwards.
# axis=1 means the mean is taken across columns (i.e., per row/student).
average_scores = df[subjects].mean(axis=1)

# Add a new column 'Average' to the DataFrame to store the average scores
df['Average'] = average_scores
print("\nDataFrame with Average Scores:\n", df)


DataFrame with Average Scores:
          Math  Science  English  Bangla  Average
Alice      88       78       64      92    80.50
Bob        57       70       88      68    70.75
Charlie    72       60       60      73    66.25
David      85       89       73      52    74.75
Eve        71       51       73      93    72.00


In [4]:
# Step 4: Identify top performers
# The DataFrame index holds the student names, so idxmax() on the 'Average'
# column yields the name of the student whose average is the largest.
top_performer = df.loc[:, 'Average'].idxmax()
print("\nTop Performer:", top_performer)


Top Performer: Alice


In [5]:
# Step 5: Normalize scores
# Min-max normalization rescales data to the range 0 to 1:
#     normalized_value = (value - min_value) / (max_value - min_value)
# min() and max() are called without an axis, so the minimum and maximum are
# taken over the whole score matrix (all students and subjects together),
# not separately per subject.
lowest_score = scores.min()
score_range = scores.max() - lowest_score
normalized_scores = (scores - lowest_score) / score_range

# Label the normalized matrix with the same students (rows) and subjects (columns)
df_normalized = pd.DataFrame(
    data=normalized_scores,
    columns=subjects,
    index=names,
)
print("\nNormalized DataFrame (scores scaled to 0-1):\n", df_normalized)


Normalized DataFrame (scores scaled to 0-1):
              Math   Science   English    Bangla
Alice    0.880952  0.642857  0.309524  0.976190
Bob      0.142857  0.452381  0.880952  0.404762
Charlie  0.500000  0.214286  0.214286  0.523810
David    0.809524  0.904762  0.523810  0.023810
Eve      0.476190  0.000000  0.523810  1.000000


In [6]:
# Step 6: Save processed data to a CSV file (currently disabled)
# The export is left commented out, so this cell writes no file as it stands.
# Uncomment the two lines below to save df (including averages) to disk.
# df.to_csv('student_scores.csv', index=True)  # index=True includes the student names
# print("\nProcessed data saved to 'student_scores.csv'.")

# Advanced Step: Transpose DataFrame (swap rows and columns)
# Build a second table from the transposed raw score matrix:
# subjects become the index (rows) and students become the columns.
df_transposed = pd.DataFrame(
    data=scores.T,
    columns=names,
    index=subjects,
)
print("\nDataFrame with Subjects as Index:\n", df_transposed)


DataFrame with Subjects as Index:
          Alice  Bob  Charlie  David  Eve
Math        88   57       72     85   71
Science     78   70       60     89   51
English     64   88       60     73   73
Bangla      92   68       73     52   93


In [7]:
# Accessing specific data in the transposed DataFrame
# Example 1: Students are columns in this layout, so selecting the 'Bob'
# column returns all of Bob's scores, indexed by subject.
bob_scores = df_transposed.loc[:, 'Bob']
print("\nScores for Bob:\n", bob_scores)


Scores for Bob:
 Math       57
Science    70
English    88
Bangla     68
Name: Bob, dtype: int32


In [8]:
# Example 2: Subjects are rows in this layout, so selecting the 'Math' row
# returns every student's Math score, indexed by student name.
math_scores = df_transposed.loc['Math', :]
print("\nScores in Math:\n", math_scores)


Scores in Math:
 Alice      88
Bob        57
Charlie    72
David      85
Eve        71
Name: Math, dtype: int32


In [9]:
# Adding new statistics: Average score per subject
# Only the student columns are averaged, so the derived 'Subject Average'
# column is never fed back into its own calculation if this cell is run again.
# axis=1 takes the mean across the columns of each row, i.e. per subject
# (across students) in this layout.
df_transposed['Subject Average'] = df_transposed[names].mean(axis=1)
print("\nDataFrame with Subject Averages:\n", df_transposed)


DataFrame with Subject Averages:
          Alice  Bob  Charlie  David  Eve  Subject Average
Math        88   57       72     85   71             74.6
Science     78   70       60     89   51             69.6
English     64   88       60     73   73             71.6
Bangla      92   68       73     52   93             75.6


In [10]:
# Advanced Example: Conditional Filtering, Grouping, and Custom Functions
# Advanced 2.1: Conditional filtering
# Build a boolean mask marking the students whose Science score is strictly
# greater than 80, then keep only those rows of df.
science_above_80 = df['Science'] > 80
high_scorers_science = df.loc[science_above_80]
print("\nStudents scoring above 80 in Science:\n", high_scorers_science)


Students scoring above 80 in Science:
        Math  Science  English  Bangla  Average
David    85       89       73      52    74.75


In [11]:
# Advanced 2.2: Adding a performance category column
# Define a function to categorize students based on their average score
def performance(avg, outstanding_min=85, good_min=70):
    """Map an average score to a performance category label.

    Args:
        avg: The average score to classify.
        outstanding_min: Lowest average (inclusive) labelled 'Outstanding'.
            Defaults to 85.
        good_min: Lowest average (inclusive) labelled 'Good'. Defaults to 70.

    Returns:
        'Outstanding' if avg >= outstanding_min, otherwise 'Good' if
        avg >= good_min, otherwise 'Needs Improvement'.

    Note:
        A NaN average compares false against both thresholds and therefore
        falls through to 'Needs Improvement'.
    """
    if avg >= outstanding_min:
        return 'Outstanding'
    if avg >= good_min:
        return 'Good'
    return 'Needs Improvement'

# Classify every student's average with performance() and store the
# resulting labels in a new 'Performance' column.
performance_labels = df['Average'].apply(performance)
df['Performance'] = performance_labels
print("\nDataFrame with Performance Categories:\n", df)


DataFrame with Performance Categories:
          Math  Science  English  Bangla  Average        Performance
Alice      88       78       64      92    80.50               Good
Bob        57       70       88      68    70.75               Good
Charlie    72       60       60      73    66.25  Needs Improvement
David      85       89       73      52    74.75               Good
Eve        71       51       73      93    72.00               Good


In [12]:
# Advanced 2.3: Grouping and aggregation
# Named aggregations: each output column maps to (source column, function).
# All four statistics are computed from the 'Average' column.
summary_spec = {
    'Average_Score': ('Average', 'mean'),
    'Max_Score': ('Average', 'max'),
    'Min_Score': ('Average', 'min'),
    'Count': ('Average', 'count'),
}

# One summary row per performance category
performance_summary = df.groupby('Performance').agg(**summary_spec)
print("\nPerformance Summary by Category:\n", performance_summary)


Performance Summary by Category:
                    Average_Score  Max_Score  Min_Score  Count
Performance                                                  
Good                       74.50      80.50      70.75      4
Needs Improvement          66.25      66.25      66.25      1


In [13]:
# Advanced 2.4: Identify the student with the highest score in each subject
# Restrict df to the subject columns, then take idxmax() per column: the
# result maps each subject to the index label (student name) of its top score.
top_in_each_subject = df.loc[:, subjects].idxmax()
print("\nTop Student in Each Subject:\n", top_in_each_subject)


Top Student in Each Subject:
 Math       Alice
Science    David
English      Bob
Bangla       Eve
dtype: object
