### Import Libraries

In [13]:
import pandas as pd
import numpy as np
from summarytools import dfSummary
import plotly.express as px
from scipy.stats import chi2_contingency

### Read Dataset

In [14]:
# Load the answers dataset. The file is read as semicolon-delimited text decoded
# as ISO-8859-1; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/Math.csv', sep=';', encoding='ISO-8859-1')


### Data Exploration

In [15]:
# Preview the first rows to check that the delimiter and encoding produced the expected columns.
df.head()

Unnamed: 0,Student ID,Student Country,Question ID,Type of Answer,Question Level,Topic,Subtopic,Keywords
0,647,Ireland,77,0,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
1,41,Portugal,77,1,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
2,340,Portugal,77,1,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
3,641,Italy,77,0,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
4,669,Portugal,77,1,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."


In [16]:
# Column dtypes and non-null counts (used to check for missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9546 entries, 0 to 9545
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Student ID       9546 non-null   int64 
 1   Student Country  9546 non-null   object
 2   Question ID      9546 non-null   int64 
 3   Type of Answer   9546 non-null   int64 
 4   Question Level   9546 non-null   object
 5   Topic            9546 non-null   object
 6   Subtopic         9546 non-null   object
 7   Keywords         9546 non-null   object
dtypes: int64(3), object(5)
memory usage: 596.8+ KB


In [17]:
# Summary statistics for the numeric columns. 'Student ID' and 'Question ID' are
# identifiers, so their mean/std carry no meaning; for the 0/1 'Type of Answer'
# column the mean is the overall share of answers coded 1.
df.describe()

Unnamed: 0,Student ID,Question ID,Type of Answer
count,9546.0,9546.0,9546.0
mean,775.402263,478.912319,0.468259
std,460.590559,249.244061,0.499018
min,26.0,77.0,0.0
25%,380.0,323.0,0.0
50%,885.0,428.0,0.0
75%,1219.0,571.0,1.0
max,1565.0,1549.0,1.0


In [18]:
# Per-column overview from summarytools: value frequencies, distinct counts and missing values.
# NOTE(review): is_collapsible=False presumably keeps the table expanded — confirm in the summarytools docs.
dfSummary(df, is_collapsible=False)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,Student ID [int64],Mean (sd) : 775.4 (460.6) min < med < max: 26.0 < 885.0 < 1565.0 IQR (CV) : 839.0 (1.7),372 distinct values,,0 (0.0%)
2,Student Country [object],1. Portugal 2. Lithuania 3. Italy 4. Slovenia 5. Ireland 6. Russian Federation 7. Romania 8. Spain,"5,495 (57.6%) 1,443 (15.1%) 1,358 (14.2%) 755 (7.9%) 300 (3.1%) 107 (1.1%) 60 (0.6%) 28 (0.3%)",,0 (0.0%)
3,Question ID [int64],Mean (sd) : 478.9 (249.2) min < med < max: 77.0 < 428.0 < 1549.0 IQR (CV) : 248.0 (1.9),833 distinct values,,0 (0.0%)
4,Type of Answer [int64],1. 0 2. 1,"5,076 (53.2%) 4,470 (46.8%)",,0 (0.0%)
5,Question Level [object],1. Basic 2. Advanced,"7,844 (82.2%) 1,702 (17.8%)",,0 (0.0%)
6,Topic [object],1. Linear Algebra 2. Fundamental Mathematics 3. Complex Numbers 4. Differentiation 5. Analytic Geometry 6. Statistics 7. Numerical Methods 8. Optimization 9. Real Functions of a single var 10. Integration 11. other,"5,726 (60.0%) 818 (8.6%) 592 (6.2%) 579 (6.1%) 358 (3.8%) 340 (3.6%) 310 (3.2%) 182 (1.9%) 164 (1.7%) 144 (1.5%) 333 (3.5%)",,0 (0.0%)
7,Subtopic [object],"1. Vector Spaces 2. Linear Transformations 3. Complex Numbers 4. Algebraic expressions, Equatio 5. Linear Systems 6. Analytic Geometry 7. Statistics 8. Elementary Geometry 9. Derivatives 10. Numerical Methods 11. other","2,749 (28.8%) 2,127 (22.3%) 592 (6.2%) 496 (5.2%) 420 (4.4%) 358 (3.8%) 340 (3.6%) 322 (3.4%) 317 (3.3%) 310 (3.2%) 1,515 (15.9%)",,0 (0.0%)
8,Keywords [object],"1. Linear application,Linearity 2. Simplify expressions 3. Subspace,Linear combination,Sp 4. Linear independence,Span,Linea 5. Range,Kernel 6. Matrix of a linear transformat 7. Subspace,Span,Linear combinati 8. Basis,Dimension 9. Linear combination,Subspace,Sp 10. Basis 11. other","443 (4.6%) 401 (4.2%) 401 (4.2%) 399 (4.2%) 344 (3.6%) 314 (3.3%) 216 (2.3%) 210 (2.2%) 198 (2.1%) 186 (1.9%) 6,434 (67.4%)",,0 (0.0%)


### Questions to be answered

- Are there differences in performance (Type of Answer) between students from different countries (Student Country)?<br>
- Which countries have the highest percentage of correct answers (Type of Answer = 1)?<br>
- How does the distribution of correct vs incorrect answers vary by country?<br>
- Is there a pattern in the Type of Answer (correct or incorrect) by the Question Level?<br>
- What is the average success rate on different question levels (e.g., easy vs hard)?<br>
- Which topics are students performing the best in? (i.e., highest percentage of correct answers per Topic)<br>
- Are there specific subtopics where students struggle more often?<br>

### 1. Are there differences in performance (Type of Answer) between students from different countries (Student Country)?

In [19]:
# Share of incorrect (0) and correct (1) answers within each country, in percent.
# fill_value=0 makes a country that has no answers of one kind show 0 % instead
# of NaN, so every row still adds up to 100.
country_performance = (
    df.groupby('Student Country')['Type of Answer']
      .value_counts(normalize=True)
      .unstack(fill_value=0)
      .rename(columns={0: 'Incorrect (%)', 1: 'Correct (%)'})
      .mul(100)
)
country_performance

Type of Answer,Incorrect (%),Correct (%)
Student Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Ireland,54.0,46.0
Italy,55.375552,44.624448
Lithuania,56.410256,43.589744
Portugal,54.613285,45.386715
Romania,41.666667,58.333333
Russian Federation,65.420561,34.579439
Slovenia,31.258278,68.741722
Spain,57.142857,42.857143


In [20]:
# Country x outcome table of answer counts; combinations that never occur are 0.
contingency_table = (
    df.groupby(['Student Country', 'Type of Answer'])
      .size()
      .unstack(fill_value=0)
)

# Chi-square test of independence between country and answer outcome.
# The call yields the statistic, the p-value, the degrees of freedom and the
# expected counts under independence; only the first two are reported here.
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-square test result: Chi2 = {chi2}, p-value = {p}")

Chi-square test result: Chi2 = 168.81726517873298, p-value = 4.457454344900484e-33


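A p-value below 0.05 indicates that the differences in performance between countries are statistically significant. Our p-value (≈ 4.46e-33) is far below 0.05, so there are significant differences in performance between students from different countries. Note that the chi-square test assumes independent observations, while here each student contributes many answers (9,546 answers from 372 students), so the p-value is likely optimistic.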

### 2. Which countries have the highest percentage of correct answers (Type of Answer = 1)?

In [21]:
# Fraction (0-1, not percent) of each country's answers that are correct.
# Taking the group-wise mean of the boolean "is correct" mask gives 0.0 for a
# country with no correct answers; dividing a filtered count by the total count
# would give NaN there, because the country is absent from the filtered frame.
correct_answer_percentage = (
    df['Type of Answer'].eq(1)
      .groupby(df['Student Country'])
      .mean()
)
sorted_countries = correct_answer_percentage.sort_values(ascending=False)
sorted_countries.head(10)

Student Country
Slovenia              0.687417
Romania               0.583333
Ireland               0.460000
Portugal              0.453867
Italy                 0.446244
Lithuania             0.435897
Spain                 0.428571
Russian Federation    0.345794
dtype: float64

Slovenia is the country with the highest percentage of correct answers (68.7%), followed by Romania with 58.3% and then Ireland with 46.0%.

### 3. How does the distribution of correct vs incorrect answers vary by country?

In [22]:
# Long-format table: one row per (country, answer outcome) with its answer count.
distribution = (
    df.groupby(['Student Country', 'Type of Answer'])
      .size()
      .rename('Count')
      .reset_index()
)

# Each row's share of its own country's answers, in percent.
total_counts = distribution.groupby('Student Country')['Count'].transform('sum')
distribution['Percentage'] = distribution['Count'].div(total_counts).mul(100)

In [23]:
# Plotly Express treats a numeric `color` column as a continuous scale (colour
# bar, no per-category legend, discrete colour maps ignored). Map the 0/1 codes
# to labels on a copy so the chart gets one trace per outcome; `distribution`
# itself is left untouched.
answer_labels = {0: 'Incorrect', 1: 'Correct'}
distribution_plot = distribution.assign(
    **{'Type of Answer': distribution['Type of Answer'].map(answer_labels)}
)

# Every stacked bar totals 100 %, so ordering by bar total cannot rank the
# countries. Order them by their share of correct answers instead.
country_order = (
    distribution_plot.loc[distribution_plot['Type of Answer'] == 'Correct']
    .sort_values('Percentage', ascending=False)['Student Country']
    .tolist()
)

fig = px.bar(
    distribution_plot,
    x='Student Country',
    y='Percentage',
    color='Type of Answer',
    text='Percentage',
    color_discrete_map={'Incorrect': 'red', 'Correct': 'green'},
    category_orders={
        'Student Country': country_order,
        'Type of Answer': ['Correct', 'Incorrect'],
    },
    title='Distribution of Correct vs Incorrect Answers by Country',
    labels={'Percentage': 'Percentage of Answers', 'Type of Answer': 'Type of Answer'},
    barmode='stack',
    height=600,
    width=1000
)

fig.update_traces(texttemplate='%{text:.2f}%', textposition='inside')
fig.update_layout(
    xaxis_title="Student Country",
    yaxis_title="Percentage of Answers",
    legend_title="Type of Answer",
)

fig.show()

### 4. Is there a pattern in the Type of Answer (correct or incorrect) by the Question Level?

In [24]:
# Long-format table: one row per (question level, answer outcome) with its answer count.
level_distribution = (
    df.groupby(['Question Level', 'Type of Answer'])
      .size()
      .rename('Count')
      .reset_index()
)

# Each row's share of its own question level's answers, in percent.
total_counts_level = level_distribution.groupby('Question Level')['Count'].transform('sum')
level_distribution['Percentage'] = level_distribution['Count'].div(total_counts_level).mul(100)

In [25]:
# A numeric `color` column is drawn as a continuous scale in a single trace, so
# barmode='group' has nothing to group and the legend is replaced by a colour
# bar. Map the 0/1 codes to labels on a copy so each outcome gets its own trace;
# `level_distribution` itself is left untouched.
answer_labels = {0: 'Incorrect', 1: 'Correct'}
level_distribution_plot = level_distribution.assign(
    **{'Type of Answer': level_distribution['Type of Answer'].map(answer_labels)}
)

fig = px.bar(
    level_distribution_plot,
    x='Question Level',
    y='Percentage',
    color='Type of Answer',
    text='Percentage',
    category_orders={'Type of Answer': ['Correct', 'Incorrect']},
    title='Distribution of Correct vs Incorrect Answers by Question Level',
    labels={'Percentage': 'Percentage of Answers', 'Type of Answer': 'Type of Answer'},
    barmode='group',
    height=600,
    width=800
)

fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')

fig.update_layout(
    xaxis_title="Question Level",
    yaxis_title="Percentage of Answers",
    legend_title="Type of Answer",
)

fig.show()

### 5. What is the average success rate on different question levels (e.g., easy vs hard)?

In [26]:
# Percentage of correct answers per question level.
# The group-wise mean of the boolean "is correct" mask yields 0 % for a level
# with no correct answers, where a filtered-count / total-count division would
# yield NaN because that level is missing from the filtered frame.
success_rate = (
    df['Type of Answer'].eq(1)
      .groupby(df['Question Level'])
      .mean()
      .mul(100)
)
success_rate

Question Level
Advanced    50.117509
Basic       46.111678
dtype: float64

In [27]:
# Two-column frame (question level, success rate) for Plotly Express.
success_rate_df = success_rate.rename('Success Rate (%)').reset_index()

# One bar per question level; the bar colour encodes the same rate as its height.
fig = px.bar(
    success_rate_df,
    x='Question Level',
    y='Success Rate (%)',
    color='Success Rate (%)',
    text='Success Rate (%)',
    labels={'Success Rate (%)': 'Success Rate (%)', 'Question Level': 'Question Level'},
    title='Average Success Rate by Question Level',
    width=800,
    height=600,
)

# Print the rate above each bar with two decimals.
fig.update_traces(textposition='outside', texttemplate='%{text:.2f}%')

fig.show()

### 6. Which topics are students performing the best in? (i.e., highest percentage of correct answers per Topic)

In [28]:
# Answer counts per topic: all answers, and correct answers only.
total_answers_by_topic = df.groupby('Topic').size()
# Reindex onto every topic so that a topic with no correct answers counts as 0
# rather than being absent, which would turn its success rate into NaN.
correct_answers_by_topic = (
    df[df['Type of Answer'] == 1]
    .groupby('Topic')
    .size()
    .reindex(total_answers_by_topic.index, fill_value=0)
)
topic_success_rate = (correct_answers_by_topic / total_answers_by_topic) * 100


# Best-performing topics first.
sorted_topic_success_rate = topic_success_rate.sort_values(ascending=False)
print("Top-Performing Topics (Highest Success Rate):")
print(sorted_topic_success_rate)

Top-Performing Topics (Highest Success Rate):
Topic
Set Theory                             64.285714
Graph Theory                           58.181818
Differential Equations                 53.703704
Linear Algebra                         49.022005
Analytic Geometry                      48.882682
Statistics                             47.941176
Fundamental Mathematics                46.577017
Complex Numbers                        45.439189
Integration                            44.444444
Numerical Methods                      38.709677
Optimization                           38.461538
Probability                            37.500000
Real Functions of a single variable    35.365854
Differentiation                        34.196891
dtype: float64


In [32]:
# Two-column frame (topic, success rate), sorted ascending so that the best
# topic ends up at the top of the horizontal bar chart.
topic_success_rate_df = (
    topic_success_rate
    .rename('Success Rate (%)')
    .rename_axis('Topic')
    .reset_index()
    .sort_values(by='Success Rate (%)', ascending=True)
)

fig = px.bar(
    topic_success_rate_df,
    y='Topic',
    x='Success Rate (%)',
    title='Success Rate by Topic',
    labels={'Success Rate (%)': 'Success Rate (%)', 'Topic': 'Topic'},
    text='Success Rate (%)',
    height=600,
    width=1000,
    color='Success Rate (%)'
)

fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
# The bars are horizontal: x carries the rate and y carries the topic, so the
# axis titles must follow that orientation. The numeric x ticks need no rotation.
fig.update_layout(xaxis_title="Success Rate (%)", yaxis_title="Topic")

fig.show()

### 7. Are there specific subtopics where students struggle more often?

In [33]:
# Answer counts per subtopic: all answers, and incorrect answers only.
total_answers_by_subtopic = df.groupby('Subtopic').size()
# Reindex onto every subtopic so that one with no incorrect answers counts as 0
# rather than being absent, which would turn its difficulty rate into NaN.
incorrect_answers_by_subtopic = (
    df[df['Type of Answer'] == 0]
    .groupby('Subtopic')
    .size()
    .reindex(total_answers_by_subtopic.index, fill_value=0)
)
subtopic_difficulty_rate = (incorrect_answers_by_subtopic / total_answers_by_subtopic) * 100


# Ascending order: the subtopics students struggle with most are at the END of
# this Series.
sorted_subtopic_difficulty = subtopic_difficulty_rate.sort_values(ascending=True)
sorted_subtopic_difficulty

Subtopic
Definite Integrals                                    26.666667
Set Theory                                            35.714286
Elementary Geometry                                   41.304348
Graph Theory                                          41.818182
Eigenvalues and Eigenvectors                          43.076923
Linear Systems                                        43.809524
Double Integration                                    44.444444
Differential Equations                                46.296296
Linear Transformations                                49.647391
Matrices and Determinants                             49.666667
Analytic Geometry                                     51.117318
Statistics                                            52.058824
Vector Spaces                                         53.619498
Complex Numbers                                       54.560811
Nonlinear Optimization                                56.349206
Integration Techniques         

In [34]:
# Two-column frame (subtopic, difficulty rate) in the order of the sorted Series.
subtopic_difficulty_df = sorted_subtopic_difficulty.rename('Difficulty Rate (%)').reset_index()

# Horizontal bars: one per subtopic, with length and colour both encoding the
# share of incorrect answers.
fig = px.bar(
    subtopic_difficulty_df,
    x='Difficulty Rate (%)',
    y='Subtopic',
    color='Difficulty Rate (%)',
    text='Difficulty Rate (%)',
    labels={'Difficulty Rate (%)': 'Difficulty Rate (%)', 'Subtopic': 'Subtopic'},
    title='Subtopics Where Students Struggle the Most',
    width=1000,
    height=600,
)

fig.update_traces(textposition='outside', texttemplate='%{text:.2f}%')
# tickmode='linear' asks for a tick label on every subtopic.
fig.update_layout(
    xaxis_title="Difficulty Rate (%)",
    yaxis_title="Subtopic",
    yaxis_tickmode='linear',
)

fig.show()