# Predicția Scorului la Examen (Exam Score Prediction)

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, OrdinalEncoder
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

## Load Data

In [2]:
# Read the training and test CSV files from the working directory in one pass,
# then display the training frame.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ("train_data.csv", "test_data.csv"))
df_train

Unnamed: 0,ID,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,...,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,1,27,79,Low,High,Yes,8,63,High,Yes,...,Low,Medium,Public,Negative,5,No,College,Moderate,Female,69
1,2,16,86,High,Medium,Yes,7,94,Medium,Yes,...,Low,High,Public,Neutral,3,No,High School,Moderate,Female,69
2,3,22,87,Low,Medium,No,8,83,Low,Yes,...,Low,Medium,Public,Neutral,1,No,College,Far,Male,66
3,4,18,100,High,Medium,Yes,10,86,Medium,Yes,...,Medium,Medium,Public,Neutral,3,No,High School,Near,Male,72
4,5,35,78,High,Low,Yes,10,99,Medium,Yes,...,Low,Medium,Private,Positive,2,No,High School,Near,Male,72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5280,5281,15,82,Medium,Medium,Yes,7,93,Medium,Yes,...,Low,High,Public,Negative,2,No,High School,Moderate,Female,66
5281,5282,20,65,Medium,Medium,Yes,8,97,High,Yes,...,Low,Medium,Public,Negative,3,No,College,Near,Female,65
5282,5283,17,64,High,Low,Yes,10,63,Medium,Yes,...,High,Medium,Public,Positive,3,No,High School,Moderate,Female,62
5283,5284,16,100,High,High,Yes,7,82,Medium,Yes,...,High,Medium,Public,Positive,2,No,High School,Near,Male,73


## Subtask 1

In [3]:
# Mean of Hours_Studied over the training set, converted to a plain Python float.
avg_studied = float(df_train["Hours_Studied"].mean())
avg_studied

19.967076631977296

In [4]:
# Per-row absolute distance between test Hours_Studied and the training mean.
abs_diff_studied = df_test["Hours_Studied"].sub(avg_studied).abs()
abs_diff_studied

0       0.032923
1       2.032923
2       1.032923
3       7.967077
4       1.032923
          ...   
1317    0.032923
1318    7.032923
1319    3.032923
1320    1.032923
1321    3.032923
Name: Hours_Studied, Length: 1322, dtype: float64

In [5]:
# One (subtask id, test ID, answer) tuple per test row for subtask 1.
subtask1_rows = [
    (1, test_id, answer)
    for test_id, answer in zip(df_test["ID"], abs_diff_studied)
]

## Subtask 2

In [6]:
# Boolean mask: True where the test row has fewer than 7 Sleep_Hours.
sleep_less = df_test["Sleep_Hours"].lt(7)
sleep_less

0       False
1       False
2        True
3       False
4       False
        ...  
1317    False
1318    False
1319     True
1320     True
1321     True
Name: Sleep_Hours, Length: 1322, dtype: bool

In [7]:
# One (subtask id, test ID, answer) tuple per test row for subtask 2.
subtask2_rows = [
    (2, test_id, answer)
    for test_id, answer in zip(df_test["ID"], sleep_less)
]

## Subtask 3

In [8]:
def _count_train_scores_at_least(score):
    """Return how many training rows have Previous_Scores >= `score`."""
    return df_train["Previous_Scores"].ge(score).sum()


# For every test row, count the training rows whose Previous_Scores is at
# least as large as that row's Previous_Scores.
num_ge_score = df_test["Previous_Scores"].map(_count_train_scores_at_least)
num_ge_score

0       1424
1        267
2       5028
3       2067
4        580
        ... 
1317    1651
1318     902
1319    4199
1320    5285
1321     478
Name: Previous_Scores, Length: 1322, dtype: int64

In [9]:
# One (subtask id, test ID, answer) tuple per test row for subtask 3.
# The rows are collected in subtask3_rows itself. Appending them to
# subtask2_rows would leave this list empty and would duplicate the subtask 3
# rows inside the subtask 2 list every time the cell is re-run.
subtask3_rows = [
    (3, test_id, answer)
    for test_id, answer in zip(df_test["ID"], num_ge_score)
]

## Subtask 4

In [10]:
# Frequency of each Motivation_Level value in the training set, as a
# {level: count} dictionary.
motiv_level_counts = df_train["Motivation_Level"].value_counts()
train_motiv_levels = motiv_level_counts.to_dict()
train_motiv_levels

{'Medium': 2688, 'Low': 1558, 'High': 1039}

In [11]:
def _train_count_for_level(level):
    """Look up the training-set count for `level`; raises KeyError if absent."""
    return train_motiv_levels[level]


# For every test row, the number of training rows sharing its Motivation_Level.
eq_motiv_level = df_test["Motivation_Level"].map(_train_count_for_level)
eq_motiv_level

0       1039
1       1558
2       1039
3       1558
4       2688
        ... 
1317    2688
1318    2688
1319    1039
1320    1558
1321    2688
Name: Motivation_Level, Length: 1322, dtype: int64

In [12]:
# One (subtask id, test ID, answer) tuple per test row for subtask 4.
subtask4_rows = [
    (4, test_id, answer)
    for test_id, answer in zip(df_test["ID"], eq_motiv_level)
]

## Subtask 5

In [13]:
# Split the training frame into the target column (Exam_Score) and the
# feature columns; ID is dropped from the features of both sets.
# drop() returns new frames, so df_train and df_test are left untouched.
y_train = df_train["Exam_Score"].copy()
X_train = df_train.drop(columns=["ID", "Exam_Score"])
X_test = df_test.drop(columns="ID")

In [14]:
# Show the dtype of every feature column; the object-typed ones are handled as
# categorical and the remaining ones as numeric in the cells that follow.
X_train.dtypes

Hours_Studied                  int64
Attendance                     int64
Parental_Involvement          object
Access_to_Resources           object
Extracurricular_Activities    object
Sleep_Hours                    int64
Previous_Scores                int64
Motivation_Level              object
Internet_Access               object
Tutoring_Sessions              int64
Family_Income                 object
Teacher_Quality               object
School_Type                   object
Peer_Influence                object
Physical_Activity              int64
Learning_Disabilities         object
Parental_Education_Level      object
Distance_from_Home            object
Gender                        object
dtype: object

In [15]:
# Partition the feature columns by dtype: object columns are categorical,
# everything else is numeric. Column order is preserved in both lists.
cat_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
num_cols = [col for col in X_train.columns if col not in cat_cols]

# Print the distinct values of each categorical column for inspection.
for col in cat_cols:
    print(f"{col:<30}", X_train[col].unique())

Parental_Involvement           ['Low' 'High' 'Medium']
Access_to_Resources            ['High' 'Medium' 'Low']
Extracurricular_Activities     ['Yes' 'No']
Motivation_Level               ['High' 'Medium' 'Low']
Internet_Access                ['Yes' 'No']
Family_Income                  ['Low' 'Medium' 'High']
Teacher_Quality                ['Medium' 'High' nan 'Low']
School_Type                    ['Public' 'Private']
Peer_Influence                 ['Negative' 'Neutral' 'Positive']
Learning_Disabilities          ['No' 'Yes']
Parental_Education_Level       ['College' 'High School' 'Postgraduate' nan]
Distance_from_Home             ['Moderate' 'Far' 'Near' nan]
Gender                         ['Female' 'Male']


In [16]:
# Column-wise preprocessing: one-hot encode the object-typed columns and
# standardise the numeric ones.
# handle_unknown="ignore" makes the encoder emit all zeros for a category it
# did not see during fit instead of raising at transform time. Some of the
# categorical columns contain NaN (see the unique values printed above), and
# such a rare value may be missing from an individual training fold or appear
# only in the test set.
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", StandardScaler(), num_cols),
])

In [17]:
# Full model: preprocessing step followed by an ordinary least-squares regressor.
model_steps = [
    ("pre", preprocessor),
    ("reg", LinearRegression()),
]
pipeline = Pipeline(steps=model_steps)

In [18]:
# 5-fold cross-validated MAE. The scorer reports negated MAE, so flip the sign
# to display positive error values.
neg_mae_per_fold = cross_val_score(
    pipeline,
    X_train,
    y_train,
    cv=5,
    scoring="neg_mean_absolute_error",
)
-neg_mae_per_fold

array([0.4548276 , 0.43566711, 0.50935449, 0.5228945 , 0.6124915 ])

In [19]:
# Fit on the full training set, then predict Exam_Score for the test features.
# fit() returns the fitted pipeline itself, so the calls can be chained.
preds = pipeline.fit(X_train, y_train).predict(X_test)

In [20]:
# One (subtask id, test ID, answer) tuple per test row for subtask 5.
subtask5_rows = [
    (5, test_id, answer)
    for test_id, answer in zip(df_test["ID"], preds)
]

## Save answers

In [21]:
# Concatenate the per-subtask answer rows in subtask order and write them to
# submission.csv without the DataFrame index.
submission_rows = [
    *subtask1_rows,
    *subtask2_rows,
    *subtask3_rows,
    *subtask4_rows,
    *subtask5_rows,
]
submission_columns = ["subtaskID", "datapointID", "answer"]
df_submission = pd.DataFrame(submission_rows, columns=submission_columns)
df_submission.to_csv("submission.csv", index=False)

## Submission results

Subtask 1:
- Accuracy: 1
- Score: 10/10

Subtask 2:
- Accuracy: 1
- Score: 10/10

Subtask 3:
- Accuracy: 1
- Score: 10/10

Subtask 4:
- Accuracy: 1
- Score: 10/10

Subtask 5:
- MAE: 0.449949
- Score: 60/60