In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import joblib

In [2]:
# Read the raw student records from CSV.
# skiprows=24 discards the first 24 lines of the file (NOTE(review): presumably a
# descriptive preamble -- confirm against the file). header=None means no row is
# used for column names, so the columns carry integer labels at this point.
data = pd.read_csv(
    '../data/Student data.csv',
    header=None,
    skiprows=24,
)

In [3]:
# Working copy of the raw frame.
# `data` is already a DataFrame (it is the return value of pd.read_csv), so
# wrapping it in pd.DataFrame(...) converts nothing and, by default, does not
# copy the underlying values. Take an explicit copy instead so that the in-place
# edits applied to `df` further down (column renaming, '?' replacement) can never
# leak back into `data`, and so this cell can be re-run to get a pristine `df`.
df = data.copy()


In [4]:
# Column labels for the loaded frame, in file order (one label per CSV column).
columns = [
    'firstTermGpa',
    'secondTermGpa',
    'firstLanguage',
    'funding',
    'school',
    'fastTrack',
    'coop',
    'residency',
    'gender',
    'previousEducation',
    'ageGroup',
    'highSchoolAverageMark',
    'mathScore',
    'englishScore',
    'firstYearPersistence',
]

In [5]:
# Attach the descriptive labels to the positional columns.
# Raises if the number of labels does not match the number of columns.
df = df.set_axis(columns, axis=1)


In [6]:

# The file marks missing values with a literal '?'; turn every such cell into NaN
# so the imputers further down can recognise it as missing.
df = df.replace(to_replace='?', value=np.nan)

In [7]:
# Columns routed to the numeric pipeline (mean imputation + scaling).
numeric_features = [
    'firstTermGpa',
    'secondTermGpa',
    'highSchoolAverageMark',
    'mathScore',
]

# Columns routed to the categorical pipeline (mode imputation + one-hot encoding).
# Note that 'englishScore' is treated as a category here, unlike 'mathScore'.
# 'firstYearPersistence' appears in neither list
# (NOTE(review): presumably because it is the prediction target -- confirm).
categorical_features = [
    'firstLanguage',
    'funding',
    'school',
    'fastTrack',
    'coop',
    'residency',
    'gender',
    'previousEducation',
    'ageGroup',
    'englishScore',
]

In [10]:
# Numeric branch: fill missing values with the column mean, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' keeps transform() from raising on
# categories that were not present when the encoder was fitted.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [11]:

# Route each column group to its own pipeline and concatenate the results.
# Output columns follow the order of this list: numeric block first, then the
# one-hot block. No `remainder` is given, so the ColumnTransformer default
# applies to columns that are not named in either group.
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [12]:
# Fit the preprocessor on the whole frame and transform it in the same call.
# The imputation values, scaling statistics and one-hot categories are all
# learned from `df` here, so the fitted `preprocessor` reflects this dataset.
# The result is not a DataFrame; it is converted back to a labelled DataFrame
# in the following step.
df_processed = preprocessor.fit_transform(df)

In [13]:
# Rebuild the output column labels in the order the ColumnTransformer emits them:
# the numeric columns first (names unchanged), then the one-hot columns generated
# by the fitted encoder inside the 'cat' pipeline.
numeric_columns = numeric_features
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
encoded_categories = onehot_encoder.get_feature_names_out(categorical_features)
all_columns = list(numeric_columns) + list(encoded_categories)

# ColumnTransformer returns a SciPy sparse matrix whenever the overall density of
# the stacked output is below its sparse_threshold (0.3 by default), which depends
# on how many one-hot columns the data produces. pd.DataFrame cannot be built from
# such a matrix with `columns=...`, so densify first when that happens.
# (A DataFrame has no `toarray`, so re-running this cell is a no-op here.)
if hasattr(df_processed, 'toarray'):
    df_processed = df_processed.toarray()

# Convert the result back to a DataFrame with column names
df_processed = pd.DataFrame(df_processed, columns=all_columns)

print(df_processed)

      firstTermGpa  secondTermGpa  highSchoolAverageMark     mathScore  \
0        -2.437302      -2.651716              -2.165197 -1.877967e+00   
1        -0.296368      -0.771183               0.000000  8.058823e-16   
2         1.202286       1.037021               1.770931  9.574794e-01   
3         0.149660      -0.468955               0.000000  8.058823e-16   
4         1.223695       1.416744               2.367314  8.058823e-16   
...            ...            ...                    ...           ...   
1432     -1.227209      -2.651716               0.000000  1.637986e+00   
1433      1.296487       1.178998               0.000000  8.058823e-16   
1434     -0.706714      -0.348064               0.101059 -1.651131e+00   
1435      1.006809       1.445158               0.000000  1.524569e+00   
1436     -0.610372       0.362667               0.458888 -6.303706e-01   

      firstLanguage_1  firstLanguage_2  firstLanguage_3  funding_1  funding_2  \
0                 1.0         

In [14]:
# Persist the preprocessed feature table as CSV (row index is not written).
cleaned_csv_path = '../data/fixed_data.csv'
df_processed.to_csv(cleaned_csv_path, index=False)
print("Cleaned data saved to 'fixed_data.csv'")

Cleaned data saved to 'fixed_data.csv'


In [15]:
# Persist the fitted preprocessor so the exact same transformation can be
# re-applied later. The file is a pickle: only load it from a trusted location.
preprocessor_path = '../models/preprocessor_pipeline.pkl'
joblib.dump(preprocessor, preprocessor_path)

['../models/preprocessor_pipeline.pkl']