This pipeline consists of common data-processing steps and can be reused across different datasets.

You just need to change the input data (including the feature and label columns) and the lists of numeric and categorical (class) columns.

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# IPython magic (not plain Python): change the working directory to the course
# folder; the spaces in the folder name are backslash-escaped.
# Presumably this is where students_m5.csv is stored -- adjust to your own path.
%cd /content/drive/MyDrive/IT7143\ Module\ 5

In [None]:
import pandas as pd
import numpy as np

# Load the raw student records; the file name is relative to the current working directory.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which looks like a
# saved row index -- consider reading with index_col=0 if that is the case.
students = pd.read_csv('students_m5.csv')
# A bare expression on the last line of a notebook cell displays the DataFrame.
students

Unnamed: 0,StudentID,FirstName,LastName,Major,HighSchoolGPA,FamilyIncome,State,AvgDailyStudyTime,TotalAbsence,FirstYearGPA,isGRA
0,202303595,Baxter,Dengler,Computer Science,2.82,45013,WA,2.01,14.0,1.93,0
1,202309162,Christian,Wickey,Data Science,3.07,128358,GA,5.41,,2.76,0
2,202306337,Lonnie,Wulff,Software Engineering,2.68,112392,GA,9.57,13.0,3.09,0
3,202306072,Mitchell,Deshotel,Software Engineering,3.21,190846,GA,8.57,16.0,3.08,0
4,202301733,Linwood,Willing,Information Technology,3.44,187163,GA,6.24,20.0,2.73,0
...,...,...,...,...,...,...,...,...,...,...,...
995,202302372,Michael,Richman,Computer Science,4.00,32210,SC,8.84,16.0,3.31,1
996,202309892,Lacy,Anton,Software Engineering,3.02,163481,GA,6.61,17.0,2.53,0
997,202308310,Ell,Benke,Software Engineering,2.05,45446,GA,3.68,30.0,1.77,0
998,202305648,Elzie,Enderle,Information Technology,2.19,44714,GA,2.74,17.0,2.11,0


Change the list of columns to drop and the label column to fit your data.

In [None]:
from sklearn.model_selection import train_test_split

# Predictors: everything except identifiers/names and the two outcome columns
# (the regression target 'FirstYearGPA' and 'isGRA').
features = students.drop(
    columns=['StudentID','FirstName','LastName','FirstYearGPA','isGRA']
)
# Target to predict.
labels = students['FirstYearGPA']

# Hold out 20% of the rows for testing. No random_state is given, so the
# split differs from run to run.
trainX, testX, trainY, testY = train_test_split(
    features,
    labels,
    test_size=0.2,
)

Change the two lists below (the names of the numeric columns and of the categorical columns) to fit your data.

In [None]:
# Names of the numeric feature columns; these are the columns handed to the
# numeric pipeline and the ones outlier_clip reads from trainX.
num_cols = ['HighSchoolGPA','FamilyIncome','AvgDailyStudyTime','TotalAbsence']
# Names of the categorical (class) feature columns; these are the columns handed
# to the class pipeline and the ones remove_rare_classes iterates over.
cat_cols = ['Major','State']

In [None]:
#function to clip outliers
def outlier_clip(data: pd.DataFrame, n_sd: float = 4) -> pd.DataFrame:
    """Clip every numeric column to mean +/- ``n_sd`` standard deviations.

    The mean and standard deviation are always computed from the module-level
    training split (``trainX[num_cols]``), never from ``data`` itself, so the
    same bounds are applied to training and testing data.

    Args:
        data: DataFrame holding the numeric columns (the ``num_cols`` slice
            when called from the pipeline). It is not modified.
        n_sd: Half-width of the allowed range in training standard
            deviations. Defaults to 4; pass another value through
            ``FunctionTransformer(outlier_clip, kw_args={'n_sd': ...})``.

    Returns:
        A new DataFrame with out-of-range values replaced by the nearest bound.
    """
    train_numeric = trainX[num_cols]
    center = train_numeric.mean()
    half_width = n_sd * train_numeric.std()
    # axis=1 aligns the per-column bounds (Series indexed by column name)
    # with the columns of data.
    return data.clip(lower=center - half_width, upper=center + half_width, axis=1)

#function to log transform
def log_transform(data: pd.DataFrame, offset: float = 0.001) -> pd.DataFrame:
    """Append a natural-log version of every column to ``data``.

    For each column ``c`` a new column ``c_log = log(c + offset)`` is added
    to the right of the original columns; the originals are kept unchanged.

    Args:
        data: DataFrame of numeric columns. It is not modified.
        offset: Constant added before taking the log, so that zeros do not
            map to -inf. Defaults to 0.001; pass another value through
            ``FunctionTransformer(log_transform, kw_args={'offset': ...})``.

    Returns:
        A new DataFrame with the original columns followed by the ``*_log``
        columns.
    """
    logged = np.log(data + offset).add_suffix('_log')
    return pd.concat([data, logged], axis=1)

#function to remove rare classes
def remove_rare_classes(data: pd.DataFrame, rare_threshold: int = 40) -> pd.DataFrame:
    """Replace infrequent classes in every categorical column with 'Other'.

    Class frequencies are always counted on the module-level training split
    (``trainX``), so training and testing data are mapped with the same set
    of kept classes. A class is kept only if it occurs strictly more than
    ``rare_threshold`` times in the training column; every other value in
    ``data`` (rare classes, classes never seen in training, and values that
    do not match any kept class) becomes the string 'Other'.

    Args:
        data: DataFrame containing the ``cat_cols`` columns. It is not
            modified.
        rare_threshold: Training count at or below which a class is treated
            as rare. Defaults to 40; pass another value through
            ``FunctionTransformer(remove_rare_classes,
            kw_args={'rare_threshold': ...})``.

    Returns:
        A copy of ``data`` with rare classes replaced by 'Other'.
    """
    cleaned = data.copy()
    for col in cat_cols:
        train_counts = trainX[col].value_counts()
        frequent = train_counts.index[train_counts > rare_threshold]
        is_rare = ~cleaned[col].isin(frequent)
        cleaned.loc[is_rare, col] = 'Other'
    return cleaned

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

#pipeline for numeric columns
# Steps run in the order listed:
#   1. 'outlier clip'  - clip values using outlier_clip
#   2. 'log transform' - append log-transformed copies of the columns using log_transform
#   3. 'standardize'   - scale with StandardScaler
#   4. 'impute'        - fill remaining missing values with the median
# NOTE(review): imputation runs last, so the clipping, log and scaling steps all
# receive missing values as NaN -- confirm this ordering is intended.
num_pipeline = Pipeline([
    ('outlier clip', FunctionTransformer(outlier_clip)),
    ('log transform', FunctionTransformer(log_transform)),
    ('standardize', StandardScaler()),
    ('impute', SimpleImputer(strategy='median'))
])

#pipeline for class columns
# Steps run in the order listed:
#   1. 'remove rare classes' - map infrequent classes to 'Other' using remove_rare_classes
#   2. 'encode'              - one-hot encode the resulting classes
# handle_unknown='ignore' is required for the pipeline to be reusable: if no
# training row was mapped to 'Other' in some column but a testing row is,
# the default handle_unknown='error' makes transform() raise. With 'ignore',
# such a row is encoded as all zeros for that column instead.
cat_pipeline = Pipeline([
    ('remove rare classes', FunctionTransformer(remove_rare_classes)),
    ('encode', OneHotEncoder(handle_unknown='ignore'))
])

from sklearn.compose import ColumnTransformer

#combining
# Route the num_cols columns through num_pipeline and the cat_cols columns through
# cat_pipeline; the two outputs are combined into a single feature matrix.
# NOTE(review): no `remainder` is set, so columns listed in neither num_cols nor
# cat_cols are presumably dropped (ColumnTransformer's documented default) -- confirm.
full_pipeline = ColumnTransformer([
    ('numeric', num_pipeline, num_cols),
    ('class', cat_pipeline, cat_cols)
])

#use the built pipeline to process training and testing data
# Fit on the training split only, then reuse the fitted pipeline on the testing
# split (transform, not fit_transform) so no parameters are learned from test data.
trainX_prc = full_pipeline.fit_transform(trainX)
testX_prc = full_pipeline.transform(testX)

Models can now be trained on trainX_prc (with trainY) and tested on testX_prc (with testY).