In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import  train_test_split

In [2]:
def rename_columns(df):
    """Return a copy of ``df`` with the short coded column names made descriptive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns use the coded names ('Y', 'Z', 'S3', 'C1', ...).

    Returns
    -------
    pandas.DataFrame
        A new frame with the coded columns renamed; ``df`` itself is not
        modified. Columns that are not listed in the mapping are left as-is.
    """
    # Coded name -> descriptive name.
    descriptive_names = {
        'Y': 'StudentAchievementScore',
        'Z': 'GrowthMindsetIntervention',
        'S3': 'FutureSuccessExpectations',
        'C1': 'StudentRaceEthnicity',
        'C2': 'StudentGender',
        'C3': 'FirstGenCollegeStatus',
        'XC': 'SchoolUrbanicity',
        'X1': 'PreInterventionFixedMindset',
        'X2': 'SchoolAchievementLevel',
        'X3': 'SchoolMinorityComposition',
        'X4': 'PovertyConcentration',
        'X5': 'TotalStudentPopulation'
    }
    # Rename the frame that was passed in. The previous body referenced an
    # unassigned local called `dataset`, which raised UnboundLocalError.
    return df.rename(columns=descriptive_names)

In [3]:
def treatment_outcome_and_control():
    """Return the column names used as covariates, treatment and outcome.

    Returns
    -------
    tuple
        ``(covariate_cols, treatment_col, outcome_col)`` where
        ``covariate_cols`` is a list of column names and the other two
        entries are single column names.
    """
    outcome_col = 'StudentAchievementScore'
    treatment_col = 'GrowthMindsetIntervention'

    # One covariate per line so additions/removals show up cleanly in diffs.
    covariate_cols = [
        'FutureSuccessExpectations',
        'StudentRaceEthnicity',
        'StudentGender',
        'FirstGenCollegeStatus',
        'SchoolUrbanicity',
        'PreInterventionFixedMindset',
        'SchoolAchievementLevel',
        'SchoolMinorityComposition',
        'PovertyConcentration',
        'TotalStudentPopulation',
    ]

    return covariate_cols, treatment_col, outcome_col

In [4]:
def split_train_test_data(dataset, treatment_col, test_size=0.2, random_state=42):
    """Split ``dataset`` into train and test partitions, stratified on treatment.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Full data set, including the treatment column.
    treatment_col : str
        Name of the column used for stratification.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``; the default keeps the 80/20 split.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(train_data, test_data)`` as returned by ``train_test_split``.
    """
    # Stratify on the treatment column so both partitions keep the same
    # treated/control proportions as the full data set.
    strata = dataset[treatment_col]
    train_data, test_data = train_test_split(
        dataset,
        test_size=test_size,
        random_state=random_state,
        stratify=strata,
    )
    return train_data, test_data

In [5]:
def staderize_categorical_and_numerical_features():
    """Build the (unfitted) transformer that preprocesses the covariates.

    Numeric columns are standardized with ``StandardScaler`` and categorical
    columns are one-hot encoded with ``OneHotEncoder``.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        Unfitted transformer; call ``fit_transform`` on the training
        covariates and ``transform`` on any held-out data.
    """
    # 'FutureSuccessExpectations' was previously in neither list, so the
    # ColumnTransformer (default remainder='drop') silently discarded it.
    # NOTE(review): it is scaled as a numeric/ordinal score here -- confirm
    # that is the intended encoding rather than one-hot.
    continuous_cols = ['FutureSuccessExpectations', 'PreInterventionFixedMindset',
                       'SchoolAchievementLevel', 'SchoolMinorityComposition',
                       'PovertyConcentration', 'TotalStudentPopulation']

    categorical_cols = ['StudentRaceEthnicity', 'StudentGender', 'FirstGenCollegeStatus', 'SchoolUrbanicity']

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), continuous_cols),
            # handle_unknown='ignore': a category that only appears in held-out
            # data is encoded as all zeros instead of raising at transform time.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ]
    )
    return preprocessor

In [6]:
def preprocessor(data_path='../data/dataset.csv', return_transformer=False):
    """Load the data set and return preprocessed training arrays plus the raw test split.

    Steps: read the CSV, rename the coded columns, split into train/test
    (stratified on the treatment column), then fit the feature transformer
    on the training covariates only.

    Parameters
    ----------
    data_path : str or path-like, default '../data/dataset.csv'
        Location of the CSV file to load.
    return_transformer : bool, default False
        When True, the fitted feature transformer is appended to the returned
        tuple so the caller can apply the same transformation to ``test_data``.

    Returns
    -------
    tuple
        ``(X_train_processed, y_train, treatment_train, test_data)``, or the
        same tuple followed by the fitted transformer when
        ``return_transformer`` is True. ``test_data`` is returned
        untransformed.
    """
    # Load data
    dataset = pd.read_csv(data_path)

    dataset = rename_columns(dataset)

    covariate_cols, treatment_col, outcome_col = treatment_outcome_and_control()

    # Split data into training and testing sets (with stratification on treatment variable)
    # since the distribution of data across treated and control population is imbalanced
    train_data, test_data = split_train_test_data(dataset, treatment_col)

    # Named differently from this function so the local does not shadow it.
    feature_transformer = staderize_categorical_and_numerical_features()

    # Fit on the training covariates only, so no statistics from the test
    # split leak into the scaler/encoder.
    X_train_processed = feature_transformer.fit_transform(train_data[covariate_cols])
    y_train = train_data[outcome_col].values
    treatment_train = train_data[treatment_col].values

    if return_transformer:
        return X_train_processed, y_train, treatment_train, test_data, feature_transformer
    return X_train_processed, y_train, treatment_train, test_data

* Renamed the columns
* Defined the covariate, treatment, and outcome columns
* Split the dataset into train and test sets (80/20)
* Standardized the continuous columns and one-hot encoded the categorical columns
