In [None]:
import altair as alt
import joblib
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from typing import List, Set
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, accuracy_score

In [None]:
# --- Run configuration -------------------------------------------------------
# Seed handed to the train/test split so the partition is reproducible.
RANDOM_SEED = 17

# Which feature table to load: 'use_simple_df' or 'wv_pca_df'.
df_type = 'wv_pca_df'

# Number of word-vector PCA columns expected when df_type == 'wv_pca_df'.
pca_components = 30

# When True, a categorical 'topic' column is attached to X.
use_topics = True

# Project root is taken to be the parent of the current working directory.
project_base = os.path.dirname(os.path.realpath(os.curdir))

print(
    f'Project base path: {project_base}',
    f'DF type: {df_type}',
    sep='\n',
)

In [None]:
# Read the feature tables (X, kaggle_X) and their targets (y, kaggle_y).
# The feature files depend on `df_type`; the target files are the same for both.
cleaned_data_dir = os.path.join(project_base, 'data', 'cleaned_data')

if df_type == 'use_simple_df':
    X = pd.read_csv(os.path.join(cleaned_data_dir, 'X_updated.csv'))
    kaggle_X = pd.read_csv(os.path.join(cleaned_data_dir, 'kaggle_X_updated.csv'))

elif df_type == 'wv_pca_df':
    X = pd.read_csv(os.path.join(cleaned_data_dir, 'X_pca_updated.csv'))
    kaggle_X = pd.read_csv(os.path.join(cleaned_data_dir, 'kaggle_X_pca_updated.csv'))
    # Names of the word-vector PCA columns; appended to the numeric columns later.
    wv_col_names = [f'wv_pca_cols{i}' for i in range(pca_components)]

else:
    # Fail immediately: merely printing a message let the notebook continue and
    # crash later with an unrelated NameError on `X`.
    raise ValueError(
        f"Please Choose valid df_type: got {df_type!r}, "
        "expected 'use_simple_df' or 'wv_pca_df'"
    )

y = pd.read_csv(os.path.join(cleaned_data_dir, 'y_updated.csv'))
kaggle_y = pd.read_csv(os.path.join(cleaned_data_dir, 'kaggle_y_updated.csv'))

### Data Check and Simple Clean

In [None]:
# Diagnostic (currently disabled): report the number of unique values in each
# column, to decide how categorical features should be encoded and whether any
# column should be removed.

# for col_name in list(X):
#     if X[col_name].value_counts().shape[0] == 1:
#         print(f'**WARNING** Column "{col_name}" has {X[col_name].value_counts().shape[0]} unique features.  REMOVE\n')
#     elif X[col_name].value_counts().shape[0] == 2:
#         print(f'**WARNING** Column "{col_name}" has {X[col_name].value_counts().shape[0]} unique features.  Should be encoded as boolean.\n')
#     else:
#         print(f'Column "{col_name}" has {X[col_name].value_counts().shape[0]} unique features\n')
        
        

In [None]:
# Drop columns with only 1 unique value.
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps this
# cell idempotent: re-running it after the columns are already gone no longer
# raises KeyError.
constant_cols = ['Bigram_avg', 'Bigram_max']
X = X.drop(columns=constant_cols, errors='ignore')
kaggle_X = kaggle_X.drop(columns=constant_cols, errors='ignore')

In [None]:
if use_topics:
    # Topic labels are stored in a separate file; they are cast to strings
    # before being attached to X as the 'topic' column.
    topics_path = os.path.join(project_base,  'data', 'WikiLarge_Train_With_Topics.csv')
    topic_frame = pd.read_csv(topics_path)
    topics = topic_frame['topic'].astype(str)
    # NOTE(review): the assignment aligns on the index, so this presumes the
    # topics file has the same row order as X -- confirm.
    X['topic'] = topics

### Split into Test and Train Sets

In [None]:
# Partition features and targets into train/test subsets with a fixed seed
# (split proportions are left at train_test_split's defaults).
split_parts = train_test_split(X, y, random_state=RANDOM_SEED)
X_train, X_test, y_train, y_test = split_parts

print(
    f'train set size: {X_train.shape}',
    f'test set size: {X_test.shape}',
    sep='\n',
)

### Preprocessing

In [None]:
# Columns routed to the numeric transformer.
num_cols = ['word_count',
 'dale_chale_overlap_count',
 'Nletters_max',
 'Nphon_max',
 'Nsyll_max',
 'AoA_Kup_max',
 'Perc_known_max',
 'AoA_Kup_lem_max',
 'Nletters_avg',
 'Nphon_avg',
 'Nsyll_avg',
 'AoA_Kup_avg',
 'Perc_known_avg',
 'AoA_Kup_lem_avg',
 'Conc.M_max',
 'Conc.SD_max',
 'Total_max',
 'Percent_known_max',
 'SUBTLEX_max',
 'Conc.M_avg',
 'Conc.SD_avg',
 'Total_avg',
 'Percent_known_avg',
 'SUBTLEX_avg',
 'Unknown_max', 
 'Unknown_avg']

# The word-vector PCA columns only exist for the 'wv_pca_df' table. Branch on
# the config value instead of catching NameError, so a stale `wv_col_names`
# left in the kernel by an earlier run cannot leak into a 'use_simple_df' run.
if df_type == 'wv_pca_df':
    num_cols += wv_col_names
else:
    print('WVs not used')

# Always define cat_cols: it used to be left undefined when use_topics was
# False, which made the lines below fail with NameError.
cat_cols = ['topic'] if use_topics else []
bool_cols = []

declared_cols = num_cols + cat_cols + bool_cols

print(f'Column total count: {len(declared_cols)}')
print(f'X column count: {X_train.shape[1]}')

# make sure we didn't miss any columns; the message names the offenders
unlisted_cols = [c for c in X_train.columns if c not in declared_cols]
absent_cols = [c for c in declared_cols if c not in X_train.columns]
assert len(declared_cols) == X_train.shape[1], (
    f'Declared {len(declared_cols)} columns but X_train has {X_train.shape[1]}; '
    f'in X_train but not declared: {unlisted_cols}; '
    f'declared but not in X_train: {absent_cols}'
)

In [None]:
# scikit-learn renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and later removed the old name, so a hard-coded `sparse=False` raises
# TypeError on current releases. Probe once for the supported keyword so the
# notebook runs on both old and new versions.
try:
    OneHotEncoder(sparse_output=False)
except TypeError:
    dense_ohe_kwargs = {'sparse': False}
else:
    dense_ohe_kwargs = {'sparse_output': False}

# Numeric features: fill gaps with the median, then standardise.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical features: fill gaps with the literal 'missing', then one-hot
# encode; categories unseen at fit time are ignored at transform time.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', **dense_ohe_kwargs))
])

# Boolean features: fill gaps with the most frequent value, then one-hot encode
# dropping the first level; unseen values raise an error.
bool_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='error', drop='first', **dense_ohe_kwargs))
])

# Route each column group to its transformer; output column order follows the
# order of this list (num, cat, bool).
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols),
    ('bool', bool_transformer, bool_cols)
])

# Fit on the training split only, so no statistics are learned from test data.
preprocessor.fit(X_train)

In [None]:
if cat_cols:
    # Rebuild named DataFrames from the transformed arrays; named columns are
    # mostly useful for feature-importance libraries (SHAP, LIME).
    fitted_ohe = preprocessor.named_transformers_['cat'].named_steps['ohe']
    ohe_cats = fitted_ohe.categories_

    # One output name per (categorical column, category level) pair.
    new_ohe_features = [
        f"{col}__{val}"
        for col, levels in zip(cat_cols, ohe_cats)
        for val in levels
    ]
    all_features = num_cols + new_ohe_features + bool_cols

    train_matrix = preprocessor.transform(X_train)
    X_train_processed = pd.DataFrame(
        train_matrix, columns=all_features, index=X_train.index
    ).astype(float)

    test_matrix = preprocessor.transform(X_test)
    X_test_processed = pd.DataFrame(
        test_matrix, columns=all_features, index=X_test.index
    ).astype(float)
else:
    # Without categorical columns the raw transformed arrays are kept, and the
    # Kaggle set is transformed as well.
    X_train_processed = preprocessor.transform(X_train)
    X_test_processed = preprocessor.transform(X_test)
    X_kaggle_processed = preprocessor.transform(kaggle_X)
    
# 
# y_train = pd.DataFrame(y_train, columns=['target_variable'], index=y_train.index).astype(float)
# y_test = pd.DataFrame(y_test, columns=['target_variable'], index=y_test.index).astype(float)

In [None]:
# Write the processed feature tables to data/cleaned_data as CSV (no index column).
cleaned_dir = os.path.join(project_base, 'data', 'cleaned_data')

if not use_topics:
    # The processed splits are wrapped in DataFrames labelled with the numeric
    # column names before being written out.
    processed_train_df = pd.DataFrame(X_train_processed, columns=num_cols)
    processed_train_df.to_csv(os.path.join(cleaned_dir, 'processed_train_data.csv'), index=False)

    processed_test_df = pd.DataFrame(X_test_processed, columns=num_cols)
    processed_test_df.to_csv(os.path.join(cleaned_dir, 'processed_test_data.csv'), index=False)

    processed_kaggle_df = pd.DataFrame(X_kaggle_processed, columns=num_cols)
    processed_kaggle_df.to_csv(os.path.join(cleaned_dir, 'processed_kaggle_data.csv'), index=False)
else:
    # With topics enabled the processed splits are written directly, under
    # "_w_topics" file names.
    X_train_processed.to_csv(os.path.join(cleaned_dir, 'processed_train_data_w_topics.csv'), index=False)

    X_test_processed.to_csv(os.path.join(cleaned_dir, 'processed_test_data_w_topics.csv'), index=False)

    # TODO: will need to add topics for the kaggle test set if topics improve
    # model performance; until then no Kaggle file is written in this branch.


In [None]:
# Flatten each target container into a 1-D numpy array.
y_train, y_test, kaggle_y = (
    np.array(labels).reshape(-1) for labels in (y_train, y_test, kaggle_y)
)

In [None]:
# Write each flattened target vector to data/cleaned_data as a single-column
# CSV named 'actual_outcome' (no index column).
target_outputs = (
    ('processed_train_y.csv', y_train),
    ('processed_test_y.csv', y_test),
    ('processed_kaggle_y.csv', kaggle_y),
)

for target_file, labels in target_outputs:
    outcome_df = pd.DataFrame({'actual_outcome': np.array(labels).reshape(-1)})
    outcome_df.to_csv(
        os.path.join(project_base, 'data', 'cleaned_data', target_file),
        index=False,
    )