# Preprocessing

In [3]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import plot_confusion_matrix, auc, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from imblearn.over_sampling import SMOTE
from collections import Counter

# Load EDA.csv (presumably the output of the earlier EDA step — confirm) into df.
eda_csv_path = '../../../data/EDA.csv'
df = pd.read_csv(eda_csv_path)

# Preprocessing

## Clarifying important predictor values (x) for the target value (y).

In [4]:
# Columns kept out of the predictors: the targets (rating, rating_class) plus
# the other columns listed here, including the specific bean origin.
excluded_columns = [
    'ref',
    'company_manufacturer',
    'company_location',
    'review_date',
    'country_of_bean_origin',
    'specific_bean_origin_or_bar_name',
    'cocoa_percent',
    'ingredients',
    'most_memorable_characteristics',
    'rating',
    'rating_class',
]
features = df.drop(columns=excluded_columns)

# X holds the predictors, y the class label to predict.
X = features
y = df['rating_class']


### Splitting the data into three subsets of training and validation data for the future models.
Two successive train/test splits produce three subsets of the original dataset: training, validation, and test. Keeping the test set separate prevents its information from leaking into training, which reduces the model's bias toward data it has already seen and gives a more reliable estimate of performance on future data the model has never been exposed to.

In [7]:
# First split: hold out 15% of the rows as the final test set.
X_tr, X_test, y_tr, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

# Second split: take 15% of the remaining (non-test) rows as the validation
# set, leaving three subsets overall: train, validation and test.
X_train, X_val, y_train, y_val = train_test_split(
    X_tr,
    y_tr,
    test_size=0.15,
    random_state=42,
)

In [6]:
# Sanity check: X_tr and y_tr should report the same number of rows after the first split.
(X_tr.shape, y_tr.shape)

((1896, 8), (1896,))

# Replacing any existing missing values

In [None]:
# Count missing values per column in the validation predictors
# (the original note records 88 rows with no ingredients input).
X_val.isnull().sum()

In [None]:
# Missing values are replaced with each column's most frequent value.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Learn the fill values from the training split only, then apply them.
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns = X.columns)

# Apply the fill values learned on the training split. Re-fitting on the
# validation split would fill it from its own statistics, leaking validation
# information into preprocessing and making it inconsistent with training.
X_val_imputed = pd.DataFrame(imputer.transform(X_val), columns = X.columns)

In [None]:
# Confirm that no column of the imputed validation frame still has missing values.
X_val_imputed.isnull().sum()

### Separating groups into numeric and categorical data types

In [None]:
# Inspect each column's dtype and non-null count to tell numeric features from categorical ones.
X_train.info()

### One Hot Encode Categorical Features 

In [8]:
# One-hot encoder returning a dense array; handle_unknown='ignore' lets
# categories not seen during fit be transformed without raising an error.
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [10]:
# Fit the encoder on the training predictors only, then apply the same fitted
# encoding to both the training and validation predictors.
ohe.fit(X_train)
X_train_encoded = ohe.transform(X_train)
X_val_encoded = ohe.transform(X_val)

# Wrap the encoded arrays in labelled frames. Each frame reuses its split's
# own index: train_test_split keeps the original (shuffled) row labels, and a
# default RangeIndex would make label-based operations such as
# pd.concat(axis=1) with X_train / X_val pair up the wrong rows and add NaNs.
encoded_columns = ohe.get_feature_names()
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_columns, index=X_train.index)
X_val_encoded_df = pd.DataFrame(X_val_encoded, columns=encoded_columns, index=X_val.index)

In [11]:
# Place the original predictor columns and their one-hot encoded columns side by side.
# pd.concat along columns matches rows by index label, so each pair of frames
# must carry the same index for the rows to line up.
X_train_df = pd.concat((X_train, X_train_encoded_df), axis='columns')
X_val_df = pd.concat((X_val, X_val_encoded_df), axis='columns')

In [13]:
# Remove the original (un-encoded) bin columns from both combined frames.
binned_columns = [
    'review_date_bin',
    'bean_origin_bins',
    'comp_manufact_bin',
    'ingredient_list',
    'cocoa_bucket',
]
X_train_df = X_train_df.drop(columns=binned_columns)
X_val_df = X_val_df.drop(columns=binned_columns)

In [14]:
# Write df to feature_engineering.csv without the index column.
# NOTE(review): in the cells visible here df is never reassigned, so this saves
# the loaded data rather than the encoded X_train_df / X_val_df frames —
# confirm that this is the intended output.
output_csv_path = '../../../data/feature_engineering.csv'
df.to_csv(output_csv_path, index=False)