<h1><center>Data Preprocessing</center></h1>

## 1. Train/Test Split
- Sklearn

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the samples as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=21,
)

## 2. Cross Validation

In [1]:
from sklearn.model_selection import cross_val_score

In [None]:
# LinearRegression is imported here because no earlier cell brings it into
# scope (the previous spelling, `LinearRegession`, raised a NameError).
from sklearn.linear_model import LinearRegression

# Evaluate an ordinary least-squares model with 5-fold cross-validation.
# No `scoring` argument is given, so each fold is scored with the estimator's
# own `score` method (R^2 for LinearRegression, per the scikit-learn docs);
# cv_results holds one score per fold.
reg = LinearRegression()
cv_results = cross_val_score(reg, X, y, cv=5)

In [None]:
# Average the per-fold scores into a single summary figure.
print(cv_results.mean())

## 3. Feature Selection
### 3.1 Regularization I: Lasso

In [None]:
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

# The `normalize=True` option of scikit-learn's linear models was deprecated in
# version 1.0 and removed in 1.2, so the feature scaling is done explicitly.
lasso_scaler = StandardScaler()
X_scaled = lasso_scaler.fit_transform(X)

# `normalize=True` divided every centred column by its L2 norm, which equals
# std * sqrt(n_samples). On unit-variance columns the same L1 penalty therefore
# corresponds to alpha * sqrt(n_samples); 0.4 is the alpha used previously.
n_samples = X_scaled.shape[0]

# Instantiate a lasso regressor: lasso
lasso = Lasso(alpha=0.4 * np.sqrt(n_samples))

# Fit the regressor to the standardised data
lasso.fit(X_scaled, y)

# Compute and print the coefficients. Dividing by the per-feature scale
# expresses them per original feature unit, which is what `coef_` reported
# when `normalize=True` was in use.
lasso_coef = lasso.coef_ / lasso_scaler.scale_
print(lasso_coef)

# Plot the coefficients, one tick per feature.
# NOTE(review): `plt` and `df_columns` come from outside this cell; df_columns
# is presumably the feature names in the same order as X's columns - confirm.
plt.plot(range(len(df_columns)), lasso_coef)
plt.xticks(range(len(df_columns)), df_columns.values, rotation=60)
plt.margins(0.02)
plt.show()

### 3.2 Regularization II: Ridge

In [None]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# `Ridge(normalize=True)` is no longer accepted (deprecated in scikit-learn 1.0,
# removed in 1.2). Standardising inside a pipeline keeps the scaling statistics
# restricted to each training fold, as the old option did.
ridge = Ridge()
ridge_model = make_pipeline(StandardScaler(), ridge)
alpha_space = np.logspace(-4, 0, 50)

# Accumulators for the mean and spread of the CV scores, one entry per alpha.
# They must exist before the loop appends to them.
ridge_scores = []
ridge_scores_std = []

# `normalize=True` divided each centred column by std * sqrt(n_train), so on
# unit-variance columns the equivalent L2 penalty is alpha * n_train.
# n_train is the (approximate) number of rows in each training fold; the true
# fold sizes can differ from this by at most one row.
n_folds = 10
n_train = np.shape(X)[0] * (n_folds - 1) / n_folds

# Compute scores over range of alphas
for alpha in alpha_space:

    # Specify the alpha value to use: ridge.alpha
    # (cross_val_score clones the pipeline, picking up the current value)
    ridge.alpha = alpha * n_train

    # Perform 10-fold CV: ridge_cv_scores
    ridge_cv_scores = cross_val_score(ridge_model, X, y, cv=n_folds)

    # Append the mean of ridge_cv_scores to ridge_scores
    ridge_scores.append(np.mean(ridge_cv_scores))

    # Append the std of ridge_cv_scores to ridge_scores_std
    ridge_scores_std.append(np.std(ridge_cv_scores))


### 3.3 Elastic Net

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: 30 evenly spaced candidate values of 'l1_ratio' in [0, 1].
# An 'l1_ratio' of 1 corresponds to a pure L1 penalty; anything lower mixes
# L1 and L2.
l1_space = np.linspace(0, 1, num=30)
param_grid = dict(l1_ratio=l1_space)

# Base estimator whose 'l1_ratio' is tuned by the search
elastic_net = ElasticNet()

# Grid search with 5-fold cross-validation: gm_cv
gm_cv = GridSearchCV(estimator=elastic_net, param_grid=param_grid, cv=5)

# Run the search on the training data only
gm_cv.fit(X_train, y_train)

## 4. Dummy Variables

In [None]:
# Create dummy variables: df_region
df_region = pd.get_dummies(df)

# Print the columns of df_region
print(df_region.columns)

# Create dummy variables with drop_first=True: df_region
df_region = pd.get_dummies(df, drop_first=True)

# Print the new columns of df_region
print(df_region.columns)

## 5. Handling Missing Data

In [None]:
# Convert '?' to NaN
df[df == '?'] = np.nan

# Print the number of NaNs
print(df.isnull().sum())

In [None]:
# Drop missing data
df = df.dropna()

In [None]:
# Replace with mean
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(X)
X = imp.transform(X)

## 6. Scaling and Normalizing

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature scaling statistics from X, then overwrite X with its
# scaled version. `fit` returns the scaler itself, so the calls can be chained.
scaler = StandardScaler()
X = scaler.fit(X).transform(X)

## 7. Pipeline 

In [None]:
from sklearn.pipeline import Pipeline
# SVC is imported here because no earlier cell brings it into scope.
from sklearn.svm import SVC

# Two-step pipeline: scale the features, then fit a support vector classifier.
# The step names are the prefixes used in the hyperparameter grid below.
steps = [('scaler', StandardScaler()),
         ('SVM', SVC())]

pipeline = Pipeline(steps)

# Specify the hyperparameter space.
# '<step name>__<parameter>' addresses a parameter of one pipeline step, so
# these entries tune C and gamma of the 'SVM' step.
parameters = {'SVM__C': [1, 10, 100],
              'SVM__gamma': [0.1, 0.01]}

# Exhaustive search over the grid, using GridSearchCV's default
# cross-validation setting (no `cv` argument is passed).
cv = GridSearchCV(pipeline, param_grid=parameters)

# Fit to the training set
cv.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = cv.predict(X_test)

## 8. One-Hot Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Create the one-hot encoder. Per the scikit-learn docs, handle_unknown='ignore'
# makes transform() encode categories that were not seen during fit() as
# all-zero rows instead of raising an error.
enc = OneHotEncoder(handle_unknown='ignore')

In [None]:
# Learn the set of categories present in y.
# NOTE(review): OneHotEncoder expects 2-D input of shape (n_samples, n_features).
# If y is a 1-D label vector it must be reshaped to a single column first -
# confirm y's shape against the cell that defines it.
enc.fit(y)

In [None]:
# Encode y with the fitted encoder and convert the result to a dense array via
# .toarray() (the transform output is presumably a sparse matrix, the encoder's
# default - confirm for the installed scikit-learn version).
enc.transform(y).toarray()

## 9. Data Augmentation

In [None]:
from keras.preprocessing.image import ImageDataGenerator

In [None]:
# Image generator shared by the training and validation flows.
# NOTE(review): val_generator is built from this same object, so the validation
# subset presumably passes through the same random rotations/flips - confirm
# that this is intended.
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_image,  # defined outside this cell
    rescale=1 / 128.,
    rotation_range=360,
    horizontal_flip=True,
    vertical_flip=True,
    validation_split=0.15,  # share reserved for subset='validation'
)

In [None]:
# Training batches: file names are read from df_train['id_code'] and targets
# from df_train['diagnosis'], with images looked up under `directory`.
# subset='training' selects the part of the data left over after the
# generator's validation_split.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=df_train,
    directory='../input/aptos2019-blindness-detection/train_images',
    x_col='id_code',
    y_col='diagnosis',
    target_size=(512, 512),
    batch_size=BATCH_SIZE,
    class_mode='raw',
    subset='training',
)

In [None]:
# Validation batches: same dataframe, columns, image directory and batch
# settings as the training flow, but subset='validation' selects the share
# reserved by the generator's validation_split.
val_generator = train_datagen.flow_from_dataframe(
    dataframe=df_train,
    directory='../input/aptos2019-blindness-detection/train_images',
    x_col='id_code',
    y_col='diagnosis',
    target_size=(512, 512),
    batch_size=BATCH_SIZE,
    class_mode='raw',
    subset='validation',
)