Claims Project

## Infrastructure
- VSH: Amazon EC2
- Availability zone: us-east-1d
- Instance type: C3
- Application: Compute Optimized
- Model: c3.2xlarge
- vCPU: 8
- Mem (GiB): 15
- SSD Storage (GB): 2x80
- Operating system: Windows Server 2012 R2
- Platform: Anaconda2 (64-bit)
- Environment: Jupyter Notebook

### Features:
- High Frequency Intel Xeon E5-2680 v2 (Ivy Bridge) Processors
- Support for Enhanced Networking
- Support for clustering
- SSD-backed instance storage

## Coding

In [1]:
# Import libraries

# Warnings
import warnings
warnings.filterwarnings("ignore") 

# Basics
import pandas as pd
import numpy as np
from time import time

# Transform and scale
from scipy.stats import skew, boxcox
from sklearn.preprocessing import StandardScaler

# Cross validation
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer

# Regressors
from sklearn.tree import DecisionTreeRegressor
# from sklearn import linear_model
from sklearn import ensemble


  from numpy.core.umath_tests import inner1d


In [2]:
### CODING TIME 1: LOAD THE DATA

import pandas as pd
import numpy as np
from time import time

# Start the wall-clock timer for the load step.
start = time()

### TODO: Load train and test dataframes from CSV files.
### Define the filepath and the index column.
### Call the variables 'train' and 'test'.
# NOTE: the `???` placeholders are exercise blanks; this cell raises a
# SyntaxError until they are filled in. For reference, a later cell reads
# 'data/test.csv' with index_col='id'.
train = pd.read_csv(???, index_col=???)
test = pd.read_csv(???, index_col=???)

end = time()
# Python 2 print statement: elapsed load time, one decimal, plus a blank line.
print "Load CSVs in {:.1f} seconds.\n".format(end - start)

SyntaxError: invalid syntax (<ipython-input-2-0890007ab2fd>, line 12)

In [None]:
# Print the shape of the train set, then of the test set
for frame in (train, test):
    print(frame.shape)

In [None]:
# Print the column names of the train set and of the test set,
# separated by an empty line
print(train.columns.tolist())
print("")
print(test.columns.tolist())

In [None]:
### CODING TIME 1: LOAD THE DATA

# Define variables
# Target: a column selected from the training frame (exercise blank).
loss = train[???]

### TODO: After defining the loss variable as the loss column of the dataframe, define the features variable.
### Call the new variable 'features'.
# The two blanks are the arguments of DataFrame.drop.
features = train.drop(???, ???)

### TODO: Concatenate the training and test set to the same DataFrame.
### Call the new variable 'train_test'
# NOTE(review): later cells select rows back out of `train_test` with
# `features.index` and `test.index`, so the two frames concatenated here are
# presumably `features` and `test` -- confirm.
train_test = pd.concat((???, ???))

# Split the column names by dtype: every column of the combined frame whose
# dtype is not "object" is treated as numeric ...
numeric_features = list(train_test.dtypes[train_test.dtypes != "object"].index)
# ... and whatever remains in `features` after dropping those columns is
# treated as categorical. `axis` is passed by keyword because newer pandas
# releases no longer accept it positionally in DataFrame.drop.
categorical_features = list(features.drop(numeric_features, axis=1))

In [None]:
# Summary statistics of the combined features, then of the target
for title, frame in (("Train_Test Data Set Describe", train_test),
                     ("\nLoss Describe", loss)):
    print(title)
    print(frame.describe())

In [None]:
### CODING TIME 2: UNDERSTAND THE DATA

# Print Skew

### TODO: Show the skew of each numerical feature for the train_test set and the loss.
### Tip: The skew method will ignore categorical features, if present
# The same blank (a method name) is called on the combined frame and on the
# target; each result is printed under its own heading.
print "Train_Test Data Set Skew"
print train_test.???()
print "\nLoss Skew"
print loss.???()

In [None]:
### CODING TIME 2: UNDERSTAND THE DATA

# Numerical Features Visualization

# Import plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Create a dataframe with only numerical features
# (column subset of the combined train+test frame)
data = train_test[numeric_features] 

### TODO: Get the names of the columns
### Call the variable 'cols'
# `cols` is indexed by position in the plotting loop below, so the blank must
# yield something that supports integer indexing.
cols = data.??? 

# Plot violins for all attributes, two per figure row. The number of rows is
# derived from the number of columns instead of being hard-coded, so the loop
# neither runs past the end of `cols` nor skips trailing columns when the
# feature count is not exactly 14 (14 columns still give the original 7x2 grid).
n_cols = 2
n_rows = (len(cols) + n_cols - 1) // n_cols  # ceiling division

### TODO: Show violin plots for all numerical features
for i in range(n_rows):
    # One figure per row of the grid, with n_cols side-by-side axes.
    fg, ax = plt.subplots(nrows=1, ncols=n_cols, figsize=(12, 8))
    for j in range(n_cols):
        idx = i * n_cols + j
        if idx < len(cols):
            sns.violinplot(y=cols[idx], data=data, ax=ax[j])
        else:
            # Last row may be only partially filled: hide the unused axes.
            ax[j].set_visible(False)

In [None]:
### CODING TIME 3: UNDERSTAND THE DATA (continue)

# Categorical Features Visualization

# Column subset of the combined frame holding only the categorical features.
data = train_test[categorical_features]
cols = data.columns

# Count plots for all categorical features, four per figure row. The number
# of rows is derived from the number of columns instead of being hard-coded,
# so the loop neither runs past the end of `cols` nor skips trailing columns
# when the feature count is not exactly 116 (116 columns still give the
# original 29x4 grid).
n_cols = 4
n_rows = (len(cols) + n_cols - 1) // n_cols  # ceiling division

### TODO: Show count plots for all categorical features
for i in range(n_rows):
    # One figure per row of the grid; the axes in a row share the y scale.
    fg, ax = plt.subplots(nrows=1, ncols=n_cols, sharey=True, figsize=(12, 8))
    for j in range(n_cols):
        idx = i * n_cols + j
        if idx < len(cols):
            sns.countplot(x=cols[idx], data=data, ax=ax[j])
        else:
            # Last row may be only partially filled: hide the unused axes.
            ax[j].set_visible(False)

In [None]:
### CODING TIME 4: THE TARGET VARIABLE

### TODO: Show violin plots for the target variable

# Loss Visualization
# The "???" string is an exercise blank: it must be replaced by the name of
# the column of `train` to plot on the y axis.
sns.violinplot(data=train,y="???")
plt.show()

In [None]:
### CODING TIME 5: PRE-PROCESSING THE DATA

# Transform data

from scipy.stats import boxcox

### TODO: Apply the boxcox function for all numerical features
### which are skewed more than 0.25 or less than -0.25
def aply_boxcox(df, cols, fact):
    """Transform the selected columns of ``df`` in place (exercise template).

    The ``???`` blanks are left for the reader. Per the TODO preceding this
    function, the intent is to apply Box-Cox to the numerical features whose
    skew exceeds ``fact`` in absolute value.

    Parameters
    ----------
    df : DataFrame modified in place.
    cols : column labels of ``df`` to examine.
    fact : threshold compared against the absolute per-column statistic.

    Returns
    -------
    None. Only ``df`` is changed.
    """
    # One statistic per column in `cols`, computed by the blank call.
    skewed_feats = df[cols].apply(lambda x: ???(???))
    # Keep only the labels of the columns that pass the threshold test.
    skewed_feats = skewed_feats[abs(???) > fact].index
    for feat in skewed_feats:
        # The column is shifted by +1 before the transform (presumably to keep
        # values strictly positive -- confirm). The second returned value,
        # `lam`, is not used afterwards.
        df[feat], lam = ???(df[feat]+1)

### TODO: Apply pandas.factorize to all categorical features
### You may want to improve it in the future to make it one hot encoding
def factorize_features(df, cols):
    """Re-encode each column in ``cols`` of ``df`` in place (exercise template).

    The ``???`` blanks are left for the reader. Per the TODO preceding this
    function, the intended call is ``pandas.factorize``.

    Parameters
    ----------
    df : DataFrame modified in place.
    cols : column labels of ``df`` to re-encode.

    Returns
    -------
    None. Only ``df`` is changed.
    """
    for col in cols:
        # One element of the call's result (selected by the last blank)
        # replaces the original column.
        df[col] = pd.???(???, sort=???)[???]

start = time()

# Order matters below: both helper calls modify `train_test` in place, and
# the row selections must read the already-transformed frame.
# Transform numeric columns, using 0.25 as the threshold.
aply_boxcox(train_test, numeric_features, 0.25)
# Box-Cox on the target; `loss_lam` is kept because the prediction cell needs
# it to map predictions back to the original scale.
loss, loss_lam = boxcox(loss)
factorize_features(train_test, categorical_features)
# Split the combined frame back into train and test rows by index label.
# NOTE(review): assumes train and test index labels do not overlap -- confirm.
train_rows = train_test.loc[features.index]
test_rows = train_test.loc[test.index]

end = time()
print "Transform data in {:.1f} seconds.".format(end - start)

In [None]:
# Spot-check a single column of the combined frame (accessed as an attribute);
# presumably one of the re-encoded categorical features -- confirm.
print(train_test.cat100)

In [None]:
### CODING TIME 6: SCALING THE DATA

# Scale data
from sklearn.preprocessing import StandardScaler

### TODO: Fit the scaler on the train_test data
### TODO: Transform the train and test data sets using the fitted scaler
### TODO: Also scale the loss variable

def scale_data(X, scaler=None):
    """Scale ``X`` and return it together with the scaler used (exercise template).

    The ``???`` blanks are left for the reader.

    Parameters
    ----------
    X : data passed to the scaler.
    scaler : optional existing scaler. When it is falsy (the default ``None``),
        a new one is created and set up on ``X`` before ``X`` is transformed.

    Returns
    -------
    (X, scaler) : the output of the final scaler call and the scaler itself,
        so the caller can reuse it on other data.
    """
    if not scaler:
        # No scaler supplied: build one and prepare it using X itself.
        scaler = ???()
        scaler.???(X)
    X = scaler.???(X)
    return X, scaler

start = time()

# Build one scaler from the combined train+test features; the scaled copy
# returned alongside it is discarded (`_`).
_, scaler = scale_data(train_test)
# Reuse that scaler for both subsets. `train` and `test` are rebound here from
# the raw DataFrames to the scaler output.
train, _ = scale_data(train_rows, scaler)
test, _ = scale_data(test_rows, scaler)
# No scaler is passed, so a separate one is built for the target (reshaped to
# a single column). `scaler` is rebound to this target scaler on purpose: the
# prediction cell later calls scaler.inverse_transform on the model output.
y, scaler = scale_data(loss.reshape(-1, 1))

end = time()
print "Scale data in {:.1f} seconds.".format(end - start)

In [None]:
### CODING TIME 7: DEFINING THE PARAMETERS

# Define Grid Search Function
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV

### TODO: Create an R2 scorer to be used in the grid search.
### TODO: Use a ShuffleSplit object to define the cross-validation method for the Grid Search
### TODO: Create the Grid Search object and return it

def fit_model(X, y, estim, params):
    """Run a cross-validated grid search and return the search object (exercise template).

    The ``???`` blanks are left for the reader.

    Parameters
    ----------
    X : feature data handed to the search.
    y : target data handed to the search.
    estim : estimator to tune (presumably goes in the ``estimator=`` blank -- confirm).
    params : parameter grid (presumably goes in the ``param_grid=`` blank -- confirm).

    Returns
    -------
    grid : whatever the final ``grid.???(X, y)`` call returns; the calling
        cells read ``best_score_`` and ``best_estimator_`` from it.
    """
    # Scorer wrapping a metric function; both the metric and its direction
    # are blanks.
    score = make_scorer(???, greater_is_better=???)
    # 10 random train/validation splits, 20% held out in each, fixed seed so
    # the splits are reproducible.
    sss = ShuffleSplit(n_splits = 10, test_size = 0.20, random_state = 86401)
    # n_jobs=6 asks for up to six parallel jobs.
    grid = GridSearchCV(estimator=???, param_grid=???, scoring=???, cv=???, n_jobs=6)
    grid = grid.???(X, y)
    return grid

In [None]:
### CODING TIME 8: DIFFERENT ML ALGORITHMS
### TODO: Fit Bayesian Ridge

from sklearn.linear_model import BayesianRidge

start = time()

bay = BayesianRidge()
# Grid over the iteration limit only.
# NOTE(review): `n_iter` is the parameter name in older scikit-learn releases;
# newer ones call it `max_iter` -- confirm against the installed version.
param_bay = {'n_iter':[2, 4, 8]}
# The four blanks are fit_model's arguments: (X, y, estim, params).
bay_grid = fit_model(???, ???, ???, ???)

end = time()
print "BAY grid search in {:.1f} seconds.".format(end - start)
print "BAY best score:  {:.4f}".format(bay_grid.best_score_)
print bay_grid.best_estimator_

In [None]:
### CODING TIME 8: DIFFERENT ML ALGORITHMS
### TODO: Fit GradientBoostingRegressor

start = time()

gbr = ensemble.GradientBoostingRegressor()
# Wider grid, currently disabled in favour of the single combination below:
# param_gbr = {'n_estimators' : [25,50, 200], 'max_depth' : [6, 8, 10, 12], 'random_state' : [864]}
param_gbr = {'n_estimators' : [25], 'max_depth' : [10], 'random_state' : [864]}
# The four blanks are fit_model's arguments: (X, y, estim, params).
gbr_grid = fit_model(???, ???, ???, ???)

end = time()
print "GBR grid search in {:.1f} seconds.".format(end - start)
print "GBR best score:  {:.4f}".format(gbr_grid.best_score_)
print gbr_grid.best_estimator_

In [None]:
# inv_boxcox also handles lambda == 0, where the hand-written
# exp(log(lam*y + 1) / lam) formula would divide by zero.
from scipy.special import inv_boxcox

start = time()

# Predict on the scaled test features with the best estimator of the search.
y_gbr = gbr_grid.best_estimator_.predict(test)
# `scaler` is the one fitted on the target column. Scalers expect 2-D input,
# so the 1-D predictions are reshaped to a single column for the inverse
# transform and flattened again afterwards.
y_gbr = scaler.inverse_transform(np.asarray(y_gbr).reshape(-1, 1)).ravel()
# Undo the Box-Cox transform applied to the target, using the lambda that
# boxcox() returned for it.
y_gbr = inv_boxcox(y_gbr, loss_lam)

end = time()
print("GBR predict in {:.1f} seconds.".format(end - start))

In [None]:
# Re-read the raw test file, indexed by 'id'; only its index is used below to
# label the submission rows (presumably because `test` was rebound to the
# scaler output in the scaling cell and no longer carries the ids -- confirm).
test_raw = pd.read_csv('data/test.csv', index_col='id')

In [None]:
# Assemble the submission table in one step: the test ids followed by the
# predicted loss, in that column order, then write it without the row index.
df = pd.DataFrame(
    {'id': list(test_raw.index), 'loss': list(y_gbr)},
    columns=['id', 'loss'],
)
df.to_csv('data/submission.csv', index=False)

In [None]:

# Distribution of the predicted loss: violin plot, then summary statistics

predicted_summary = pd.DataFrame({'predicted': y_gbr}).describe()

sns.violinplot(data=y_gbr)
plt.show()
print(predicted_summary)