## Capstone 2 Machine Learning - Logistic Regression

Unable to create an effective model using Linear Regression.  Multinomial Logistic Regression appeared to be another means of predicting tier selections as the selections are technically agregated sums of tier choices.

In [1]:
# Import Tools
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Normalizer

# Make plots pretty.
sns.set()

In [2]:
# Instantiate Regressor, Steps, and Pipeline with Newtpm-CG Solver
n_log = LogisticRegression(multi_class='multinomial', solver='newton-cg', random_state=42, max_iter=200,
                        n_jobs = 4)
n_steps = [('scaler',StandardScaler()), ('log',n_log)]
n_pipe = Pipeline(n_steps)

# Instantiate Regressor, Steps, and Pipeline with SAGA Solver
saga_log = LogisticRegression(multi_class='multinomial', solver='saga', random_state=42, max_iter=300,
                        n_jobs = 4)
saga_steps = [('scaler',StandardScaler()), ('log', saga_log)]
saga_pipe = Pipeline(saga_steps)


***
# Values - All Scaled to Percent of Total Plan Selections

In [11]:
# Read in Data
dfap = pd.read_csv('Combo_All_Percents.csv')
dfap.drop('Unnamed: 0', axis=1, inplace=True)
dfap.head()

Unnamed: 0,cfc,state,county,0-17,18-25,26-34,35-44,45-54,55-64,65+,...,platinum,gold,silver,bronze,catastrophic,new,active_renew,auto_renew,tps,Year
0,12109,FL,St. Johns County,0.1149,0.0997,0.1429,0.1536,0.2148,0.2703,0.0037,...,0.1,0.05,0.66,0.18,0.01,0.536,0.3452,0.1188,10960.0,2015
1,10001,DE,Kent County,0.1209,0.0831,0.1548,0.1551,0.2256,0.2552,0.0053,...,0.03,0.15,0.63,0.18,0.0,0.5423,0.1917,0.266,3417.0,2015
2,10003,DE,New Castle County,0.1312,0.0852,0.1662,0.1559,0.2197,0.2334,0.0084,...,0.05,0.16,0.59,0.2,0.01,0.498,0.2232,0.2789,14967.0,2015
3,10005,DE,Sussex County,0.1194,0.0656,0.121,0.1397,0.2185,0.3314,0.0045,...,0.04,0.17,0.63,0.16,0.0,0.5124,0.2302,0.2574,6651.0,2015
4,12001,FL,Alachua County,0.0571,0.1699,0.215,0.1468,0.1769,0.2312,0.0032,...,0.09,0.03,0.72,0.15,0.01,0.5066,0.3442,0.1492,12719.0,2015


In [12]:
# Create the X and y for the model with individual y's for each metallic tier.

# Our X Dropping the Metallic Tier Targets, String Data, and Categorical Year
Xap = dfap.drop(['cfc','state','county','platinum','gold','silver','bronze','catastrophic', 'Year'],
              axis=1).values

# Individual y's for Each Target Data
ycatap = (dfap.catastrophic.values * 100).astype(int)
ybroap = (dfap.bronze.values *100).astype(int)
ysilap = (dfap.silver.values * 100).astype(int)
ygoldap = (dfap.gold.values * 100).astype(int)
yplatap = (dfap.platinum.values * 100).astype(int)

# Create List of Target Values for Percent Data and set up Taget Names
ysap=[ycatap, ybroap, ysilap, ygoldap, yplatap]
ys_name = ['Catastrophic','Bronze','Silver','Gold','Platinum']

## Newton - CG - Solver

In [5]:
# Let's take a look at our predictive capabilites of the Logistic Regressor

# Catastropic Tier
X_train, X_test, y_train, y_test = train_test_split(Xap,ycatap, test_size=.3, random_state=42)
n_log.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_log.predict(X_train))
test_acc = accuracy_score(y_test, n_log.predict(X_test))
print('Catastrophic Training Accuracy: {}'.format(train_acc))
print('Catastrophic Test Accuracy: {}'.format(test_acc))

Catastrophic Training Accuracy: 0.7134300126103404
Catastrophic Test Accuracy: 0.7235294117647059


In [6]:
# Bronze Tier
X_train, X_test, y_train, y_test = train_test_split(Xap,ybroap, test_size=.3, random_state=42)
n_log.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_log.predict(X_train))
test_acc = accuracy_score(y_test, n_log.predict(X_test))
print('Bronze Training Accuracy: {}'.format(train_acc))
print('Bronze Test Accuracy: {}'.format(test_acc))

Bronze Training Accuracy: 0.09079445145018916
Bronze Test Accuracy: 0.08161764705882353


In [7]:
# Silver Tier
X_train, X_test, y_train, y_test = train_test_split(Xap,ysilap, test_size=.3, random_state=42)
n_log.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_log.predict(X_train))
test_acc = accuracy_score(y_test, n_log.predict(X_test))
print('Silver Training Accuracy: {}'.format(train_acc))
print('Silver Test Accuracy: {}'.format(test_acc))

Silver Training Accuracy: 0.057692307692307696
Silver Test Accuracy: 0.052941176470588235


In [8]:
# Gold Tier
X_train, X_test, y_train, y_test = train_test_split(Xap,ygoldap, test_size=.3, random_state=42)
n_log.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_log.predict(X_train))
test_acc = accuracy_score(y_test, n_log.predict(X_test))
print('Gold Training Accuracy: {}'.format(train_acc))
print('Gold Test Accuracy: {}'.format(test_acc))

Gold Training Accuracy: 0.35718789407314
Gold Test Accuracy: 0.34705882352941175


In [9]:
# Platinum Tier
X_train, X_test, y_train, y_test = train_test_split(Xap,yplatap, test_size=.3, random_state=42)
n_log.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_log.predict(X_train))
test_acc = accuracy_score(y_test, n_log.predict(X_test))
print('Platinum Training Accuracy: {}'.format(train_acc))
print('Platinum Test Accuracy: {}'.format(test_acc))

Platinum Training Accuracy: 0.8114754098360656
Platinum Test Accuracy: 0.8213235294117647


## Scaled SAGA - Solver

In [10]:
# Catastropic Tier - Scaled
X_train, X_test, y_train, y_test = train_test_split(Xap,ycatap, test_size=.3, random_state=42)
saga_pipe.fit(X_train, y_train)
train_acc = accuracy_score(y_train, saga_pipe.predict(X_train))
test_acc = accuracy_score(y_test, saga_pipe.predict(X_test))
print('Catastrophic Training Accuracy: {}'.format(train_acc))
print('Catastrophic Test Accuracy: {}'.format(test_acc))

Catastrophic Training Accuracy: 0.7616645649432535
Catastrophic Test Accuracy: 0.7411764705882353




In [11]:
# Bronze Tier - Scaled
X_train, X_test, y_train, y_test = train_test_split(Xap,ybroap, test_size=.3, random_state=42)
saga_pipe.fit(X_train, y_train)
train_acc = accuracy_score(y_train, saga_pipe.predict(X_train))
test_acc = accuracy_score(y_test, saga_pipe.predict(X_test))
print('Bronze Training Accuracy: {}'.format(train_acc))
print('Bronze Test Accuracy: {}'.format(test_acc))

Bronze Training Accuracy: 0.17118537200504413
Bronze Test Accuracy: 0.09411764705882353




Platinum and Catastrophic Plans have a much more reasonable accuracy score.  With a little hyperparameter turning the accuracies could be increased further.  The SAGA Solver is faster than Newton-CG and provides slightly better accuracy as well.
***
# Values - Target Values Scaled as Percent of Total Plan Selection

In [3]:
# Read in Data
dfp = pd.read_csv('Combo_Percents.csv')
dfp.drop('Unnamed: 0', axis=1, inplace=True)
dfp.head()

Unnamed: 0,cfc,state,county,0-17,18-25,26-34,35-44,45-54,55-64,65+,...,platinum,gold,silver,bronze,catastrophic,new,active_renew,auto_renew,tps,Year
0,12109,FL,St. Johns County,1259.0,1093.0,1566.0,1684.0,2354.0,2963.0,41.0,...,0.1,0.05,0.66,0.18,0.01,5875.0,3783.0,1302.0,10960.0,2015
1,10001,DE,Kent County,413.0,284.0,529.0,530.0,771.0,872.0,18.0,...,0.03,0.15,0.63,0.18,0.0,1853.0,655.0,909.0,3417.0,2015
2,10003,DE,New Castle County,1964.0,1275.0,2487.0,2333.0,3288.0,3494.0,126.0,...,0.05,0.16,0.59,0.2,0.01,7453.0,3340.0,4174.0,14967.0,2015
3,10005,DE,Sussex County,794.0,436.0,805.0,929.0,1453.0,2204.0,30.0,...,0.04,0.17,0.63,0.16,0.0,3408.0,1531.0,1712.0,6651.0,2015
4,12001,FL,Alachua County,726.0,2161.0,2734.0,1867.0,2250.0,2940.0,41.0,...,0.09,0.03,0.72,0.15,0.01,6443.0,4378.0,1898.0,12719.0,2015


In [4]:
# Create the X and y for the model with individual y's for each metallic tier.

# Our X Dropping the Metallic Tier Targets, String Data, Categorical Year, and TPS
Xp = dfp.drop(['cfc','state','county','platinum','gold','silver','bronze','catastrophic', 'Year',
               'tps'],axis=1).values

# Individual y's for Each Target Data
ycatp = (dfp.catastrophic.values * 100).astype(int)
ybrop = (dfp.bronze.values *100).astype(int)
ysilp = (dfp.silver.values * 100).astype(int)
ygoldp = (dfp.gold.values * 100).astype(int)
yplatp = (dfp.platinum.values * 100).astype(int)

# Create List of Target Values for Percent Data
ysp=[ycatp, ybrop, ysilp, ygoldp, yplatp]

## Newton - CG Solver

In [14]:
# Catastropic Tier Unscaled
X_train, X_test, y_train, y_test = train_test_split(Xp,ycatp, test_size=.3, random_state=42)
n_log.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_log.predict(X_train))
test_acc = accuracy_score(y_test, n_log.predict(X_test))
print('Catastrophic Training Accuracy: {}'.format(train_acc))
print('Catastrophic Test Accuracy: {}'.format(test_acc))

Catastrophic Training Accuracy: 0.7676544766708702
Catastrophic Test Accuracy: 0.7507352941176471


In [15]:
# Catastropic Tier Scaled
X_train, X_test, y_train, y_test = train_test_split(Xp,ycatp, test_size=.3, random_state=42)
n_pipe.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_pipe.predict(X_train))
test_acc = accuracy_score(y_test, n_pipe.predict(X_test))
print('Catastrophic Training Accuracy: {}'.format(train_acc))
print('Catastrophic Test Accuracy: {}'.format(test_acc))

Catastrophic Training Accuracy: 0.7528373266078184
Catastrophic Test Accuracy: 0.7419117647058824


In [16]:
# Bronze Tier Not Scaled
X_train, X_test, y_train, y_test = train_test_split(Xp,ybrop, test_size=.3, random_state=42)
n_log.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_log.predict(X_train))
test_acc = accuracy_score(y_test, n_log.predict(X_test))
print('Bronze Training Accuracy: {}'.format(train_acc))
print('Bronze Test Accuracy: {}'.format(test_acc))

Bronze Training Accuracy: 0.24495586380832282
Bronze Test Accuracy: 0.1426470588235294


In [17]:
# Bronze Tier Scaled
X_train, X_test, y_train, y_test = train_test_split(Xp,ybrop, test_size=.3, random_state=42)
n_pipe.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_pipe.predict(X_train))
test_acc = accuracy_score(y_test, n_pipe.predict(X_test))
print('Bronze Training Accuracy: {}'.format(train_acc))
print('Bronze Test Accuracy: {}'.format(test_acc))

Bronze Training Accuracy: 0.10907944514501891
Bronze Test Accuracy: 0.0838235294117647


## Scaled SAGA Solver

In [18]:
# Catastropic Tier - Scaled
X_train, X_test, y_train, y_test = train_test_split(Xp,ycatp, test_size=.3, random_state=42)
saga_pipe.fit(X_train, y_train)
train_acc = accuracy_score(y_train, saga_pipe.predict(X_train))
test_acc = accuracy_score(y_test, saga_pipe.predict(X_test))
print('Catastrophic Training Accuracy: {}'.format(train_acc))
print('Catastrophic Test Accuracy: {}'.format(test_acc))

Catastrophic Training Accuracy: 0.7405422446406053
Catastrophic Test Accuracy: 0.7426470588235294




In [19]:
# Bronze Tier - Scaled
X_train, X_test, y_train, y_test = train_test_split(Xp,ybrop, test_size=.3, random_state=42)
saga_pipe.fit(X_train, y_train)
train_acc = accuracy_score(y_train, saga_pipe.predict(X_train))
test_acc = accuracy_score(y_test, saga_pipe.predict(X_test))
print('Bronze Training Accuracy: {}'.format(train_acc))
print('Bronze Test Accuracy: {}'.format(test_acc))

Bronze Training Accuracy: 0.08259773013871374
Bronze Test Accuracy: 0.07352941176470588




# Values - Not Scaled as Percent of Total Plan Selection

In [20]:
# Read in Data
df = pd.read_csv('Combo_Cleaned.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0,cfc,state,county,0-17,18-25,26-34,35-44,45-54,55-64,65+,...,platinum,gold,silver,bronze,catastrophic,new,active_renew,auto_renew,tps,Year
0,12109,FL,St. Johns County,1259.0,1093.0,1566.0,1684.0,2354.0,2963.0,41.0,...,1128.0,594.0,7209.0,1967.0,62.0,5875.0,3783.0,1302.0,10960.0,2015
1,10001,DE,Kent County,413.0,284.0,529.0,530.0,771.0,872.0,18.0,...,107.0,528.0,2136.0,629.0,17.0,1853.0,655.0,909.0,3417.0,2015
2,10003,DE,New Castle County,1964.0,1275.0,2487.0,2333.0,3288.0,3494.0,126.0,...,679.0,2406.0,8835.0,2943.0,104.0,7453.0,3340.0,4174.0,14967.0,2015
3,10005,DE,Sussex County,794.0,436.0,805.0,929.0,1453.0,2204.0,30.0,...,257.0,1144.0,4198.0,1032.0,20.0,3408.0,1531.0,1712.0,6651.0,2015
4,12001,FL,Alachua County,726.0,2161.0,2734.0,1867.0,2250.0,2940.0,41.0,...,1188.0,429.0,9169.0,1848.0,85.0,6443.0,4378.0,1898.0,12719.0,2015


In [21]:
# Create our X and y for the model with individual y's for each metallic tier.

# Our X Dropping the Metallic Tier Targets, String Data, and Categorical Year
X = df.drop(['cfc','state','county','platinum','gold','silver','bronze','catastrophic', 'Year'], axis=1).values

# Individual y's for Each Target Data
ycat = df.catastrophic.values
ybro = df.bronze.values
ysil = df.silver.values
ygold = df.gold.values
yplat = df.platinum.values

# Create List of Target Values and Names of Target Values
ys=[ycat, ybro, ysil, ygold, yplat]

In [22]:
# Catastropic Tier Unscaled
X_train, X_test, y_train, y_test = train_test_split(X,ycat, test_size=.3, random_state=42)
n_log.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_log.predict(X_train))
test_acc = accuracy_score(y_test, n_log.predict(X_test))
print('Catastrophic Training Accuracy: {}'.format(train_acc))
print('Catastrophic Test Accuracy: {}'.format(test_acc))

Catastrophic Training Accuracy: 0.7691338582677165
Catastrophic Test Accuracy: 0.6252755326965467


In [23]:
# Catastropic Tier Scaled
X_train, X_test, y_train, y_test = train_test_split(X,ycat, test_size=.3, random_state=42)
n_pipe.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_pipe.predict(X_train))
test_acc = accuracy_score(y_test, n_pipe.predict(X_test))
print('Catastrophic Training Accuracy: {}'.format(train_acc))
print('Catastrophic Test Accuracy: {}'.format(test_acc))

Catastrophic Training Accuracy: 0.7105511811023622
Catastrophic Test Accuracy: 0.6539309331373989


Not using scaled data produces weaker results.  Will not investigate using this data for our model further.

***

Overall the best model used the Targets by Percent of `Total Plan Selections (TPS)` dataset with the `Newton-CG` solver.  Since a Linear Regression model can be built for Bronze, Silver, Gold plans, the Logistic Regression model should be built for the Platinum and Catastrophic plans (as it is best suited for those two plans.
<br>
Fine tuning our current models may help.  All features seem to play a role in helping to provide better predictions.  Using LogisticRegressionCV might assist in tuning each model with the best hyperparameters.

# Fine Tuning Models

In [5]:
# Import Logistic Regression CV
from sklearn.linear_model import LogisticRegressionCV

In [6]:
# Instantiate new Normalizer Pipeline
n_steps_norm = [('scaler',Normalizer()), ('log',n_log)]
n_pipe_norm = Pipeline(n_steps_norm)

In [7]:
#  Instantiate New Logistic Regression Models and Pipelines

# Instantiate Regressor, Steps, and Pipeline with Newtpm-CG Solver
n_log_cv = LogisticRegressionCV(multi_class='multinomial', solver='newton-cg', random_state=42,
                                max_iter=400,n_jobs = 4)
n_steps_cv = [('scaler',StandardScaler()), ('log',n_log_cv)]
n_pipe_cv = Pipeline(n_steps_cv)

# Instantiate Regressor, Steps, and Pipeline with SAGA Solver
saga_log_cv = LogisticRegressionCV(multi_class='multinomial', solver='saga', random_state=42,
                                   max_iter=300,n_jobs = 4)
saga_steps_cv = [('scaler',StandardScaler()), ('log', saga_log_cv)]
saga_pipe_cv = Pipeline(saga_steps_cv)


## Newton - CG Solver

In [9]:
# Catastropic Tier Unscaled
X_train, X_test, y_train, y_test = train_test_split(Xp,ycatp, test_size=.3, random_state=42)
n_log.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_log.predict(X_train))
test_acc = accuracy_score(y_test, n_log.predict(X_test))
print('Catastrophic Training Accuracy: {}'.format(train_acc))
print('Catastrophic Test Accuracy: {}'.format(test_acc))

Catastrophic Training Accuracy: 0.7676544766708702
Catastrophic Test Accuracy: 0.7507352941176471


In [10]:
# Catastropic Tier Scaled with Standard Scaler
X_train, X_test, y_train, y_test = train_test_split(Xp,ycatp, test_size=.3, random_state=42)
n_pipe.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_pipe.predict(X_train))
test_acc = accuracy_score(y_test, n_pipe.predict(X_test))
print('Catastrophic Training Accuracy: {}'.format(train_acc))
print('Catastrophic Test Accuracy: {}'.format(test_acc))

Catastrophic Training Accuracy: 0.7528373266078184
Catastrophic Test Accuracy: 0.7419117647058824


In [14]:
# Catastropic Tier Scaled with Normalizer
X_train, X_test, y_train, y_test = train_test_split(Xp,ycatp, test_size=.3, random_state=42)
n_pipe_norm.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_pipe_norm.predict(X_train))
test_acc = accuracy_score(y_test, n_pipe_norm.predict(X_test))
print('Catastrophic Training Accuracy: {}'.format(train_acc))
print('Catastrophic Test Accuracy: {}'.format(test_acc))

Catastrophic Training Accuracy: 0.7005044136191677
Catastrophic Test Accuracy: 0.7058823529411765


In [11]:
# Platinum Tier Unscaled
X_train, X_test, y_train, y_test = train_test_split(Xp,yplatp, test_size=.3, random_state=42)
n_log.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_log.predict(X_train))
test_acc = accuracy_score(y_test, n_log.predict(X_test))
print('Platinum Training Accuracy: {}'.format(train_acc))
print('Platinum Test Accuracy: {}'.format(test_acc))

Platinum Training Accuracy: 0.8307061790668349
Platinum Test Accuracy: 0.8080882352941177


In [12]:
# Platinum Tier - Scaled with Standard Scaler
X_train, X_test, y_train, y_test = train_test_split(Xp,yplatp, test_size=.3, random_state=42)
n_pipe.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_pipe.predict(X_train))
test_acc = accuracy_score(y_test, n_pipe.predict(X_test))
print('Platinum Training Accuracy: {}'.format(train_acc))
print('Platinum Test Accuracy: {}'.format(test_acc))

Platinum Training Accuracy: 0.8256620428751577
Platinum Test Accuracy: 0.8227941176470588


In [15]:
# Platinum Tier Scaled with Normalizer
X_train, X_test, y_train, y_test = train_test_split(Xp,yplatp, test_size=.3, random_state=42)
n_pipe_norm.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_pipe_norm.predict(X_train))
test_acc = accuracy_score(y_test, n_pipe_norm.predict(X_test))
print('Platinum Training Accuracy: {}'.format(train_acc))
print('Platinum Test Accuracy: {}'.format(test_acc))

Platinum Training Accuracy: 0.812421185372005
Platinum Test Accuracy: 0.8176470588235294


## Logistic Regression CV (Newton-CG and SAGA solvers)


In [None]:
# Silver Tier Unscaled With CV
X_train, X_test, y_train, y_test = train_test_split(Xp,ysilp, test_size=.3, random_state=42)
n_log_cv.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_log_cv.predict(X_train))
test_acc = accuracy_score(y_test, n_log_cv.predict(X_test))
print('Silver Training Accuracy: {}'.format(train_acc))
print('Silver Test Accuracy: {}'.format(test_acc))



The weakness in my current models is likely due to the high number of target classifications within the data.  Let's look at the number of unique targets in each tier to test this hypothesis.

In [12]:
test = dfp[['catastrophic','bronze','silver','gold','platinum']]

As seen above, the differences in strengths of models is clearly correlated with the number of unique targets in each tier.  If we were to round each target to significantly reduce the number of unique targets in the data, the models should improve dramatically.

In [13]:
test.nunique()

catastrophic     6
bronze          57
silver          67
gold            44
platinum        16
dtype: int64

Strong Linear Regression models were built for Silver, Bronze and Gold tiers.  Logistic Regression seems to be working well for the remaining Catastrophic and Platinum tiers.  Next step will be to bin the tiers for Catastrophic and Platinum plans so that we can use Logistic Regression CV.

In [10]:
np.unique(ycatp, return_counts=True)

(array([0, 1, 2, 3, 4, 5]),
 array([3176,  918,  363,   60,    9,    6], dtype=int64))

In [8]:
np.unique(yplatp, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]),
 array([3689,  277,  197,  153,   66,   42,   25,   26,   13,   14,   10,
           7,    5,    3,    3,    2], dtype=int64))

For Logistic Regression CV to work, we need to have at least 3 values for each target value.  That isn't a problem for the Catastrophic tier.  However, it is for the Platinum tier.  Since 15% is the highest amount, reducing it to 14% is acceptable.  This is because a company would want to advertise platinum plans in counties containing 13% or more anyway.

In [8]:
# Convert 15% to 14% in the platinum targets
for n in range(len(yplatp)):
    if yplatp[n] == 15:
        yplatp[n]=14
np.unique(yplatp, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
 array([3689,  277,  197,  153,   66,   42,   25,   26,   13,   14,   10,
           7,    5,    3,    5], dtype=int64))

In [9]:
np.unique(yplatp, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
 array([3689,  277,  197,  153,   66,   42,   25,   26,   13,   14,   10,
           7,    5,    3,    5], dtype=int64))

In [20]:
# Catastrophic Tier Unscaled With CV - Newton - CG
X_train, X_test, y_train, y_test = train_test_split(Xp,ycatp, test_size=.3, random_state=42)
n_log_cv.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_log_cv.predict(X_train))
test_acc = accuracy_score(y_test, n_log_cv.predict(X_test))
print('Catastrophic Training Accuracy: {}'.format(train_acc))
print('Catastrophic Test Accuracy: {}'.format(test_acc))

Catastrophic Training Accuracy: 0.7676544766708702
Catastrophic Test Accuracy: 0.7514705882352941




In [27]:
# Catastrophic Tier Scaled With CV Newton - CG
X_train, X_test, y_train, y_test = train_test_split(Xp,ycatp, test_size=.3, random_state=42)
n_pipe_cv.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_pipe_cv.predict(X_train))
test_acc = accuracy_score(y_test, n_pipe_cv.predict(X_test))
print('Catastrophic Training Accuracy: {}'.format(train_acc))
print('Catastrophic Test Accuracy: {}'.format(test_acc))

Catastrophic Training Accuracy: 0.7673392181588903
Catastrophic Test Accuracy: 0.7522058823529412


In [22]:
# Catastrophic Tier Scaled With CV SAGA
X_train, X_test, y_train, y_test = train_test_split(Xp,ycatp, test_size=.3, random_state=42)
saga_pipe_cv.fit(X_train, y_train)
train_acc = accuracy_score(y_train, saga_pipe_cv.predict(X_train))
test_acc = accuracy_score(y_test, saga_pipe_cv.predict(X_test))
print('Catastrophic Training Accuracy: {}'.format(train_acc))
print('Catastrophic Test Accuracy: {}'.format(test_acc))



Catastrophic Training Accuracy: 0.7440100882723834
Catastrophic Test Accuracy: 0.7455882352941177


In [10]:
# Platinum Tier Unscaled With CV - Newton - CG
X_train, X_test, y_train, y_test = train_test_split(Xp,yplatp, test_size=.2, random_state=42)
n_log_cv.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_log_cv.predict(X_train))
test_acc = accuracy_score(y_test, n_log_cv.predict(X_test))
print('Platinum Training Accuracy: {}'.format(train_acc))
print('Platinum Test Accuracy: {}'.format(test_acc))

Platinum Training Accuracy: 0.8297931034482758
Platinum Test Accuracy: 0.8324145534729879




In [11]:
# Platinum Tier Scaled With CV - Newton - CG
X_train, X_test, y_train, y_test = train_test_split(Xp,yplatp, test_size=.2, random_state=42)
n_pipe_cv.fit(X_train, y_train)
train_acc = accuracy_score(y_train, n_pipe_cv.predict(X_train))
test_acc = accuracy_score(y_test, n_pipe_cv.predict(X_test))
print('Platinum Training Accuracy: {}'.format(train_acc))
print('Platinum Test Accuracy: {}'.format(test_acc))

Platinum Training Accuracy: 0.8297931034482758
Platinum Test Accuracy: 0.8335170893054025


In [14]:
# Platinum Tier Scaled With CV - SAGA
X_train, X_test, y_train, y_test = train_test_split(Xp,yplatp, test_size=.2, random_state=42)
saga_pipe_cv.fit(X_train, y_train)
train_acc = accuracy_score(y_train, saga_pipe_cv.predict(X_train))
test_acc = accuracy_score(y_test, saga_pipe_cv.predict(X_test))
print('Platinum Training Accuracy: {}'.format(train_acc))
print('Platinum Test Accuracy: {}'.format(test_acc))




Platinum Training Accuracy: 0.8146206896551724
Platinum Test Accuracy: 0.834619625137817


The best model seems to be LogisticRegressionCV scaled using the Newton-CG Solver for both Catastrophic and Platinum tiers.  