# Health Prediction

## Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import Linear Regression machine learning library
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb

from utils import DFImputer, DummyTransformer, DFLabelEncoder, DateFormatter, DateTransformer

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import (
    r2_score, mean_absolute_error, mean_squared_error
)
from IPython.display import Image
from subprocess import call

import os
import seaborn as sns
sns.set(rc={'figure.figsize':(13,6)})



## Load data

In [2]:
journey9 = pd.read_hdf('journey9.hdf', 'journey9')
journey9.tail().T

Unnamed: 0,11955160,11955161,11955162,11955163,11955164
expiry_principal_subscription_id,999342700,999533724,999582292,999640265,999746566
subscription_expiry_month,2020-01-31,2020-10-31,2020-10-31,2020-01-31,2020-11-30
subscription_registration_month,2017-01-31,2016-10-31,2016-10-31,2016-08-31,2016-11-30
subscription_expirying_in_month,3,3,3,3,3
Journey,9,9,9,9,9
months_since_registration,36,48,48,41,48
subscription_expiry_transition_type,Renew,Trial,Renew,Renew,Renew
subscription_expiry_length,1 Year(s),1 Year(s),1 Year(s),1 Year(s),1 Year(s)
subscription_expiry_affiliate_id,885,1274,1067,1264,1067
subscription_expiry_affiliate_name,ASUS,Retail - General,Hewlett Packard Consumer,Office Depot - TechDepot Services,Hewlett Packard Consumer


## Basic Stats

In [3]:
journey9.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11955165 entries, 0 to 11955164
Data columns (total 40 columns):
 #   Column                                       Dtype  
---  ------                                       -----  
 0   expiry_principal_subscription_id             int64  
 1   subscription_expiry_month                    object 
 2   subscription_registration_month              object 
 3   subscription_expirying_in_month              float64
 4   Journey                                      float64
 5   months_since_registration                    int64  
 6   subscription_expiry_transition_type          object 
 7   subscription_expiry_length                   object 
 8   subscription_expiry_affiliate_id             int64  
 9   subscription_expiry_affiliate_name           object 
 10  subscription_expiry_channel                  object 
 11  subscription_expiry_transaction_source       object 
 12  subscription_expiry_package_id               int64  
 13  subscripti

In [4]:
train = journey9[journey9["subscription_expiry_month"] == "2020-10-31"].sample(100000)
test = journey9[journey9["subscription_expiry_month"] == "2020-11-30"].sample(100000)

In [5]:
train.shape, test.shape

((100000, 40), (100000, 40))

### Value Counts

In [6]:
for col in train:
    vc = train[col].value_counts(normalize = True)
    l = len(vc)
    print(f"\n{'*'* 20}\n{col}. {l} unique values\n{'*' *20}")
    if l > 10:
        print(f"Showing top 10 items\n{'-'* 20}")
        print(vc.iloc[:10])
    else:
        print(vc)


********************
expiry_principal_subscription_id. 98919 unique values
********************
Showing top 10 items
--------------------
1640717545    0.00230
1264371254    0.00016
1303667786    0.00008
1252881270    0.00007
1640061315    0.00007
1494114129    0.00007
1175732690    0.00006
519211282     0.00006
1476770956    0.00006
1491666010    0.00006
Name: expiry_principal_subscription_id, dtype: float64

********************
subscription_expiry_month. 1 unique values
********************
2020-10-31    1.0
Name: subscription_expiry_month, dtype: float64

********************
subscription_registration_month. 125 unique values
********************
Showing top 10 items
--------------------
2019-10-31    0.35346
2018-10-31    0.15333
2017-10-31    0.08412
2016-10-31    0.06786
2015-10-31    0.04633
2014-10-31    0.03012
2019-09-30    0.02798
2018-09-30    0.02761
2018-08-31    0.02670
2019-01-31    0.02655
Name: subscription_registration_month, dtype: float64

********************
su

NOTES: Based on above stats^^
- expiry_principal_subscription_id - ID Column
- subscription_expiry_month - Month level Date Feature. Only 1 value. Retain for future purposeNot Needed now
- subscription_registration_month (160 unique values)- Month level Date Feature
- subscription_expirying_in_month - Complement of 'Journey' ==> remove
- months_since_registration (will correlate with subscription_registration_month since subscription_expiry_month is the same) 
- subscription_expiry_length - Only 1 value. Retain for future purposes
- subscription_expiry_affiliate_id. 244 unique values 
- subscription_expiry_affiliate_name - Remove. Redundant w/ aff id
- subscription_expiry_package_name - Remove. Redundant w/ pkg id
- opt_out_date - Remove. 
- opt_in_date - Remove
________________________
- 0_month_renewal - target variable
- 1_month_renewal - Ancillary target varibale. Remove for now


## Preprocessing

In [7]:
ID_COLS = ["expiry_principal_subscription_id"]
TARGET_COL = "0_month_renewal"
REM_COLS = ["1_month_renewal",
            "opt_out_date",
            "opt_in_date",
            "subscription_expirying_in_month",
            "subscription_expiry_affiliate_name",
            "subscription_expiry_package_name",
            "subscription_expirying_in_month",
            ]

MONTH_DATE_FEATURES = [
    "subscription_registration_month", "subscription_expiry_month"]

NUM_COLS = [
    NUM_COL
    for NUM_COL in train.head().select_dtypes('number').columns
    if NUM_COL not in REM_COLS + ID_COLS + MONTH_DATE_FEATURES + [TARGET_COL]
]
OBJ_COLS = [
    OBJ_COL for OBJ_COL in train.head().select_dtypes('object').columns
    if OBJ_COL not in REM_COLS + ID_COLS + MONTH_DATE_FEATURES + [TARGET_COL]
]

In [8]:
print(f"\nNumerical Columns:\n{NUM_COLS}")
print(f"\nObject Columns:\n{OBJ_COLS}")
print(f"\n Date Columns:\n{MONTH_DATE_FEATURES}")


Numerical Columns:
['Journey', 'months_since_registration', 'subscription_expiry_affiliate_id', 'subscription_expiry_package_id', 'ar_on_rundate', 'All_device_count', 'PC_device_count', 'Mobile_device_count', 'cui_actions', 'av_users', 'app_launch_users', 'appboost_users', 'vpn_users', 'shredder_users', 'quickclean_users', 'wbStatus', 'waStatus', 'non_av_users', 'no_of_features']

Object Columns:
['subscription_expiry_transition_type', 'subscription_expiry_length', 'subscription_expiry_channel', 'subscription_expiry_transaction_source', 'pkg_license_cnt', 'subscription_expiry_user_click_source_group', 'subscription_expiring_country', 'subscription_expiring_geo', 'Opt_in_out_status', 'Lost_type', 'renewal_Package_group']

 Date Columns:
['subscription_registration_month', 'subscription_expiry_month']


In [9]:
X_train, y_train = train.drop(columns=TARGET_COL), train[TARGET_COL]
X_test, y_test = test.drop(columns=TARGET_COL), test[TARGET_COL]

### Feature Pipelines

In [10]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.impute import SimpleImputer


In [11]:
num_processor = Pipeline([
    ('mean_imputer', DFImputer(strategy='constant', fill_value=0)),
])

obj_processor = Pipeline(
    [
        ('str_imputer', DFImputer(strategy="most_frequent", missing_values= None)),
        ('dummify', DFLabelEncoder())
    ]
)

date_processor = Pipeline(
    [
        ('date_convert', DateFormatter()),
        ('date_add_field', DateTransformer())
    ]
)

preprocess_pipeline = ColumnTransformer(
    [
        ('date_proc', date_processor, MONTH_DATE_FEATURES),
        ('num_proc', num_processor, NUM_COLS),
        ('obj_proc', obj_processor, OBJ_COLS)
    ],
    remainder='drop'
)


Hey there! Today I'm gonna impute. My strategy is gonna be constant and my fillvalue is 0
Hey there! Today I'm gonna impute. My strategy is gonna be most_frequent and my fillvalue is nothing


In [12]:
obj_processor.fit_transform(train[OBJ_COLS].iloc[:1]).tail().T

ValueError: Input contains NaN

# Models

## Baseline Model

In [None]:
model_pipe = Pipeline(steps=[
    ('preprocess', preprocess_pipeline),
    ('model', RandomForestClassifier(n_jobs = -1))
]).fit(X_train, y_train);