# Basic Modeling

- This notebook aims to try a basic model based on data after feature engineering

- A True End-to-End ML Example: Lead Scoring | by Adam Barnhard | Towards Data Science
https://towardsdatascience.com/a-true-end-to-end-ml-example-lead-scoring-f5b52e9a3c80

In [43]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

In [23]:
data = pd.read_csv('../../../JanOct16_trimmed.csv', index_col = 0, nrows = 100000)
indiv = pd.read_csv('individuals.csv', index_col = 0)
indiv_valid = pd.read_csv('individuals_test.csv', index_col = 0)

In [24]:
pd.set_option('display.max_columns', 500)

In [25]:
data.head()

Unnamed: 0,Unnamed: 0.1,STORE_ID,TRAN_ID,DATE,ARTICLE_ID,INDIV_ID,VEHICLE_ID,UNITS,SALES,STATE_CODE,ZIP_CODE,MSA,MAKE,MODEL,SUB_MODEL,MODEL_YEAR,PROD_GROUP_CODE,PROD_GROUP_DESC,CATEGORY_CODE,CATEGORY_DESC,SEGMENT_CODE,SEGMENT_DESC,CLASS_CODE,CLASS_DESC,DISCOUNT_FLAG,CROSS_SECTION,ASPECT_RATIO,RIM_SIZE,EMAIL_OPTIN_IND,AH1_RES_BUS_INDC,SUPP1_BUS_PANDER
0,0,20869,991975080,2016-01-17,7013632,277902102.0,945291489,0.0,51.96,TN,37221,"NASHVILLE,",GMC,K1500 SUBURBAN SLE,SLE,1997.0,4.0,Services,62.0,Tire Services,91.0,Tire Services,147.0,Balance,N,,,,Y,R,N
1,1,20869,991975080,2016-01-17,7015016,277902102.0,945291489,0.0,0.0,TN,37221,"NASHVILLE,",GMC,K1500 SUBURBAN SLE,SLE,1997.0,4.0,Services,62.0,Tire Services,91.0,Tire Services,139.0,Tire Mounting,N,,,,Y,R,N
2,2,20869,991975080,2016-01-17,7097782,277902102.0,945291489,0.0,63.8,TN,37221,"NASHVILLE,",GMC,K1500 SUBURBAN SLE,SLE,1997.0,5.0,Tires,20158.0,Road Hazard,20159.0,Road Hazard,20160.0,Road Hazard,N,NONE,NONE,NONE,Y,R,N
3,3,20869,991975080,2016-01-17,7004578,277902102.0,945291489,0.0,84.99,TN,37221,"NASHVILLE,",GMC,K1500 SUBURBAN SLE,SLE,1997.0,4.0,Services,83.0,Steering/Suspension/Drivetrain,96.0,Alignments,49083.0,Alignment Service,N,,,,Y,R,N
4,4,20869,991975080,2016-01-17,2809,277902102.0,945291489,4.0,373.96,TN,37221,"NASHVILLE,",GMC,K1500 SUBURBAN SLE,SLE,1997.0,5.0,Tires,26.0,Passenger Tires,30.0,P Metric Light Truck Tires,42.0,All Terrain Tires,N,265,70,16,Y,R,N


In [26]:
indiv.shape, indiv_valid.shape

((49402, 14), (6650, 14))

## For the original dataframe 

~ we will divide features into numeric, categorical and response variables

In [27]:
# leads_categorical_columns = ['cust_in_top_10_sales', 'store_top_tran',
#        'store_top_tires_tran', 'vehicle_tire_svc', 'vhc_tire_purchase',
#        'vhc_early_tire_purchase', 'top_cust_overall', 'top_cust_tire_purch',
#        'indiv_make_tire_pch', 'make_prop_tire', 'MODEL_prop_tire',
#        'SUB_MODEL_prop_tire']

# consider adding those columns to the indiv dataset 
# leads_numeric_columns = ['SALES',
#                          'UNITS',
#                          'MODEL_YEAR']


leads_response_columns = ['label']

In [28]:
# df = pd.concat([data[leads_numeric_columns], data[leads_categorical_columns], data[leads_response_columns]], axis = 1)

## Splitting outcome and features

In [29]:
leads_x_train = indiv.drop(leads_response_columns, axis=1)

leads_y_train = indiv[leads_response_columns]

leads_x_valid = indiv_valid.drop(leads_response_columns, axis=1)

leads_y_valid = indiv_valid[leads_response_columns]

## scaling numerical columns

##### consider this after adding the numerical columns

In [30]:
# scaler = StandardScaler()
# scaler = scaler.fit(leads_x_train[leads_numeric_columns])

## pre modeling adjustments 

- create df with selected columns
- Use the fitted scaler to center and scale the numeric columns   -- TO BE DONE!
- Turn categorical variables into one-hot encoded variables
- Ensure that all columns from the training dataset are also in the outputted, processed dataset (This is important so that all levels of dummy variables are created, even if the dataset we import doesn’t have each individual level.)

In [31]:
# for later use 

# def pre_process_leads_data(df,
#                            numeric_columns,
#                            categorical_columns,
#                            fitted_scaler,
#                            train_df_columns = None):
#     ## create new df with selected columns
# #     df.columns = map(str.lower, df.columns)
#     _df = df[set(numeric_columns + categorical_columns)].copy()
    
#     ## scale the numeric columns with the pre-built scaler
#     _df[numeric_columns] = fitted_scaler.transform(_df[numeric_columns])
         
#     # First, make categorical text lowercase
#     _df[categorical_columns] = _df[categorical_columns].apply(lambda x: x.str.lower())
#     # Next, create one-hot-encoded variables, add to dataframe, drop old columns
#     _df_dummies = pd.get_dummies(_df[categorical_columns], drop_first=True)
#     _df = pd.concat([_df, _df_dummies], axis=1)
#     _df.drop(categorical_columns, axis=1, inplace = True)

#     if train_df_columns:
#         _df = _df.reindex(columns=train_df_columns, fill_value=0)

#     return _df

In [32]:
# leads_x_train_clean = pre_process_leads_data(df = leads_x_train,
# #                                              fitted_scaler = scaler
# #                                             numeric_columns = leads_numeric_columns,
#                                             categorical_columns = leads_categorical_columns
#                                             )

# leads_x_test_clean = pre_process_leads_data(df = leads_x_test,
#                                            numeric_columns = leads_numeric_columns,
#                                            categorical_columns = leads_categorical_columns,
#                                            fitted_scaler = scaler,
#                                            train_df_columns = leads_x_train_clean.columns.tolist())


## Model training 

In [33]:
## Train the random forest model
num_estimators = 5
min_samples = 4

rf = RandomForestClassifier(n_estimators=num_estimators,
                            min_samples_split=min_samples)
rf.fit(leads_x_train.iloc[:, 1:], leads_y_train.values.ravel())

RandomForestClassifier(min_samples_split=4, n_estimators=5)

## Testing the model

will use it once I get my test data 

In [49]:
leads_y_valid_predicted = rf.predict(leads_x_valid.iloc[:, 1:])

accuracy = accuracy_score(leads_y_valid, leads_y_valid_predicted)
# auc_score = metrics.roc_auc_score(leads_y_test, leads_y_test_predicted)
recall = recall_score(leads_y_valid, leads_y_valid_predicted)
# precision = precision_score(leads_y_valid, leads_y_valid_predicted)

print(accuracy, recall)

# print(auc_score)

0.9264661654135339 0.0


# we have to balance the dataset