<a href="https://colab.research.google.com/github/johnherr/rapids/blob/master/RAPIDS_8_21_Denver_Meetup_tutorial_Black_Friday.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAPIDS 8/21 Denver Meetup @ Galvanize

In [0]:
# Fetch the RAPIDS-on-Colab install script; -nc skips the download when the file already exists.
!wget -nc https://github.com/rapidsai/notebooks-extended/raw/master/utils/rapids-colab.sh
# Run the install script.
!bash rapids-colab.sh
# Download the training data archive and store it as train.zip.
!wget https://datahack-prod.s3.amazonaws.com/train_zip/train_oSwQCTC.zip -O train.zip
import sys, os

# Make this site-packages directory importable from the running kernel
# (presumably where rapids-colab.sh installs the RAPIDS packages -- confirm against the script).
sys.path.append('/usr/local/lib/python3.6/site-packages/')
# Point Numba at the CUDA NVVM library and the libdevice directory.
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'

In [0]:
!ls
!unzip train.zip

In [0]:
import cuml
import cudf
import nvcategory

import xgboost as xgb
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.metrics import mean_squared_error, roc_auc_score

In [0]:
#Read the extracted CSV straight into GPU memory as a cuDF DataFrame.
gdf = cudf.read_csv('train.csv') #assumes train.csv (extracted from train.zip by the unzip cell) is in the working directory

In [0]:
#Taking a look at the first rows of the data. We use "to_pandas()" to get the pretty printing.
gdf.head().to_pandas()

In [0]:
#Exercise: Let's do some descriptive statistics 

In [0]:
#Hint: try some of the functions you may know from Pandas, like Series.max(), or look up the cuDF documentation: https://docs.rapids.ai/api/cudf/stable/

In [0]:
#Keep only the first character of Stay_In_Current_City_Years (which drops any trailing
#plus sign) and convert it to an int, the same way City_Category is converted with stoi()
gdf['city_years'] = gdf.Stay_In_Current_City_Years.str.get(0).str.stoi()

In [0]:
#We control the value each category maps to by replacing the letter with a digit string,
#then turn the digit strings into ints
for letter, digit in (('A', '1'), ('B', '2'), ('C', '3')):
    gdf['City_Category'] = gdf.City_Category.str.replace(letter, digit)
gdf['City_Category'] = gdf['City_Category'].str.stoi()

In [0]:
#EXERCISE: replace Gender with integer codes in the same way as City Category

In [0]:
#Hint: the Gender column only has values 'M' and 'F'

In [0]:
#Solution
#Map 'F' -> '1' and 'M' -> '0', then convert the digit strings to ints
gender_digits = gdf.Gender.str.replace('F', '1')
gender_digits = gender_digits.str.replace('M', '0')
gdf['Gender'] = gender_digits.str.stoi()

In [0]:
#Let's take a look at how many products we have.
#Product_ID is a string column, so it goes through nvcategory first and the
#distinct category values are counted afterwards.
product_codes = cudf.Series(nvcategory.from_strings(gdf.Product_ID.data).values())
prod_count = product_codes.unique().count()
print("Unique Products: {}".format(prod_count))

In [0]:
#Let's take a look at how many primary product categories we have.
#This column is numeric rather than a string, so unique() can be applied to it directly.
primary_categories = gdf.Product_Category_1.unique()
prod1_count = primary_categories.count()
print("Unique Product Categories: {}".format(prod1_count))

In [0]:
#Missing secondary categories are filled with 0
gdf['Product_Category_2'] = gdf.Product_Category_2.fillna(0)

In [0]:
#EXERCISE: Make a variable that's 1 if the product is multi-category, 0 otherwise

In [0]:
#Hint: think about how to combine the Product Category 2 and Product Category 3

In [0]:
#Solution:
#Fill missing Product_Category_3 values with 0, then flag a product as multi-category
#when its Category 2 + Category 3 sum is greater than zero.
gdf['Product_Category_3'] = gdf.Product_Category_3.fillna(0)
secondary_total = gdf['Product_Category_2'] + gdf['Product_Category_3']
gdf['multi'] = (secondary_total > 0).astype('int')

In [0]:
#EXERCISE: Create a Gender/Marital Status Interaction Effect

In [0]:
#Hint: both Gender and Marital Status are 0/1

In [0]:
#Solution:
#The interaction term is the element-wise product of the two 0/1 columns
gdf['gen_mar_interaction'] = gdf.Gender * gdf.Marital_Status

In [0]:
#Because Occupation is a code, it should be converted into indicator variables
occupation_levels = gdf.Occupation.unique()
gdf = gdf.one_hot_encoding('Occupation', 'occ_dummy', occupation_levels)

In [0]:
#Dummy variables from an int column
city_levels = gdf.City_Category.unique()
gdf = gdf.one_hot_encoding('City_Category', 'city_cat', city_levels)

#Dummy variables from a string column: run Age through nvcategory first,
#store the resulting values back in the Age column, then one-hot encode those
gdf['Age'] = cudf.Series(nvcategory.from_strings(gdf.Age.data).values())
age_levels = gdf.Age.unique()
gdf = gdf.one_hot_encoding('Age', 'age', age_levels)

#EXERCISE: Create dummy variables from Product Category 1

In [0]:
#Solution:
#One indicator column per distinct Product_Category_1 value, prefixed with 'product'
product_levels = gdf.Product_Category_1.unique()
gdf = gdf.one_hot_encoding('Product_Category_1', 'product', product_levels)

In [0]:
#We're going to drop the variables we've transformed (plus the ID columns).
#Occupation is included: it was one-hot encoded into occ_dummy_* above, so leaving the raw
#code in would feed the models a categorical code as if it were a numeric quantity.
drop_list = [
    'User_ID',
    'Occupation',
    'Age',
    'Stay_In_Current_City_Years',
    'City_Category',
    'Product_ID',
    'Product_Category_1',
    'Product_Category_2',
    'Product_Category_3',
]
gdf = gdf.drop(drop_list)

In [0]:
#We're going to make a list of the first indicator variable from each dummy group (plus the target)
#so it will be easier to exclude them when we're doing regressions later

In [0]:
#Columns that are dropped from the regression design matrix later on
dummy_list = [
    'occ_dummy_0',
    'city_cat_1',
    'age_0',
    'product_1',
    'Purchase',
]

In [0]:
#All variables currently have to have the same type for some methods in cuML,
#so every column is cast to float32
for column_name in list(gdf.columns):
    gdf[column_name] = gdf[column_name].astype('float32')

In [0]:
#Row counts for a 20% / 80% test / train split
n_rows = len(gdf)
test_size = round(n_rows * 0.2)
train_size = round(n_rows - test_size)

In [0]:
#The first test_size rows form the test set
test = gdf.iloc[:test_size]

In [0]:
#EXERCISE: Make the training set in a similar way

In [0]:
#Solution:
#The test set is rows [0, test_size), so the training set is every remaining row.
#Slicing from test_size (not train_size) keeps all train_size rows; slicing from
#train_size would keep only the last test_size rows and discard the rest.
gdf_train = gdf.iloc[test_size:]

In [0]:
#Drop the reference to the full frame: more subsets get built below, so it is nice to have the memory back.
del gdf

In [0]:
#Regression features: the training frame minus the columns listed in dummy_list
X_reg = gdf_train.drop(dummy_list)
#Regression target
y_train = gdf_train.Purchase

In [0]:
# Hyperparameter search for alpha in a ridge regression.
# NOTE: every candidate is scored on the same rows it was fitted on (training error),
# so this shows the mechanics of a search rather than a real model selection.
output = {}
n_train = len(y_train)
y_true = y_train.reset_index(drop=True)  # align the index with the predictions
for alpha in np.around(np.arange(0.1, 10, 0.1), decimals=2):

    ridge = cuml.Ridge(alpha=alpha, fit_intercept=True)
    _fit = ridge.fit(X_reg, y_train)
    _y_hat = _fit.predict(X_reg)
    # RMSE = sqrt(sum of squared errors / n); the division by n is what makes it a *mean*
    _rmse = sqrt((y_true.sub(_y_hat)**2).sum() / n_train)
    output['RMSE_RIDGE_{}'.format(alpha)] = _rmse

# Lower RMSE is better, so the best candidate is the minimum
best_key = min(output, key=output.get)
print('MIN RMSE: {} ({:,.2f})'.format(best_key, output[best_key]))

In [0]:
# Refit with alpha=0.1 and report the RMSE on the training rows.
Ridge = cuml.Ridge(alpha=.1, fit_intercept=True)
_fit = Ridge.fit(X_reg, y_train)
_y_hat = _fit.predict(X_reg)
_sq_err = (y_train.reset_index(drop=True).sub(_y_hat)**2).sum()
# Divide by the number of rows before the square root: sqrt(SSE) alone grows with dataset size
_rmse = sqrt(_sq_err / len(y_train))
print('{:,}'.format(_rmse))

In [0]:
# Largest Purchase value in the training target, for comparison with the error printed above
y_train.max()

In [0]:
# Features: every training column except Purchase; label: Purchase as a one-column frame
X_xgb = gdf_train.drop('Purchase')
y_xgb = gdf_train[['Purchase']]
xgb_train_set = xgb.DMatrix(X_xgb, label=y_xgb)

In [0]:
# Parameter dictionary handed to xgb.train.
# NOTE(review): 'nround', 'loss', 'max_features', 'criterion' and 'verbose' do not look like
# XGBoost booster parameters (they resemble scikit-learn names) -- confirm whether they have any effect.
xgb_params = dict(
    nround=100,
    max_depth=4,
    max_leaves=2**4,
    tree_method='gpu_hist',
    n_gpus=1,
    loss='ls',
    objective='reg:squarederror',
    max_features='auto',
    criterion='friedman_mse',
    grow_policy='lossguide',
    verbose=True,
)

In [0]:
# xgb.train takes the number of boosting rounds as its own argument (num_boost_round,
# default 10); an 'nround' entry in the params dict does not set it. Pass it explicitly
# so the requested number of rounds is actually trained.
xgb_model = xgb.train(
    xgb_params,
    dtrain=xgb_train_set,
    num_boost_round=xgb_params.get('nround', 10),
)

In [0]:
# Predictions on the same DMatrix the model was trained on (in-sample)
y_hat_xgb = xgb_model.predict(xgb_train_set)

In [0]:
# In-sample RMSE of the XGBoost predictions. TODO: get out of sample RMSE too
purchase_true = y_xgb['Purchase'].to_pandas()
RMSE = np.sqrt(mean_squared_error(purchase_true, y_hat_xgb))

In [0]:
# Show the RMSE computed in the previous cell
print(RMSE)

In [0]:
#EXERCISE: Change XGB around to predict if someone is married based on the data we have

In [0]:
#Hint: in the xgb parameters, change the objective function to 'reg:logistic'

In [0]:
#Solution
# Label: Marital_Status. gen_mar_interaction was built as Gender * Marital_Status, so it is
# derived from the label; it has to be removed from the features as well, otherwise the
# model is handed (part of) the answer and the AUC is meaninglessly inflated.
y_xgb = gdf_train[['Marital_Status']]
X_xgb = gdf_train.drop(['Marital_Status', 'gen_mar_interaction'])
xgb_train_set = xgb.DMatrix(data=X_xgb, label=y_xgb)

# NOTE(review): 'loss', 'criterion' and 'verbose' do not look like XGBoost booster
# parameters -- confirm whether they have any effect.
xgb_params = {
    'nround':100,
    'max_depth':10,
    'max_leaves':2**4,
    'tree_method':'gpu_hist',
    'n_gpus':1,
    'loss':'ls',
    'objective':'reg:logistic',
    'criterion':'auc',
    'verbose':True
}

# The number of boosting rounds is an argument of xgb.train (num_boost_round, default 10),
# not a params entry, so pass it explicitly.
xgb_model = xgb.train(xgb_params, dtrain=xgb_train_set, num_boost_round=xgb_params['nround'])
# Scored on the training rows (in-sample AUC)
y_hat_xgb = xgb_model.predict(xgb_train_set)
AUC = roc_auc_score(y_xgb['Marital_Status'].to_pandas(), y_hat_xgb)
print(AUC)

In [0]:
#EXTRA EXERCISE: Apply kNN to the customers
#EXTRA EXERCISE: Apply PCA to data