In [1]:
##########################################################################
##### Template code and instructions on the basics of machine learning#
##########################################################################
### Step 1: install packages - 
##### packages are necessary to install and load, given that they have the built in functions necessary to run complex tasks. 
## They effectively act as one of the most crucial time saving activities that would otherwise lead to overly long and 
## duplicative scripts. 

### note: you are not expected to remember all of these; just for the best to copy and paste these sections 

## install pkgs 
import sys
!{sys.executable} -m pip install xgboost==1.7.5 # note: needed since it looks like anaconda installs an earlier version 
# of the package, which is not helpful. 1.7.5 allows for the categorical data of interest to be used. 

# !{sys.executable} -m pip install requests #; this code here can be used to install packages on anaconda/jupyter notebook 
### I believe the below should be installed by default 
import requests # web scraping 
from bs4 import BeautifulSoup # for web scraping 
import itertools # for efficient operation of loops 
import pandas as pd # necessary for reading in, creating, and manipulating data frames 
import csv ## for importing/exporting csvs 
import glob ## for finding files in path
import re
import numpy as np
### THe ML packages 
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# sklearn packages
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier

# nltk packages; these are for the purpose of cleaning text, which will be crucial 
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from string import punctuation
import unidecode



In [2]:
#### Section 1: numeric machine learning 
# link: https://www.datacamp.com/tutorial/xgboost-in-python

diamonds = sns.load_dataset("diamonds") # load in the diamonds data set from the sns pkg 

diamonds.head() ## look at the top rows for the data set, now named diamonds 

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
### take a look at the rows and cols of the data set 
diamonds.shape # (53940, 10)

(53940, 10)

In [4]:
### get the summaries of the numeric variables 
diamonds.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [5]:
### now clean the data frame 
from sklearn.model_selection import train_test_split

# Extract feature and target arrays
X, y = diamonds.drop('price', axis=1), diamonds[['price']]
## what's going on above? The X, y are saying to create two objects, X and y. The part before the comma relates to X. The 
# portion after the comma refers to y, or the second object. 
## The "diamonds.drop('price', axis=1)" is creating a data frame with the .drop command, which drops the column titled "price"
# The second part after the comma is in turn just creating a data frame from just the column "price", which will now be 
# located in the object y 

In [6]:
# Extract text features
## first, create an object "cats", which is a subset of the data frame X. The exclude option using the np package (numpy)
# tells the command to drop all columns of the numeric variety. The .columns.tolist in turn is a command to turn into a list. 
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to Pandas category; this makes it easier on memory 
for col in cats:
   X[col] = X[col].astype('category')
## note that this combines the list in cats with our previous object, X. We are saying that for the columns seen in the cats 
# object, whenever the column text in X matches with that of the column/list in cats, then it will change the variable in X 
# into a categorical one (i.e. factor) as opposed to a string var. 


In [7]:
## take a look at the types now 
X.dtypes

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
x           float64
y           float64
z           float64
dtype: object

In [8]:
# Split the data; follows the same syntax as box 6 via the commas 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1337)
## what this code is doing is creating train and test objects on the IVs and DV. The train_test_split does this in a manner 
# where the train and test sets by var type will match, and the random_state is the seed, which will ensure that we will get 
# the same randomized selection everytime that we run the code 

In [10]:
### where we import the xgboost pkg and run an ml algorithm 
import xgboost as xgb

# Create regression matrices
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)
#dtrain_reg = xgb.DMatrix(X_train, y_train)

In [12]:
# Define hyperparameters
params = {"objective": "reg:squarederror", "tree_method": "gpu_hist"} ## will feed into model fit testing of the machine 
#learning 

n = 100
model = xgb.train(
   params=params,
   dtrain=dtrain_reg, 
   num_boost_round=n,
)


### example model fit stats 

#mse = np.mean((actual - predicted) ** 2) ## mean squared error 
#rmse = np.sqrt(mse) # root mean squared error 

In [13]:
### let's evaluate the model 
from sklearn.metrics import mean_squared_error

preds = model.predict(dtest_reg) # storing the prediction from the test model into the preds object 
preds ## stores the predicted prices 

array([ 612.978  , 6777.0317 ,  793.95917, ..., 1389.201  ,  538.1163 ,
       1552.1204 ], dtype=float32)

In [15]:
rmse = mean_squared_error(y_test, preds, squared=False)

print(f"RMSE of the base model: {rmse:.3f}") ##gets the root mean squared error. compares the models 
#RMSE of the base model: 543.203

RMSE of the base model: 537.665


In [16]:
y_test # the y var from the test model, the applied predictions from the training set 

Unnamed: 0,price
2384,561
18224,7397
36341,936
31026,749
7125,4173
...,...
16559,6617
8278,4381
41489,1232
42954,506


In [18]:
### create evals object
evals = [(dtrain_reg, "train"), (dtest_reg, "validation")] ## note that this comes all the way back from block 10 
         
model = xgb.train(
   params=params,
   dtrain=dtrain_reg, 
   num_boost_round=n,
   evals=evals,
)
  # this now trains and validates the diff models; we should see that the constant testing and applications eventually see
    # the RMSE converge, indicating a decent model 

[0]	train-rmse:3971.96772	validation-rmse:3960.17185
[1]	train-rmse:2838.56612	validation-rmse:2828.86685
[2]	train-rmse:2054.95156	validation-rmse:2049.31424
[3]	train-rmse:1515.40611	validation-rmse:1514.02909
[4]	train-rmse:1153.60406	validation-rmse:1155.57009
[5]	train-rmse:913.97875	validation-rmse:922.84190
[6]	train-rmse:762.16906	validation-rmse:775.54397
[7]	train-rmse:668.07790	validation-rmse:684.71596
[8]	train-rmse:609.09360	validation-rmse:629.87792
[9]	train-rmse:574.19616	validation-rmse:598.06631
[10]	train-rmse:552.31608	validation-rmse:578.31728
[11]	train-rmse:537.87321	validation-rmse:565.67839
[12]	train-rmse:528.33436	validation-rmse:559.34358
[13]	train-rmse:520.74475	validation-rmse:554.74550
[14]	train-rmse:515.04677	validation-rmse:550.59145
[15]	train-rmse:511.36064	validation-rmse:548.68574
[16]	train-rmse:504.59900	validation-rmse:545.15130
[17]	train-rmse:501.49288	validation-rmse:542.99614
[18]	train-rmse:495.88358	validation-rmse:542.22918
[19]	train-r

In [19]:
##### Next step: learning how to divide data into k sets of training and tests. Use k-1 parts for training, and the final as a 
# test. This process is known as cross validation 
params = {"objective": "reg:squarederror", "tree_method": "gpu_hist"}
n = 1000

results = xgb.cv(
   params, dtrain_reg,
   num_boost_round=n,
   nfold=5, # how the training sets should be split 
   early_stopping_rounds=20
)


In [20]:
results # take a look at the output; note that the rmses are not sig diff near the end 

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,3973.801938,6.178437,3975.116199,24.759975
1,2840.612707,5.127063,2846.522997,17.843194
2,2055.085239,4.299456,2061.09558,14.777246
3,1516.266259,5.01651,1528.015849,14.213172
4,1153.500382,5.265913,1171.813627,14.509073
5,913.280525,4.866743,937.664571,15.691096
6,759.226974,6.034956,789.734256,16.221961
7,663.710519,5.496933,700.521848,18.020397
8,605.592507,6.255761,648.288943,19.340346
9,569.525451,5.841636,617.04726,20.969303


In [21]:
### look at best rmse 
best_rmse = results['test-rmse-mean'].min()

best_rmse # which is the last of course 

554.4655285341859

In [22]:
### xgboost classifications - refers to what the dv will be (i.e. model type). For general categories, these can take form of 
# binary:logistic  and multi:softprob ; note that the later is multinomial, not ordinal. 
## THe next code is going to be of the multinomial variety 
from sklearn.preprocessing import OrdinalEncoder

X, y = diamonds.drop("cut", axis=1), diamonds[['cut']] # splits data into X and y, with the X being all variables but cut,
# with y being the cut data. 

# Encode y to numeric
y_encoded = OrdinalEncoder().fit_transform(y) # which will then be read as factors and such 

# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist() ## also present above 

# Convert to pd.Categorical
for col in cats:
   X[col] = X[col].astype('category') # this gets the code to be more efficient as factors 

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, random_state=1, stratify=y_encoded)
# this code splits again into the train and test sets, with the stuff in parentheses fed into the 4 objects before the "="
# stratify is a useful command, ensuring that there is a proportionate amount of the y in all of the data sets, i.e. no 
# data set is simply all 1s or what have you 

In [30]:
y['cut'].unique() # take a look at the types of categories; note that there are 5, and will need to be fed in below 

['Ideal', 'Premium', 'Good', 'Very Good', 'Fair']
Categories (5, object): ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']

In [23]:
# Create classification matrices; these will be fed into the xgb code below, and are xgb objects 
dtrain_clf = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [31]:
params = {"objective": "multi:softprob", "tree_method": "gpu_hist", "num_class": 5}
n = 1000

results = xgb.cv(
   params, dtrain_clf,
   num_boost_round=n,
   nfold=5,
   metrics=["mlogloss", "auc", "merror"], ## here we are telling to look at three metrics, which are 
    # multi-class log loss, area under the ROC curve, and multi-class classification error
) # note that the auc is the flase positive rate (x axis) plotted against the true positive rate (y-axis). Closer to 1, teh 
# better, with 0 being garbage 

In [33]:
results.keys() # keys() method returns a list of all the keys in a dictionary, and what can be analyzed. Let's take a look


Index(['train-mlogloss-mean', 'train-mlogloss-std', 'train-auc-mean',
       'train-auc-std', 'train-merror-mean', 'train-merror-std',
       'test-mlogloss-mean', 'test-mlogloss-std', 'test-auc-mean',
       'test-auc-std', 'test-merror-mean', 'test-merror-std'],
      dtype='object')

In [35]:
results['test-auc-mean'].max()# at 0.94, pretty good. We'll want to run similar tests for our data. 
# what we would do is essentially whats going on here, though we'd do something ordinal 
# https://analyticsindiamag.com/a-complete-tutorial-on-ordinal-regression-in-python/
# in the event that the code doesn't match up, we can essentially first predict if something is somewhat toxic at least, and 
# then from there, just predict if its fully toxic. This would simply be running the logit twice. Easy enough. 

0.9402233623451636

In [40]:
import xgboost as xgb

# Train a model using the scikit-learn API
xgb_classifier = xgb.XGBClassifier(n_estimators=100, objective='binary:logistic', tree_method='hist', eta=0.1, max_depth=3,
                                  enable_categorical=True)
xgb_classifier.fit(X_train, y_train)

# Convert the model to a native API model; this is what we can do to export our model and test on other data sets 
model = xgb_classifier.get_booster() ## mow we should be able to apply this elsewhere. Let's see 

In [43]:
model

<xgboost.core.Booster at 0x1de49046d00>

In [41]:
### to apply, we should only need the X data, with y then being predicted. Let's try that .

x_predict_pre = diamonds.drop("cut", axis=1)

# Extract text features
cats = x_predict_pre.select_dtypes(exclude=np.number).columns.tolist() ## also present above 

# Convert to pd.Categorical
for col in cats:
   x_predict_pre[col] = x_predict_pre[col].astype('category') # this gets the code to be more efficient as factors 

In [45]:
## note, to apply we still need everything to be a matrix in the xgb manner. we should be able to call that 
x_predict_pre_mat = xgb.DMatrix(x_predict_pre, enable_categorical=True)

In [46]:
### now let's apply having cleaned it 
x_predict_post = model.predict(x_predict_pre_mat)
x_predict_post ## this worked!!! 

array([[8.2803692e-04, 4.5760651e-03, 7.4318254e-01, 2.2490121e-02,
        2.2892323e-01],
       [1.1407522e-03, 7.7159353e-02, 8.7511810e-03, 4.9219695e-01,
        4.2075172e-01],
       [1.6888994e-01, 6.8149847e-01, 1.4987147e-03, 1.3883917e-03,
        1.4672449e-01],
       ...,
       [3.9300360e-03, 2.8588120e-02, 1.6254615e-02, 4.6576676e-01,
        4.8546046e-01],
       [2.6861599e-03, 1.0294575e-02, 9.3398370e-02, 6.2612146e-01,
        2.6749945e-01],
       [1.3444772e-03, 4.9832766e-03, 8.4948701e-01, 5.9855450e-02,
        8.4329791e-02]], dtype=float32)

In [66]:
### now lets get this into a data frame 
col_names = ['Ideal_pr', 'Premium_pr', 'Very_Good_pr', 'Good_pr', 'Fair_pr'] 
col_names = 
x_predict_post_df = pd.DataFrame(x_predict_post, columns=col_names)
x_predict_post_df ## see that this gets us the probs that it is part of a given category 
## now from here, I'll simply want to rename based upon y, which are 
# ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair'] 

Unnamed: 0,Ideal_pr,Premium_pr,Very_Good_pr,Good_pr,Fair_pr
0,0.000828,0.004576,0.743183,0.022490,0.228923
1,0.001141,0.077159,0.008751,0.492197,0.420752
2,0.168890,0.681498,0.001499,0.001388,0.146724
3,0.001142,0.006635,0.068685,0.266934,0.656605
4,0.001270,0.680335,0.008914,0.002010,0.307471
...,...,...,...,...,...
53935,0.001990,0.007377,0.763904,0.087403,0.139326
53936,0.002902,0.231594,0.037354,0.003894,0.724256
53937,0.003930,0.028588,0.016255,0.465767,0.485460
53938,0.002686,0.010295,0.093398,0.626121,0.267499


In [57]:
type(x_predict_post_df)

pandas.core.frame.DataFrame

In [69]:
### another helpful link: https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/
## now bind 
full_df = pd.concat([diamonds, x_predict_post_df], axis=1)


In [70]:
full_df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,Ideal_pr,Premium_pr,Very_Good_pr,Good_pr,Fair_pr
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,0.000828,0.004576,0.743183,0.022490,0.228923
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,0.001141,0.077159,0.008751,0.492197,0.420752
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,0.168890,0.681498,0.001499,0.001388,0.146724
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,0.001142,0.006635,0.068685,0.266934,0.656605
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,0.001270,0.680335,0.008914,0.002010,0.307471
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,0.001990,0.007377,0.763904,0.087403,0.139326
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,0.002902,0.231594,0.037354,0.003894,0.724256
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,0.003930,0.028588,0.016255,0.465767,0.485460
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,0.002686,0.010295,0.093398,0.626121,0.267499


In [None]:
### next step: try the predictions on text data 