# Supervised machine learning: Introduction and regularization 
## Binary classification with text data

# Imports

In [5]:
pip install plotnine

Collecting plotnine
  Using cached plotnine-0.14.5-py3-none-any.whl.metadata (9.3 kB)
Collecting mizani~=0.13.0 (from plotnine)
  Downloading mizani-0.13.5-py3-none-any.whl.metadata (4.8 kB)
Collecting statsmodels>=0.14.0 (from plotnine)
  Downloading statsmodels-0.14.4-cp311-cp311-macosx_11_0_arm64.whl.metadata (9.2 kB)
Collecting patsy>=0.5.6 (from statsmodels>=0.14.0->plotnine)
  Downloading patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)
Using cached plotnine-0.14.5-py3-none-any.whl (1.3 MB)
Downloading mizani-0.13.5-py3-none-any.whl (127 kB)
Downloading statsmodels-0.14.4-cp311-cp311-macosx_11_0_arm64.whl (9.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading patsy-1.0.1-py2.py3-none-any.whl (232 kB)
Installing collected packages: patsy, statsmodels, mizani, plotnine
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [plotnine]━━[0m [32m3/4[0m

In [7]:
## load packages 
import pandas as pd
import re
import numpy as np
import plotnine
from plotnine import *
import pickle

## nltk imports
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## sklearn imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

## print mult things
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## random
import random

pd.set_option('display.max_colwidth', None)

In [9]:
## function to process text
def processtext(one_str, stop_list):
    
    ## remove stopwords
    no_stop = [tok for tok in wordpunct_tokenize(one_str)
              if tok not in stop_list]
    
    
    processed_string = " ".join([porter.stem(i.lower()) 
                        for i in no_stop if 
                        i.lower().isalpha() and len(i) >=3])
    return(processed_string)

## function to create dtm
def create_dtm(list_of_strings, metadata):
    vectorizer = CountVectorizer(lowercase = True)
    dtm_sparse = vectorizer.fit_transform(list_of_strings)
    dtm_dense_named = pd.DataFrame(dtm_sparse.todense(), columns=vectorizer.get_feature_names_out())
    dtm_dense_named_withid = pd.concat([metadata.reset_index(), dtm_dense_named], axis = 1)
    return(dtm_dense_named_withid)

# Load data

Load labeled yelp data in `public_data` and run below code

**Note**: make sure to change your path if you need to; if you're having trouble loading the `pkl`, try running on jupyter hub since it may be a python versioning issue

In [12]:
# If you have trouble loading these data (kernel dies due to memory issues), try sampling down to 5000 or 1000 rows
yelp = pd.read_pickle("../public_data/yelp_forML.pkl") #.sample(n=1000)

In [14]:
## preprocess data to create dtm
porter = PorterStemmer()
list_stopwords = stopwords.words("english")

yelp['process_text'] = [processtext(one_review, stop_list = list_stopwords) 
                        for one_review in yelp['raw_text']]

yelp_dtm = create_dtm(yelp['process_text'], yelp[['metadata_label', 'metadata_rowid',
                                                 'process_text', 'raw_text']])

# 1. Split into features, labels, and split into training/hold out

## 1.1 Split into X (features or id metadata) and y (labels)

In [17]:
X = yelp_dtm[[col for col in yelp_dtm.columns if col not in ['metadata_label',
                                                            'index']]].copy()
y = yelp_dtm[['metadata_label']]

In [18]:
## checking dimensionality
X.shape
y.shape

assert X.shape[0] == y.shape[0]
assert y.shape[1] == 1


(15000, 23439)

(15000, 1)

## 1.2 using automatic function to create train-test split

In [21]:
### using built-in function  
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size = 0.2,
                                                   random_state = 221)

In [22]:
# This is a great function and use it as much as you can.


## 1.3 using more manual approach to create train-test split

In [26]:
### more manually: useful when we want more control
### over the ids (eg clustering or time ordering)
### or if we want to go back to matrix before preprocessing
nrows_train = round(X.shape[0]*0.8)
nrows_test = X.shape[0] - nrows_train
random.seed(221)
train_ids = random.sample(list(set(X['metadata_rowid'])), nrows_train)

def my_split(X, y, 
             train_ids, 
             id_col):
    
    ## get test ids
    test_ids = set(X[id_col]).difference(train_ids)
    
    ## split
    X_train_man = X[X[id_col].isin(train_ids)].copy()
    X_test_man = X[X[id_col].isin(test_ids)].copy()
    y_train_man = y[y.index.isin(train_ids)].iloc[:, 0].to_numpy()
    y_test_man = y[y.index.isin(test_ids)].iloc[:, 0].to_numpy()
    
    ## return
    return(X_train_man, X_test_man, y_train_man, y_test_man)

In [29]:
X_train_man, X_test_man, y_train_man, y_test_man = my_split(X, y,
                                                            train_ids, 
                                                            id_col = 'metadata_rowid')

# 2. Estimate models with hardcoded parameters: logistic regression with L1 regularization (Lasso)

## 2.1 Estimate model using training data

In [34]:
non_feat = ['metadata_rowid', 'raw_text', 'process_text']
logit_lasso = LogisticRegression(penalty = "l1",max_iter=100, 
             C = 0.01, solver='liblinear')
logit_lasso.fit(X_train_man[[col for col in X_train.columns if col not in 
                   non_feat]], y_train_man)

## 2.2 Generate predictions in test data

In [36]:
y_pred = logit_lasso.predict(X_test_man[[col for col 
                in X_test_man.columns if col not in non_feat]])
y_predprob = logit_lasso.predict_proba(X_test_man[[col for col 
                in X_test_man.columns if col not in non_feat]])

In [37]:
## print the results 
y_pred[0:5]
y_predprob[0:5]


array([0, 0, 1, 0, 0])

array([[0.64283623, 0.35716377],
       [0.69709149, 0.30290851],
       [0.06363428, 0.93636572],
       [0.72134464, 0.27865536],
       [0.50411755, 0.49588245]])

## 2.3 Clean up predictions and calculate error metrics

In [42]:
## make into a dataframe
y_pred_df = pd.DataFrame({'y_pred_binary': y_pred,
                         'y_pred_continuous': [one_prob[1] 
                                            for one_prob in y_predprob],
                         'y_true': y_test_man})
y_pred_df.sample(n = 10, random_state = 4484)


Unnamed: 0,y_pred_binary,y_pred_continuous,y_true
656,0,0.240211,0
825,1,0.698133,1
1641,1,0.790631,1
2249,1,0.535207,1
1080,0,0.448053,0
2490,0,0.432694,1
2857,1,0.679513,1
1160,1,0.681755,1
1061,1,0.519428,0
2255,0,0.192981,0


In [43]:
## precision as tp / tp+fp 
error_cond = [(y_pred_df['y_true'] == 1) & (y_pred_df['y_pred_binary'] == 1),
             (y_pred_df['y_true'] == 1) & (y_pred_df['y_pred_binary'] == 0),
              (y_pred_df['y_true'] == 0) & (y_pred_df['y_pred_binary'] == 0)]

error_codeto = ["TP", "FN", "TN"]

y_pred_df['error_cat'] = np.select(error_cond, error_codeto, default = "FP")
y_error = y_pred_df.error_cat.value_counts().reset_index().copy()
y_error.columns = ['cat', 'n']
y_error

### precision
print("Precision is:-----------")
y_error.loc[y_error.cat == "TP", 'n'].iloc[0]/(y_error.loc[y_error.cat == "TP", 'n'].iloc[0] +
                    y_error.loc[y_error.cat == "FP", 'n'].iloc[0])

### recall
print("Recall is:---------------")
y_error.loc[y_error.cat == "TP", 'n'].iloc[0]/(y_error.loc[y_error.cat == "TP", 'n'].iloc[0] +
                    y_error.loc[y_error.cat == "FN", 'n'].iloc[0])

Unnamed: 0,cat,n
0,TN,1320
1,TP,1103
2,FN,319
3,FP,258


Precision is:-----------


0.8104335047759

Recall is:---------------


0.7756680731364276

## 2.4 Interpret the model

In [47]:
## get top features
las_coef = pd.DataFrame({'coef': logit_lasso.coef_[0],
                         'feature_name': 
                        [col for col in X_train.columns if col not in non_feat]})
las_coef.sort_values(by = 'coef', ascending = False)


top_feat = las_coef.sort_values(by = 'coef', ascending = False)[0:10]
top_feat_list = top_feat.feature_name.to_list()

all_agg = [yelp_dtm.groupby(['metadata_label']).agg({one_feat: np.mean})
for one_feat in top_feat_list]
all_agg_df = pd.concat(all_agg, axis = 1)
all_agg_df

Unnamed: 0,coef,feature_name
5009,0.839786,delici
8170,0.780814,great
11104,0.614079,love
526,0.445019,amaz
6510,0.399745,excel
...,...,...
20768,-0.303375,terribl
21101,-0.309563,told
14032,-0.362114,noth
1251,-0.381776,bad




Unnamed: 0_level_0,delici,great,love,amaz,excel,best,favorit,friendli,definit,alway
metadata_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.026424,0.171178,0.085976,0.022944,0.018819,0.103506,0.027713,0.067672,0.059036,0.113302
1,0.162248,0.557305,0.284452,0.104529,0.093621,0.219691,0.116819,0.169014,0.14264,0.242613


# 3. Compare performance across hyperparameters for logistic regression

Would our logit model make more accurate predictions if we fed it different hyperpameters? Which hyperparameters would be the best? Let's find out.

1. Define a function that:
- takes in a cost parameter (*C*, the inverse of regularization strength)
- trains a logistic regression model with L1 regularization (Lasso) and otherwise has the same parameters as above
- fits the model on the training data
- makes predictions and returns them as a DataFrame

2. Use the function to get predictions for the list of *C* parameters below, then bind them into one DataFrame.

3. Finally, score the precision for each model (each iteration of *C*) and show which model scores the best.

**Hint**: To compute precision score, you can use:
```python
precision_score(
    one_df['y_true'], one_df['y_pred'],
    zero_division = 0) # silences warning
```

In [50]:
# Provided set of hyperparameters on which to train and then compare performance
c_list = np.linspace(4, 0.0001, 5)

In [85]:
# your function code here
def train(c):
    logit_lasso = LogisticRegression(penalty = "l1",max_iter=100, 
             C = c, solver='liblinear')
    logit_lasso.fit(X_train_man[[col for col in X_train.columns if col not in 
                   non_feat]], y_train_man)
    y_pred = logit_lasso.predict(X_test_man[[col for col 
                in X_test_man.columns if col not in non_feat]])
    y_predprob = logit_lasso.predict_proba(X_test_man[[col for col 
                in X_test_man.columns if col not in non_feat]])
    y_pred_df = pd.DataFrame({'y_pred_binary': y_pred,
                         'y_pred_continuous': [one_prob[1] 
                                            for one_prob in y_predprob],
                         'y_true': y_test_man})
    y_pred_df.sample(n = 10, random_state = 4484)
    y_pred_df['C'] = c
    return y_pred_df

results = []

for i in range(len(c_list)):
    result_df = train(c_list[i])
    results.append(result_df)
    print("Finished running for: ", c_list[i])

df = pd.concat(results, ignore_index=True)



    

Finished running for:  4.0
Finished running for:  3.000025
Finished running for:  2.00005
Finished running for:  1.0000750000000003
Finished running for:  0.0001


In [86]:
df

Unnamed: 0,y_pred_binary,y_pred_continuous,y_true,C
0,0,0.000757,0,4.0000
1,0,0.046177,0,4.0000
2,1,1.000000,1,4.0000
3,0,0.000006,0,4.0000
4,0,0.429965,0,4.0000
...,...,...,...,...
14995,0,0.500000,0,0.0001
14996,0,0.500000,0,0.0001
14997,0,0.500000,0,0.0001
14998,0,0.500000,0,0.0001


In [112]:
# your code here to evaluate precision of each model
from sklearn.metrics import precision_score

precision_per_c = df.groupby('C').apply(
    lambda group: precision_score(group['y_true'], group['y_pred_binary'], zero_division=0)
)

print(precision_per_c)

C
0.000100    0.000000
1.000075    0.863136
2.000050    0.861687
3.000025    0.860977
4.000000    0.860594
dtype: float64




# 4. Activity 

- Read the documentation here to initialize a ridge regression (l2 penalty)- you can use the same cost parameter (C) and number of iterations as in the lasso example above: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
- Fit the model on X_train_man, y_train_main 
- Generate binary and continuous predictions
- Create a function that takes in a dataframe of binary predictions and true labels and manually calculates the $F_{1}$ score:

$$F_{1} = 2 * \dfrac{precision * recall}{precision + recall} = \dfrac{TP}{TP + 0.5(FP + FN)}$$

- Apply that function to calculate the F1 score for the decision tree and lasso (from above), and ridge regression (from the activity)
- *Challenge exercise*: parametrize the model fitting with a function that takes in a classifier as an argument and returns coefficients or feature importances and certain eval metrics (eg precision, recall, and F1)

In [108]:
# your code here 
ridge_lasso = LogisticRegression(penalty = "l2",max_iter=100, 
             C = 0.01, solver='liblinear')
ridge_lasso.fit(X_train_man[[col for col in X_train.columns if col not in 
                   non_feat]], y_train_man)


In [110]:
y_pred = ridge_lasso.predict(X_test_man[[col for col 
                in X_test_man.columns if col not in non_feat]])
y_predprob = ridge_lasso.predict_proba(X_test_man[[col for col 
                in X_test_man.columns if col not in non_feat]])

In [126]:
y_pred_binary = ridge_lasso.predict(X_test_man[[col for col in X_test.columns if col not in non_feat]])
y_pred_binary2 = logit_lasso.predict(X_test_man[[col for col in X_test.columns if col not in non_feat]])
    

In [127]:
from sklearn.metrics import f1_score

f1_logistic = f1_score(y_test_man, y_pred_binary)
f1_ridge = f1_score(y_test_man, y_pred_binary2)
print("F1 Score Logistic:", f1_logistic)
print("F1 Score Ridge:", f1_ridge)


F1 Score Logistic: 0.8706866504008365
F1 Score Ridge: 0.7926697808120733


# Manual Version

In [133]:
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test_man, y_pred_binary, zero_division=0)
recall = recall_score(y_test_man, y_pred_binary, zero_division=0)
f1_manual = 2 * (precision * recall) / (precision + recall)

print("Manual F1 Score:", f1_manual)


Manual F1 Score: 0.8706866504008364


In [None]:
from sklearn.metrics import precision_score, recall_score

def manual_f1(y_pred_binary):
    precision = precision_score(y_test_man, y_pred_binary, zero_division=0)
    recall = recall_score(y_test_man, y_pred_binary, zero_division=0)
    f1_manual = 2 * (precision * recall) / (precision + recall)
    return f1_manual
    

In [None]:
print("F1 Score Logistic:", manual_f1(y_pred_binary))
print("F1 Score Logistic:", manual_f1(y_pred_binary2))