# 0. Example Usage

In [1]:
"""Constructiveness analysis on StreetCrowd: message-level features.

This is a reimplementation of the features from our paper,

Conversational markers of constructive discussions. Vlad Niculae and
Cristian Danescu-Niculescu-Mizil. In: Proc. of NAACL 2016.
https://vene.ro/constructive/

See the `test` function for a usage example.
"""

# Author: Vlad Niculae <vlad@vene.ro>
# License: Simplified BSD
import nltk
import re
import os
from collections import defaultdict

from stopwords import stopwords as mallet_stopwords


class Lexicon(object):
    """Word matching code for lexicons.

    Since lexicons may contain multi-word phrases ("I agree") and lexicons may
    overlap, we don't tokenize, use string matching instead.
    """
    def __init__(self, wordlists):
        self.wordlists = wordlists
        self.regex = {cat: self.wordlist_to_re(wordlist)
                      for cat, wordlist in wordlists.items()}

    def wordlist_to_re(self, wordlist):
        return re.compile(r'\b(?:{})\b'.format("|".join(wordlist).lower()))

    def count_words(self, text, return_match=False):
        """Returns a dict {category_name: sum 1[w in category]}

        Words are double-counted if they occur in more
        than one lexicon.
        """
        text_ = text.lower()
        match = {cat: reg.findall(text_) for cat, reg in self.regex.items()}
        count = {cat: len(m) for cat, m in match.items()}

        if return_match:
            return count, match
        else:
            return count

lexicons = {
    'pron_me': ['i', "i'd", "i'll", "i'm", "i've", 'id', 'im', 'ive',
                'me', 'mine', 'my', 'myself'],
    'pron_we': ["let's", 'lets', 'our', 'ours', 'ourselves', 'us',
                'we', "we'd", "we'll", "we're", "we've", 'weve'],
    'pron_you': ["y'all", 'yall', 'you', "you'd", "you'll", "you're",
                 "you've", 'youd', 'youll', 'your', 'youre', 'yours',
                 'youve'],
    'pron_3rd': ['he', "he'd", "he's", 'hed', 'her', 'hers', 'herself',
                 'hes', 'him', 'himself', 'his', 'she', "she'd",
                 "she'll", "she's", 'shes', 'their', 'them', 'themselves',
                 'they', "they'd", "they'll", "they've", 'theyd', 'theyll',
                 'theyve', "they're", "theyre"]
}

with open(os.path.join('lexicons', 'my_geo.txt')) as f:
    lexicons['geo'] = [line.strip().lower() for line in f]

with open(os.path.join('lexicons', 'my_meta.txt')) as f:
    lexicons['meta'] = [line.strip().lower() for line in f]

with open(os.path.join('lexicons', 'my_certain.txt')) as f:
    lexicons['certain'] = [line.strip().lower() for line in f]

with open(os.path.join('lexicons', 'my_hedges.txt')) as f:
    lexicons['hedge'] = [line.strip().lower() for line in f]


lex_matcher = Lexicon(lexicons)


def get_content_tagged(words, tags):
    """Return content words based on tag"""
    return [w for w, tag in zip(words.lower().split(), tags.split())
            if tag in ("N", "^", "S", "Z", "A", "T", "V")]


def message_features(reasons, stopwords=mallet_stopwords):
    """Compute message-level features from a chat.

    Parameters
    ----------
    reasons, iterable:
        sequence of tuples (user, tokens, tags).
        In StreetCrowd, users individually can leave an explanation for their
        solo game decision.  These reasons populate the chat when the team
        meets, but they have no intrinsic order, so they can only introduce
        but not adopt idea words.  Also, more than one reason can introduce
        an idea, because they are written independently. 

    """
    seen_words = set()
    introduced = defaultdict(set)
    where_introduced = defaultdict(list)
    repeated = defaultdict(set)

    reason_features = []

    for k, (user, tokens, tags) in enumerate(reasons):
        features = {}
        content_words = [w for w in get_content_tagged(tokens, tags)
                         if w not in stopwords]

        # all new content words are new ideas here
        introduced[user].update(content_words)
        seen_words.update(content_words)
        for w in content_words:
            where_introduced[w].append(('reason', k))

        # length statistics
        features['n_words'] = len(tokens.split())
        lex_counts = lex_matcher.count_words(tokens)
        features.update(lex_counts)

        # fillers
        features['n_introduced'] = len(content_words)  # Modified for single utterance
        features['n_introduced_w_certain'] = features['n_introduced'] * features['certain']
        features['n_introduced_w_hedge'] = features['n_introduced'] * features['hedge']

        reason_features.append(features)

    return reason_features 



In [2]:
from nltk.corpus import stopwords
# Example usage 
my_utterance = "This is a sample sentence to analyze. i am sure s."
pos_tags = nltk.pos_tag(my_utterance.split())
tokens = my_utterance
tags = " ".join([tag for (_, tag) in pos_tags])

test_reasons = [('user1', tokens, tags)]  # Create the 'reason' structure
reason_feat = message_features(test_reasons, stopwords) 
print(reason_feat)  # You might not need msg_feat

[{'n_words': 11, 'pron_me': 1, 'pron_we': 0, 'pron_you': 0, 'pron_3rd': 0, 'geo': 0, 'meta': 0, 'certain': 1, 'hedge': 0, 'n_introduced': 0, 'n_introduced_w_certain': 0, 'n_introduced_w_hedge': 0}]


# 1. Deriving collaboration markers

In [7]:
import collections
from nltk.corpus import stopwords
import json
import pandas as pd
import numpy as np
from collections import Counter

def load_data_afd():
    # Load the data from the JSON file
    with open('../../afd_1000_randomised_dialogues.json', 'r') as json_file:
        data_dict = json.load(json_file)

    # Extract the conversations, utterances, and labels from the data dictionary
    conversations = data_dict['conversations']
    utterances = data_dict['utterances']
    labels = data_dict['labels']

    return conversations, utterances, labels


def deriving_collaboration_markers(utterance):
    pos_tags = nltk.pos_tag(utterance.split())
    tokens = utterance
    tags = " ".join([tag for (_, tag) in pos_tags])

    test_reasons = [('user1', tokens, tags)]  # Create the 'reason' structure
    reason_feat = message_features(test_reasons, stopwords) 
    return reason_feat

conversations, utterances, labels = load_data_afd()

In [8]:
annotations_utts = []
for i, conv in enumerate(utterances):
    print(i)
    annotations_ci = []
    
    for utterance in conv.split('\n\n'):
        markers = list(deriving_collaboration_markers(utterance)[0].values())
        annotations_ci.append(markers)

    annotations_utts.append(annotations_ci)

annotations_utts

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

[[[32, 2, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0],
  [52, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 [[30, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
  [20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [61, 1, 1, 0, 2, 0, 0, 0, 1, 0, 0, 0],
  [28, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
  [73, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0],
  [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [40, 1, 1, 0, 2, 0, 2, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 [[40, 1, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0],
  [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [28, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
  [4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [8, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
  [15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 

In [9]:
from collections import defaultdict
import numpy as np
import pandas as pd

markers = list(deriving_collaboration_markers(utterance)[0].keys())
colab_features_mean = defaultdict(list)
colab_features_grad = defaultdict(list)

for ann_ci in annotations_utts:
    for i, marker in enumerate(markers):
        marker_values_ci = [ann_ci[j][i] for j in range(len(ann_ci))]
        # print('\n\n', marker, marker_values_ci)

        # MEAN
        colab_features_mean[marker].append(np.mean(marker_values_ci))
        
        # GRADIENT (SLOPE)
        x = np.arange(len(marker_values_ci))
        y = marker_values_ci
        try:
            slope, intercept = np.polyfit(x, y, 1) # Fitting a linear regression (polynomial of degree 1) to the data
        except Exception as e:
            print(e)
            slope = 0 # LinAlgError: SVD did not converge in Linear Least Squares
        
        colab_features_grad[marker].append(slope)        

a = pd.DataFrame(colab_features_mean).add_suffix('_mean')
b = pd.DataFrame(colab_features_grad).add_suffix('_grad')
colab_feat_df = pd.concat([a, b], axis=1)
colab_feat_df.to_csv("../1000_afd_collaboration_markers.csv", index=False)
colab_feat_df

Unnamed: 0,n_words_mean,pron_me_mean,pron_we_mean,pron_you_mean,pron_3rd_mean,geo_mean,meta_mean,certain_mean,hedge_mean,n_introduced_mean,...,pron_we_grad,pron_you_grad,pron_3rd_grad,geo_grad,meta_grad,certain_grad,hedge_grad,n_introduced_grad,n_introduced_w_certain_grad,n_introduced_w_hedge_grad
0,21.750000,0.500000,0.250000,0.000000,0.250000,0.000000,0.000000,0.000000,0.250000,0.0,...,-3.000000e-01,0.000000,-0.300000,0.000000,0.000000,0.000000e+00,-0.300000,0.0,0.0,0.0
1,24.454545,0.181818,0.181818,0.000000,0.545455,0.000000,0.181818,0.090909,0.272727,0.0,...,3.636364e-02,0.000000,0.045455,0.000000,0.072727,1.818182e-02,0.027273,0.0,0.0,0.0
2,11.500000,0.100000,0.100000,0.000000,0.200000,0.100000,0.000000,0.200000,0.100000,0.0,...,-6.060606e-03,0.000000,-0.109091,0.018182,0.000000,-1.090909e-01,-0.006061,0.0,0.0,0.0
3,26.800000,0.000000,0.200000,0.000000,0.000000,0.600000,0.000000,0.200000,0.200000,0.0,...,-1.520235e-17,0.000000,0.000000,-0.100000,0.000000,-1.520235e-17,-0.200000,0.0,0.0,0.0
4,20.444444,0.666667,0.000000,0.111111,0.000000,0.000000,0.000000,0.111111,0.777778,0.0,...,0.000000e+00,-0.016667,0.000000,0.000000,0.000000,5.000000e-02,-0.133333,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,4.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.0,0.0,0.0
996,16.812500,0.000000,0.000000,0.000000,0.625000,0.125000,0.000000,0.000000,0.437500,0.0,...,0.000000e+00,0.000000,0.011765,-0.014706,0.000000,0.000000e+00,-0.001471,0.0,0.0,0.0
997,19.600000,0.600000,0.000000,0.000000,0.600000,0.000000,0.000000,0.000000,0.400000,0.0,...,0.000000e+00,0.000000,-0.100000,0.000000,0.000000,0.000000e+00,-0.200000,0.0,0.0,0.0
998,22.500000,0.333333,0.000000,0.000000,0.166667,0.666667,0.000000,0.000000,0.333333,0.0,...,0.000000e+00,0.000000,0.028571,-0.228571,0.000000,0.000000e+00,-0.228571,0.0,0.0,0.0


In [10]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, average_precision_score
from imblearn.over_sampling import RandomOverSampler
import numpy as np
import matplotlib.pyplot as plt

data = colab_feat_df
X = colab_feat_df.iloc[:,:] 
y = pd.DataFrame(labels, columns = ['label'])['label']

# Initialize KFold (we use 10-fold cross-validation)
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Initialize the models
rf_model = RandomForestClassifier(max_depth=5, random_state=42)
lr_model = LogisticRegression(random_state=42)

# Initialize the oversampler
oversampler = RandomOverSampler(random_state=42)

# To accumulate predictions and true values for each model
all_y_pred_train_rf = []
all_y_pred_train_lr = []
all_y_train = []
all_y_pred_test_rf = []
all_y_pred_test_lr = []
all_y_test = []

for train_index, test_index in kf.split(X):
    # Split into training and testing sets using .iloc for pandas
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Perform random oversampling on the training data
    X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)
    
    # Fit the models on the oversampled training data
    rf_model.fit(X_train_resampled, y_train_resampled)
    lr_model.fit(X_train_resampled, y_train_resampled)
    
    # Training Predictions (probabilities)
    y_pred_train_rf = rf_model.predict_proba(X_train)[:, 1]
    y_pred_train_lr = lr_model.predict_proba(X_train)[:, 1]
    all_y_pred_train_rf.extend(y_pred_train_rf)
    all_y_pred_train_lr.extend(y_pred_train_lr)
    all_y_train.extend(y_train)
    
    # Testing Predictions (probabilities)
    y_pred_test_rf = rf_model.predict_proba(X_test)[:, 1]
    y_pred_test_lr = lr_model.predict_proba(X_test)[:, 1]
    all_y_pred_test_rf.extend(y_pred_test_rf)
    all_y_pred_test_lr.extend(y_pred_test_lr)
    all_y_test.extend(y_test)

# Evaluate TRAIN
auc_train_rf = roc_auc_score(all_y_train, all_y_pred_train_rf)
f1_train_rf = f1_score(all_y_train, np.round(all_y_pred_train_rf))
pr_auc_train_rf = average_precision_score(all_y_train, all_y_pred_train_rf)
auc_train_lr = roc_auc_score(all_y_train, all_y_pred_train_lr)
f1_train_lr = f1_score(all_y_train, np.round(all_y_pred_train_lr))
pr_auc_train_lr = average_precision_score(all_y_train, all_y_pred_train_lr)
print('TRAIN:')
print("Random Forest - AUROC:", auc_train_rf)
print("Random Forest - F1-Score:", f1_train_rf)
print("Random Forest - PR-AUC:", pr_auc_train_rf)
print("Logistic Regression - AUROC:", auc_train_lr)
print("Logistic Regression - F1-Score:", f1_train_lr)
print("Logistic Regression - PR-AUC:", pr_auc_train_lr)

# Evaluate TEST
auc_test_rf = roc_auc_score(all_y_test, all_y_pred_test_rf)
f1_test_rf = f1_score(all_y_test, np.round(all_y_pred_test_rf))
pr_auc_test_rf = average_precision_score(all_y_test, all_y_pred_test_rf)
auc_test_lr = roc_auc_score(all_y_test, all_y_pred_test_lr)
f1_test_lr = f1_score(all_y_test, np.round(all_y_pred_test_lr))
pr_auc_test_lr = average_precision_score(all_y_test, all_y_pred_test_lr)
print('TEST:')
print("Random Forest - AUROC:", auc_test_rf)
print("Random Forest - F1-Score:", f1_test_rf)
print("Random Forest - PR-AUC:", pr_auc_test_rf)
print("Logistic Regression - AUROC:", auc_test_lr)
print("Logistic Regression - F1-Score:", f1_test_lr)
print("Logistic Regression - PR-AUC:", pr_auc_test_lr)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

TRAIN:
Random Forest - AUROC: 0.8797108635998856
Random Forest - F1-Score: 0.855003940110323
Random Forest - PR-AUC: 0.9533683726998018
Logistic Regression - AUROC: 0.7501545935622853
Logistic Regression - F1-Score: 0.7963415595679255
Logistic Regression - PR-AUC: 0.9048247804167123
TEST:
Random Forest - AUROC: 0.723843738415578
Random Forest - F1-Score: 0.7870695713281798
Random Forest - PR-AUC: 0.8895933718007185
Logistic Regression - AUROC: 0.7290335593885605
Logistic Regression - F1-Score: 0.7855113636363636
Logistic Regression - PR-AUC: 0.8930113409981981


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
### BoW baseline

In [11]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, average_precision_score
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.over_sampling import RandomOverSampler
import numpy as np
import pandas as pd

# Assuming 'conversations' is a list or array of text conversations
# and 'labels' is a list or array of corresponding labels

# Create a DataFrame with conversations and labels
data = pd.DataFrame({'conversation': conversations, 'label': labels})

# Initialize KFold (we use 10-fold cross-validation)
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Initialize the models
rf_model = RandomForestClassifier(max_depth=5, random_state=42)
lr_model = LogisticRegression(random_state=42)

# Initialize the oversampler
oversampler = RandomOverSampler(random_state=42)

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# To accumulate predictions and true values for each model
all_y_pred_train_rf = []
all_y_pred_train_lr = []
all_y_train = []
all_y_pred_test_rf = []
all_y_pred_test_lr = []
all_y_test = []

for train_index, test_index in kf.split(data):
    # Split into training and testing sets using .iloc for pandas
    train_data, test_data = data.iloc[train_index], data.iloc[test_index]
    
    # Vectorize the conversations
    X_train = vectorizer.fit_transform(train_data['conversation'])
    X_test = vectorizer.transform(test_data['conversation'])
    y_train, y_test = train_data['label'], test_data['label']

    # Perform random oversampling on the training data
    X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

    # Fit the models on the oversampled training data
    rf_model.fit(X_train_resampled, y_train_resampled)
    lr_model.fit(X_train_resampled, y_train_resampled)

    # Training Predictions (probabilities)
    y_pred_train_rf = rf_model.predict_proba(X_train)[:, 1]
    y_pred_train_lr = lr_model.predict_proba(X_train)[:, 1]
    all_y_pred_train_rf.extend(y_pred_train_rf)
    all_y_pred_train_lr.extend(y_pred_train_lr)
    all_y_train.extend(y_train)

    # Testing Predictions (probabilities)
    y_pred_test_rf = rf_model.predict_proba(X_test)[:, 1]
    y_pred_test_lr = lr_model.predict_proba(X_test)[:, 1]
    all_y_pred_test_rf.extend(y_pred_test_rf)
    all_y_pred_test_lr.extend(y_pred_test_lr)
    all_y_test.extend(y_test)

# Evaluate TRAIN
auc_train_rf = roc_auc_score(all_y_train, all_y_pred_train_rf)
f1_train_rf = f1_score(all_y_train, np.round(all_y_pred_train_rf))
pr_auc_train_rf = average_precision_score(all_y_train, all_y_pred_train_rf)
auc_train_lr = roc_auc_score(all_y_train, all_y_pred_train_lr)
f1_train_lr = f1_score(all_y_train, np.round(all_y_pred_train_lr))
pr_auc_train_lr = average_precision_score(all_y_train, all_y_pred_train_lr)

print('TRAIN:')
print("Random Forest - AUROC:", auc_train_rf)
print("Random Forest - F1-Score:", f1_train_rf)
print("Random Forest - PR-AUC:", pr_auc_train_rf)
print("Logistic Regression - AUROC:", auc_train_lr)
print("Logistic Regression - F1-Score:", f1_train_lr)
print("Logistic Regression - PR-AUC:", pr_auc_train_lr)

# Evaluate TEST
auc_test_rf = roc_auc_score(all_y_test, all_y_pred_test_rf)
f1_test_rf = f1_score(all_y_test, np.round(all_y_pred_test_rf))
pr_auc_test_rf = average_precision_score(all_y_test, all_y_pred_test_rf)
auc_test_lr = roc_auc_score(all_y_test, all_y_pred_test_lr)
f1_test_lr = f1_score(all_y_test, np.round(all_y_pred_test_lr))
pr_auc_test_lr = average_precision_score(all_y_test, all_y_pred_test_lr)

print('TEST:')
print("Random Forest - AUROC:", auc_test_rf)
print("Random Forest - F1-Score:", f1_test_rf)
print("Random Forest - PR-AUC:", pr_auc_test_rf)
print("Logistic Regression - AUROC:", auc_test_lr)
print("Logistic Regression - F1-Score:", f1_test_lr)
print("Logistic Regression - PR-AUC:", pr_auc_test_lr)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

TRAIN:
Random Forest - AUROC: 0.887493777895153
Random Forest - F1-Score: 0.8766671661921176
Random Forest - PR-AUC: 0.9592679846486537
Logistic Regression - AUROC: 1.0
Logistic Regression - F1-Score: 1.0
Logistic Regression - PR-AUC: 1.0
TEST:
Random Forest - AUROC: 0.797875008177239
Random Forest - F1-Score: 0.8341232227488151
Random Forest - PR-AUC: 0.9276808285940266
Logistic Regression - AUROC: 0.8874484833947536
Logistic Regression - F1-Score: 0.9032258064516129
Logistic Regression - PR-AUC: 0.9501595953834541


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
