# Modeling

## Test function for automation

In [None]:
# Smoke test for the automated modelling flow: a manual grid search over a
# preprocessing + DummyClassifier pipeline, followed by a train/test F1 report.

# Feature groups handled by the column transformer
text_featues = ['review_stemmed_nostop']
num_features = ['drugName_encoded','condition_encoded', 'usefulCount','day', 'month', 'year',
                   'count_word', 'count_unique_word', 'count_letters',
                   'count_punctuations', 'count_words_upper', 'count_words_title',
                   'count_stopwords', 'mean_word_len']

# Numeric columns are standardised; the review text is vectorised twice
# (raw counts and TF-IDF). The text column is selected with a scalar name so
# that the vectorizers receive a 1-D column of strings rather than a 2-D frame.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('text_cv', CountVectorizer(), text_featues[0]),
    ('text_tfidf', TfidfVectorizer(), text_featues[0])
])

# Define pipeline
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DummyClassifier(strategy='stratified'))
])

# Define hyperparameters to test
params = {
    'preprocessor__text_cv__ngram_range': [(1,2), (1,3), (1,4), (1,5)],
    'preprocessor__text_tfidf__ngram_range': [(1,2), (1,3), (1,4), (1,5)],
    'classifier__strategy': ['stratified', 'most_frequent']
}

# Define cross-validation settings
cv = 5
scoring = 'f1'

# Perform grid search with cross-validation
results = []
for param_set in ParameterGrid(params):
    pipe.set_params(**param_set)
    score = np.mean(cross_val_score(pipe, X_train, y_train, cv=cv, scoring=scoring))
    results.append((score, param_set))

# Rank by score only. Sorting the raw (score, dict) tuples falls back to
# comparing the dicts whenever two scores tie, which raises TypeError.
results.sort(key=lambda result: result[0], reverse=True)

# Refit the best parameter set on the full training split and report F1
best_params = results[0][1]
pipe.set_params(**best_params)
pipe.fit(X_train, y_train)
train_score = f1_score(y_train, pipe.predict(X_train))
test_score = f1_score(y_test, pipe.predict(X_test))
print(f'Train score: {train_score:.4f}')
print(f'Test score: {test_score:.4f}')


In [None]:
def eval_model(model, name, param_grid, scaler = None, _X_train = X_train, _X_test = X_test, _y_train = y_train, _y_test = y_test):
    """Tune ``model`` inside a preprocessing pipeline with GridSearchCV and report its scores.

    The pipeline standardises the numeric columns, TF-IDF vectorises the
    'review_stemmed_nostop' column and feeds the result to ``model`` as the
    step named 'classifier'.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune.
    name : str
        Label printed in the report.
    param_grid : dict or list of dict
        Hyperparameter grid. Keys that do not already start with
        'classifier__' or 'preprocessor__' are treated as parameters of
        ``model`` and prefixed with 'classifier__'.
    scaler : transformer, optional
        Replaces the default StandardScaler for the numeric columns.
    _X_train, _X_test, _y_train, _y_test
        Data splits. The defaults are bound to the global splits that exist
        when this function is defined.

    Returns
    -------
    tuple
        ``(test_score, train_score)`` as returned by ``GridSearchCV.score``
        (the estimator's default score, since no ``scoring`` is set).
    """
    num_features = ['drugName_encoded','condition_encoded', 'usefulCount','day', 'month', 'year',
                    'count_word', 'count_unique_word', 'count_letters',
                    'count_punctuations', 'count_words_upper', 'count_words_title',
                    'count_stopwords', 'mean_word_len']
    text_feature = 'review_stemmed_nostop'

    # The scaler is applied to the numeric columns only, inside the pipeline.
    # Fitting it on the whole frame would choke on the text column and turn
    # the frame into an array, breaking column selection by name.
    num_transformer = scaler if scaler is not None else StandardScaler()

    preprocessor = ColumnTransformer([
        ('num', num_transformer, num_features),
        ('text_tfidf', TfidfVectorizer(), text_feature),
    ])

    # Pipeline steps must be (name, estimator) pairs
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model),
    ])

    def _qualify(grid):
        # Map bare estimator parameter names onto the 'classifier' step
        step_prefixes = ('classifier__', 'preprocessor__')
        return {
            (key if key.startswith(step_prefixes) else f'classifier__{key}'): values
            for key, values in grid.items()
        }

    if isinstance(param_grid, dict):
        search_grid = _qualify(param_grid)
    else:
        search_grid = [_qualify(grid) for grid in param_grid]

    # Use GridSearchCV to find the best hyperparameters for the model
    grid_search = GridSearchCV(pipeline, param_grid=search_grid, cv=5, n_jobs=-1)
    grid_search.fit(_X_train, _y_train)

    # Print the name of the model and the best hyperparameters
    print(name, ":")
    print("Best hyperparameters: ", grid_search.best_params_)

    # Score the refitted best estimator on the test and training data
    test_accuracy = grid_search.score(_X_test, _y_test)
    train_accuracy = grid_search.score(_X_train, _y_train)

    print(f'Training Accuracy: {train_accuracy}')
    print(f'Test Accuracy: {test_accuracy}\n')

    return (test_accuracy, train_accuracy)

In [None]:


# Define the models to evaluate
# (an MLPClassifier(max_iter=10_000) entry under "MLP" is currently disabled)
# NOTE(review): eval_model standardises the numeric columns, which yields
# negative values; MultinomialNB rejects negative features — confirm "nb" runs.
models = {
    "lr": LogisticRegression(max_iter=10_000),
    "nb": MultinomialNB(),
    "rfc": RandomForestClassifier(),
}

# Define the hyperparameter grids for each model.
# The "preprocessor__text_tfidf" entry is not looked up by the loop below,
# which indexes this dict by model name only.
# (the matching "MLP" grid was {"hidden_layer_sizes": [(10,), (20,), (30,)]})
param_grids = {
    "preprocessor__text_tfidf": { "ngram_range": [(1,2), (1,3), (1,4), (1,5)]},
    "lr": {"C": [0.01, 0.1, 1, 10]},
    "nb": {"alpha": [0.01, 0.1, 1.0]},
    "rfc": {"n_estimators": [10, 50, 100]},
}

# Evaluate each model using eval_model()
test_accs = []
train_accs = []

for name, model in models.items():
    # Only arguments that eval_model actually declares are passed; positional
    # arguments placed after keyword arguments are a SyntaxError.
    test_acc, train_acc = eval_model(
        model,
        name,
        param_grid=param_grids[name],
        _X_train=X_train,
        _X_test=X_test,
        _y_train=y_train,
        _y_test=y_test,
    )
    test_accs.append(test_acc)
    train_accs.append(train_acc)


In [None]:
# Grouped bar chart: train vs. test accuracy, one pair of bars per model
labels = list(models.keys())
x = np.arange(len(labels))
width = 0.35

fig, ax = plt.subplots()

# The two series sit half a bar-width to the left and right of each tick
rects1 = ax.bar(x=x - width/2, height=train_accs, width=width, label='Train Accuracy')
rects2 = ax.bar(x=x + width/2, height=test_accs, width=width, label='Test Accuracy')

ax.set_title('Train and Test Accuracies for Different Models')
ax.set_ylabel('Accuracy')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

plt.show()

## Baseline - Dummy Classifier

### Balanced dataset

#### Pipeline for initial model

In [None]:
## Train model - Initial
# Untuned baseline: default DummyClassifier fitted on the balanced training set

# Baseline estimator
model = DummyClassifier()

# Bag-of-words counts of the stemmed review text
# (alternative kept for reference: ('text_tfidf', TfidfVectorizer(), 'review_stemmed_nostop'))
preprocessor = ColumnTransformer(
    transformers=[('text_cv', CountVectorizer(), 'review_stemmed_nostop')]
)

# Preprocessing followed by the estimator
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# Fit on the balanced training data
pipeline.fit(X_train_balanced, y_train_balanced)

# Score on the balanced training data and on the test data
train_score = pipeline.score(X_train_balanced, y_train_balanced)
test_score = pipeline.score(X_test, y_test)

# Print the scores
print(f'Train score: {train_score:.4f}\nTest score: {test_score:.4f}')

In [None]:
## Evaluation
# Predict the test set with the fitted pipeline from the previous cell
y_pred = pipeline.predict(X_test)

# Print the classification report for those predictions
print(classification_report(y_true=y_test, y_pred=y_pred))


### Imbalanced dataset

#### Initial model

In [None]:
# Initial
# Untuned baseline: default DummyClassifier fitted on the original (imbalanced) training set

# Baseline estimator
model = DummyClassifier()

# TF-IDF representation of the stemmed review text
preprocessor = ColumnTransformer(
    transformers=[('text_tfidf', TfidfVectorizer(), 'review_stemmed_nostop')]
)

# Preprocessing followed by the estimator
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# Fit on the training data
pipeline.fit(X_train, y_train)

# Score on the training data and on the test data
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

# Print the scores
print(f'Train score: {train_score:.4f}\nTest score: {test_score:.4f}')

In [None]:
## Archive
## Defining function for pipeline

def model_pipeline(model, text_processor=None, num_processor=None,_X_train = X_train, _X_test = X_test, _y_train = y_train, _y_test = y_test):
    """Fit ``model`` behind optional preprocessing and report train/test performance.

    Parameters
    ----------
    model : estimator
        Estimator used as the final 'classifier' step.
    text_processor : transformer, optional
        Applied to the 'review_stemmed_nostop' column when given.
    num_processor : transformer, optional
        Applied to the numeric feature columns when given.
    _X_train, _X_test, _y_train, _y_test
        Data splits. The defaults are bound to the global splits that exist
        when this function is defined.

    Returns
    -------
    tuple
        ``(test_score, train_score, y_pred)`` where ``y_pred`` are the
        predictions for ``_X_test``.

    Side effects: prints the scores, the classification report and the
    confusion matrix, and draws the confusion matrix as a seaborn heatmap.
    """
    num_features = ['drugName_encoded','condition_encoded', 'usefulCount','day', 'month', 'year',
                    'count_word', 'count_unique_word', 'count_letters',
                    'count_punctuations', 'count_words_upper', 'count_words_title',
                    'count_stopwords', 'mean_word_len']

    # Collect only the transformers that were actually requested
    preprocessor_transformers = []
    if num_processor is not None:
        preprocessor_transformers.append(('num', num_processor, num_features))
    if text_processor is not None:
        preprocessor_transformers.append(('text', text_processor, 'review_stemmed_nostop'))

    # The preprocessing step is added only when there is something to apply;
    # otherwise the model receives the data unchanged.
    steps = []
    if preprocessor_transformers:
        steps.append(('preprocessor', ColumnTransformer(preprocessor_transformers)))
    steps.append(('classifier', model))
    pipeline = Pipeline(steps)

    # Train model - fit pipeline to training set
    pipeline.fit(_X_train, _y_train)

    # Predict and score on the splits passed in, not on the notebook globals
    y_pred = pipeline.predict(_X_test)
    train_score = pipeline.score(_X_train, _y_train)
    test_score = pipeline.score(_X_test, _y_test)
    cm = confusion_matrix(_y_test, y_pred)

    print(f'Train score: {train_score:.4f}')
    print(f'Test score: {test_score:.4f}\n')

    print("Classification Report:")
    print(classification_report(_y_test, y_pred))

    print("Confusion Matrix:")
    print(cm)

    # Draw the confusion matrix as a heatmap
    sns.heatmap(cm, annot=True, fmt='d')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

    return (test_score, train_score, y_pred)

In [None]:
## Evaluation

# Predict the test set with the most recently fitted `pipeline`
y_pred = pipeline.predict(X_test)

# Print the classification report for those predictions
print(classification_report(y_true=y_test, y_pred=y_pred))


In [None]:
## Clipping (scratch snippet), wrapped in a function so the cell can run:
## the original body was indented at top level and used names that only
## exist inside a function.
def build_pipeline(model, num_features, text_processor=None, num_processor=None):
    """Build a Pipeline for ``model`` with an optional column-wise preprocessing step.

    Parameters
    ----------
    model : estimator
        Estimator used as the final 'classifier' step.
    num_features : list of str
        Columns passed to ``num_processor``.
    text_processor : transformer, optional
        Applied to the 'review_stemmed_nostop' column when given.
    num_processor : transformer, optional
        Applied to ``num_features`` when given.

    Returns
    -------
    Pipeline
        ``preprocessor -> classifier`` when at least one processor is given,
        otherwise a pipeline holding only the classifier.
    """
    preprocessor_transformers = []
    if num_processor is not None:
        preprocessor_transformers.append(('num', num_processor, num_features))
    if text_processor is not None:
        preprocessor_transformers.append(('text', text_processor, 'review_stemmed_nostop'))

    if preprocessor_transformers:
        # Define the pipeline with the desired preprocessing steps and model
        return Pipeline([
            ('preprocessor', ColumnTransformer(preprocessor_transformers)),
            ('classifier', model),
        ])

    # Nothing to preprocess: the model is the only step
    return Pipeline([
        ('classifier', model),
    ])

#### Hyperparameter tuning

In [None]:
# Grid search over the TF-IDF n-gram range and the DummyClassifier strategy,
# fitted on the original (imbalanced) training set.

# Cross-validation settings: 5 folds, F1 as the selection metric
cv = 5
scoring = 'f1'

# Estimator handle (the pipeline below creates its own DummyClassifier instance)
models = DummyClassifier()

# TF-IDF representation of the stemmed review text
preprocessor = ColumnTransformer(
    transformers=[('text_tfidf', TfidfVectorizer(), 'review_stemmed_nostop')]
)

# Preprocessing followed by the dummy estimator
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DummyClassifier()),
])

# Hyperparameters to test: n-gram ranges (1,2) .. (1,5) and every dummy strategy
params = dict(
    preprocessor__text_tfidf__ngram_range=[(1, upper) for upper in range(2, 6)],
    classifier__strategy=['stratified', 'most_frequent', 'prior', 'uniform'],
)

# Exhaustive search with cross-validation, using all available cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning configuration
print("Dummy Classifier:")
print("Best hyperparameters: ", grid_search.best_params_)

In [None]:
# Score the refitted best estimator on the test and training data.
# The grid search above was configured with scoring='f1', so these are F1 scores.
test_score = grid_search.score(X_test, y_test)
train_score = grid_search.score(X_train, y_train)

# Print both scores
print(f'Train score: {train_score:.4f}\nTest score: {test_score:.4f}')


In [None]:
# Confusion matrix and classification report for the tuned model on the test set
y_pred = grid_search.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

## Hyperparameter tuning

### Balanced dataset

In [None]:
## GridSearchCV for hyperparameter tuning
# Grid search over the TF-IDF n-gram range and the DummyClassifier strategy,
# fitted on the balanced training set.

# Cross-validation settings: 5 folds, F1 as the selection metric
cv = 5
scoring = 'f1'

# Estimator handle (the pipeline below creates its own DummyClassifier instance)
models = DummyClassifier()

# TF-IDF representation of the stemmed review text
preprocessor = ColumnTransformer(
    transformers=[('text_tfidf', TfidfVectorizer(), 'review_stemmed_nostop')]
)

# Preprocessing followed by the dummy estimator
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DummyClassifier()),
])

# Hyperparameters to test: n-gram ranges (1,2) .. (1,5) and every dummy strategy
params = dict(
    preprocessor__text_tfidf__ngram_range=[(1, upper) for upper in range(2, 6)],
    classifier__strategy=['stratified', 'most_frequent', 'prior', 'uniform'],
)

# Exhaustive search with cross-validation, using all available cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
)
grid_search.fit(X_train_balanced, y_train_balanced)

# Report the winning configuration
print("Dummy Classifier:")
print("Best hyperparameters: ", grid_search.best_params_)

In [None]:
## Model evaluation
# Score the refitted best estimator on the test data and the balanced training data.
# The grid search above was configured with scoring='f1', so these are F1 scores.
test_score = grid_search.score(X_test, y_test)
train_score = grid_search.score(X_train_balanced, y_train_balanced)

# Print both scores
print(f'Train score: {train_score:.4f}\nTest score: {test_score:.4f}')

In [None]:
# Confusion matrix and classification report for the tuned model on the test set
y_pred = grid_search.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Split the balanced validation set into the per-input arrays via prepare_data
# NOTE(review): model_bert is built in a later cell of this notebook, so this
# cell only runs after that one has been executed.
(
    val_numerical_features_array,
    val_categorical_features_dict,
    val_text_features_array,
    val_labels_array,
) = prepare_data(X_val_bert_balanced, y_val_bert_balanced)

# Inputs are keyed by name: 'numerical', 'drug_input', 'condition_input', 'text'
val_inputs = {
    'numerical': val_numerical_features_array,
    'drug_input': val_categorical_features_dict['drugName'],
    'condition_input': val_categorical_features_dict['condition'],
    'text': val_text_features_array,
}

# Evaluate the final model on the validation set
model_bert.evaluate(val_inputs, val_labels_array)

In [None]:
# Evaluate the model on the test data
# NOTE(review): earlier cells bind `model` to a DummyClassifier, which has no
# `evaluate` method — confirm which model object is meant here.
test_inputs = [
    test_numerical_features,
    test_categorical_features['condition'],
    test_categorical_features['drugName'],
    test_text_features,
]
test_loss, test_accuracy = model.evaluate(x=test_inputs, y=y_test)

## BERT

In [None]:
# import tensorflow as tf
# from tensorflow.keras.layers import Input, Dense, concatenate

# # Define input layers
# text_input = Input(shape=(None,), dtype=tf.string, name='text_input')
# numeric_input = Input(shape=(6,), name='numeric_input') # assuming 4 numeric features

# # Define embedding layer for text input
# embedding_layer = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length)(text_input)

# # Define dense layer for numerical input
# dense_layer = Dense(8, activation='relu')(numeric_input)

# # Concatenate the two layers
# concat_layer = concatenate([embedding_layer, dense_layer])

# # Define output layer
# output_layer = Dense(1, activation='sigmoid')(concat_layer)

# # Define the model
# model = tf.keras.models.Model(inputs=[text_input, numeric_input], outputs=output_layer)



In [None]:
# ## chatGPT
# # Load BERT model and preprocessing module
# bert_model = hub.load('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3')
# bert_prep = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')

# # Load data
# df = pd.read_csv('reviews.csv')

# # Preprocess categorical features
# cat_cols = ['drugName', 'condition']
# for col in cat_cols:
#     df[col] = pd.Categorical(df[col])
#     df[col] = df[col].cat.codes

# # Preprocess numerical features
# num_cols = ['usefulCount', 'day', 'month', 'year']
# for col in num_cols:
#     df[col] = (df[col] - df[col].mean()) / df[col].std()

# # Concatenate features
# X = df[['drugName', 'condition', 'usefulCount', 'day', 'month', 'year', 'review_clean']]
# y = df['sentiment']

# X_cat = tf.keras.layers.Input(shape=(2,), dtype=tf.int32, name='categorical')
# X_num = tf.keras.layers.Input(shape=(4,), dtype=tf.float32, name='numerical')
# X_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

# # Preprocess input text using BERT preprocess layer
# text_prep = bert_prep(X_text)

# # Concatenate categorical, numerical, and text features
# X_cat_emb = tf.keras.layers.Embedding(input_dim=5000, output_dim=10)(X_cat)
# X_cat_flat = tf.keras.layers.Flatten()(X_cat_emb)
# X_num_dense = tf.keras.layers.Dense(10, activation='relu')(X_num)
# X_text_enc = bert_model(text_prep)['pooled_output']
# X_concat = tf.keras.layers.concatenate([X_cat_flat, X_num_dense, X_text_enc], axis=-1)

# # Define dropout and output layers for NN
# dropout_layer = tf.keras.layers.Dropout(0.1, name='dropout')(X_concat)
# output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(dropout_layer)

# # Combine all layers to create the final model
# model = tf.keras.Model(inputs=[X_cat, X_num, X_text], outputs=[output_layer])

# # Compile the model


In [None]:
# Keras input placeholders, one per feature group
# (feature/target selection kept for reference)
#X = df[['drugName', 'condition', 'usefulCount', 'day', 'month', 'year', 'review_clean']]
#y = df['sentiment']

# Categorical inputs: one string value per example
cat_input_condition = Input(name='condition_input', shape=(1,), dtype=tf.string)
cat_input_drug = Input(name='drug_input', shape=(1,), dtype=tf.string)

# Numerical input: four float features per example
num_input = tf.keras.layers.Input(name='numerical', shape=(4,), dtype=tf.float32)

# Text input: a scalar string per example
text_input = tf.keras.layers.Input(name='text', shape=(), dtype=tf.string)

In [None]:
# Preprocess categorical features: vocabulary -> categorical column -> embedding column

# Vocabularies are the distinct values found in df
# (previously named unique_conditions / unique_drugNames)
vocab_condition = df['condition'].unique().tolist()
vocab_drug = df['drugName'].unique().tolist()

# Categorical feature columns for 'condition' and 'drugName'
# (earlier variant used tf.feature_column.categorical_column_with_vocabulary_list
#  followed by tf.feature_column.indicator_column for one-hot encoding)
cat_condition = categorical_column_with_vocabulary_list('condition', vocab_condition)
cat_drug = categorical_column_with_vocabulary_list('drugName', vocab_drug)

# Embed each categorical column: 8 dimensions for condition, 16 for drug name
encoded_condition = embedding_column(categorical_column=cat_condition, dimension=8)
encoded_drug = embedding_column(categorical_column=cat_drug, dimension=16)

# Collect the embedded categorical columns
cat_cols = [encoded_condition, encoded_drug]


In [None]:
# Preprocess numerical features (StandardScaler)

# Columns to standardise
num_cols = ['usefulCount', 'day', 'month', 'year']

# StandardScaler object
scaler = StandardScaler()

# Fit on the numeric columns and write the standardised values back into df.
# NOTE(review): this overwrites df in place, so re-running the cell re-fits
# the scaler on values that are already standardised.
df[num_cols] = scaler.fit(df[num_cols]).transform(df[num_cols])
num_cols_scaled = df[num_cols]

# Alternative (disabled): scale inside the model with a Keras layer
#num_cols_scaled = tf.keras.layers.experimental.preprocessing.Normalization()(num_input)

# Factory for per-feature standardisation functions
def num_scale(feature):
    """Return a function that standardises a single value of ``feature``.

    A StandardScaler is fitted on the ``feature`` column of the global
    ``train`` frame; the returned function transforms one scalar with it.
    """
    fitted_scaler = StandardScaler().fit(train[[feature]])

    def standardize(x):
        # Wrap the scalar as a 1x1 sample, transform it, and unwrap the result
        scaled = fitted_scaler.transform([[x]])
        return scaled[0][0]

    return standardize

In [None]:
# Run the raw text input through the BERT preprocessing layer
# NOTE(review): bert_prep and bert_enc are not defined in any active cell shown
# here — confirm they are loaded before this cell runs.
text_prep = bert_prep(text_input)

# Encode the preprocessed text and keep the encoder's 'pooled_output' entry
bert_outputs = bert_enc(text_prep)
text_enc = bert_outputs['pooled_output']

In [None]:
# Assemble the multi-input model: BERT text encoding + numeric features +
# embedded categorical features -> single sigmoid output.

# Dropout on the pooled BERT output before it is merged with the other features
dropout_layer = tf.keras.layers.Dropout(0.1, name='dropout')(text_enc)

# Turn the embedding feature columns into one dense tensor. DenseFeatures looks
# its inputs up by each column's key, so the dict keys must be the column keys
# ('condition', 'drugName'), not the Keras input names.
cat_features = tf.keras.layers.DenseFeatures(cat_cols, name='categorical_features')({
    'condition': cat_input_condition,
    'drugName': cat_input_drug,
})

# Concatenate text, numerical and categorical features. Only tensors derived
# from the model's Input layers can be merged: the scaled DataFrame and the raw
# feature-column objects are not Keras tensors.
concat_layer = Concatenate()([dropout_layer, num_input, cat_features])

# Define output layer for NN
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(concat_layer)

# Combine all layers to create the final model
model_bert = tf.keras.Model(inputs=[text_input, num_input, cat_input_condition, cat_input_drug], outputs=[output_layer])