# Modeling

## Test function for automation

In [None]:
# Manual grid search over a Dummy baseline: every parameter combination is
# cross-validated, the best one is refit on the training split and then scored
# on both the training and the test split.

# Column groups. NOTE: `text_featues` (sic) is not referenced in this cell; the
# vectorizers below receive the column name as a plain string, which is how
# ColumnTransformer hands a 1-D column of documents to a text vectorizer.
text_featues = ['review_stemmed_nostop']
num_features = ['drugName_encoded','condition_encoded', 'usefulCount','day', 'month', 'year',
                   'count_word', 'count_unique_word', 'count_letters',
                   'count_punctuations', 'count_words_upper', 'count_words_title',
                   'count_stopwords', 'mean_word_len']

# Scale the numeric columns and vectorize the review text twice
# (raw counts and TF-IDF), each as its own named transformer
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('text_cv', CountVectorizer(), 'review_stemmed_nostop'),
    ('text_tfidf', TfidfVectorizer(), 'review_stemmed_nostop')
])

# Preprocessing followed by the baseline classifier
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DummyClassifier(strategy='stratified'))
])

# Hyperparameters to test, addressed with the <step>__<transformer>__<param> syntax
params = {
    'preprocessor__text_cv__ngram_range': [(1,2), (1,3), (1,4), (1,5)],
    'preprocessor__text_tfidf__ngram_range': [(1,2), (1,3), (1,4), (1,5)],
    'classifier__strategy': ['stratified', 'most_frequent']
}

# Cross-validation settings
cv = 5
scoring = 'f1'

# Cross-validate every combination and record (mean score, parameters)
results = []
for param_set in ParameterGrid(params):
    pipe.set_params(**param_set)
    score = np.mean(cross_val_score(pipe, X_train, y_train, cv=cv, scoring=scoring))
    results.append((score, param_set))

# Rank on the score alone. Sorting the raw (score, dict) tuples falls back to
# comparing the parameter dicts whenever two scores tie, and dicts do not
# support "<", so a tie would raise TypeError. Ties are expected here because
# the 'most_frequent' strategy scores the same for every n-gram setting.
results.sort(key=lambda result: result[0], reverse=True)

# Refit with the best parameter set and print train and test F1
best_params = results[0][1]
pipe.set_params(**best_params)
pipe.fit(X_train, y_train)
train_score = f1_score(y_train, pipe.predict(X_train))
test_score = f1_score(y_test, pipe.predict(X_test))
print(f'Train score: {train_score:.4f}')
print(f'Test score: {test_score:.4f}')


In [None]:
# Defining a function that evaluates a given model on a dataset and returns the test and train accuracies
# GridSearchCV is implemented to find the optimal hyperparameters

def eval_model(model, name, param_grid, scaler = None, _X_train = X_train, _X_test = X_test, _y_train = y_train, _y_test = y_test):
    """Tune `model` with GridSearchCV and report its test/train scores.

    The model is wrapped in a pipeline that scales the numeric columns and
    TF-IDF-vectorizes the 'review_stemmed_nostop' column.

    Parameters:
        model: an estimator, or a ready-made ``(step_name, estimator)`` pair.
            A bare estimator is registered under the step name 'classifier'.
        name: label printed in front of the results.
        param_grid: dict (or list of dicts) for GridSearchCV. Keys that are
            already valid pipeline parameters (e.g.
            'preprocessor__text_tfidf__ngram_range') are used as-is; any other
            key (e.g. 'C') is treated as a parameter of the model and is
            prefixed with its step name. A key that is also a parameter of the
            Pipeline itself (such as 'verbose') is therefore not prefixed.
        scaler: optional transformer used for the numeric columns instead of
            the default StandardScaler. It is fitted inside the pipeline, so
            it only ever sees the numeric columns of the training folds.
        _X_train, _X_test, _y_train, _y_test: data splits; the defaults are
            the module-level splits as they were when this function was
            defined.

    Returns:
        (test_score, train_score) as computed by ``GridSearchCV.score`` on the
        refit best estimator (no `scoring` is passed, so this is the
        estimator's default score).
    """
    # Define column transformer to handle different types of features
    num_features = ['drugName_encoded','condition_encoded', 'usefulCount','day', 'month', 'year',
                    'count_word', 'count_unique_word', 'count_letters',
                    'count_punctuations', 'count_words_upper', 'count_words_title',
                    'count_stopwords', 'mean_word_len']
    text_feature = 'review_stemmed_nostop'

    # Pipeline steps must be (name, estimator) pairs; accept either form
    if isinstance(model, tuple):
        step_name, estimator = model
    else:
        step_name, estimator = 'classifier', model

    # A caller-supplied scaler replaces the default numeric scaler. Applying
    # it to the whole frame up front (text column included, and outside the
    # cross-validation folds) would break column selection and leak data.
    numeric_transformer = StandardScaler() if scaler is None else scaler

    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, num_features),
        ('text_tfidf', TfidfVectorizer(), text_feature),
    ])

    # Define the pipeline with the desired preprocessing steps and model
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        (step_name, estimator),
    ])

    # GridSearchCV addresses nested parameters as '<step>__<param>', so
    # qualify every key that is not already a known pipeline parameter
    known_params = pipeline.get_params()

    def _qualify(grid):
        return {
            key if key in known_params else f'{step_name}__{key}': values
            for key, values in grid.items()
        }

    if isinstance(param_grid, dict):
        search_grid = _qualify(param_grid)
    else:
        search_grid = [_qualify(grid) for grid in param_grid]

    # Use GridSearchCV to find the best hyperparameters for the model
    grid_search = GridSearchCV(pipeline, param_grid=search_grid, cv=5, n_jobs=-1)
    grid_search.fit(_X_train, _y_train)

    # Print the name of the model and the best hyperparameters
    print(name, ":")
    print("Best hyperparameters: ", grid_search.best_params_)

    # Evaluate the model on the test and training data using the best hyperparameters
    test_accuracy = grid_search.score(_X_test, _y_test)
    train_accuracy = grid_search.score(_X_train, _y_train)

    # Print the accuracies
    print(f'Training Accuracy: {train_accuracy}')
    print(f'Test Accuracy: {test_accuracy}\n')

    # Return the test and train accuracies
    return (test_accuracy, train_accuracy)

In [None]:


# Define the models to evaluate
# NOTE(review): MultinomialNB rejects negative feature values, and eval_model
# standardizes the numeric columns, so the "nb" run will likely error out.
# Confirm, and consider a non-negative scaler for that model.
models = {
    "lr": LogisticRegression(max_iter=10_000),
    "nb": MultinomialNB(),
    "rfc": RandomForestClassifier()#,
    #"MLP": MLPClassifier(max_iter=10_000)
}

# Define the hyperparameter grids for each model. The first entry is not a
# model: it holds the TF-IDF settings that are searched alongside every model.
param_grids = {
    "preprocessor__text_tfidf": { "ngram_range": [(1,2), (1,3), (1,4), (1,5)]},
    "lr": {"C": [0.01, 0.1, 1, 10]},
    "nb": {"alpha": [0.01, 0.1, 1.0]},
    "rfc": {"n_estimators": [10, 50, 100]}#,
   # "MLP": {"hidden_layer_sizes": [(10,), (20,), (30,)]}
}

# Flatten the TF-IDF entry into fully qualified pipeline parameter names
tfidf_grid = {
    f"preprocessor__text_tfidf__{param}": values
    for param, values in param_grids["preprocessor__text_tfidf"].items()
}

# Evaluate each model using eval_model()
test_accs = []
train_accs = []

for name, model in models.items():
    # Each model goes in as a named pipeline step, with its own parameters
    # qualified by that step name, so GridSearchCV can address them.
    search_grid = {f"{name}__{param}": values for param, values in param_grids[name].items()}
    search_grid.update(tfidf_grid)

    # eval_model takes no feature-list arguments (it defines the columns
    # itself), and positional arguments may not follow keyword arguments.
    test_acc, train_acc = eval_model(
        (name, model),
        name,
        param_grid=search_grid,
        _X_train=X_train,
        _X_test=X_test,
        _y_train=y_train,
        _y_test=y_test,
    )
    test_accs.append(test_acc)
    train_accs.append(train_acc)


In [None]:
# Grouped bar chart: one train bar and one test bar per evaluated model
labels = [*models]
width = 0.35
x = np.arange(len(labels))

fig, ax = plt.subplots()

# Train bars sit half a bar-width left of each tick, test bars half a width right
rects1 = ax.bar(x - width/2, train_accs, width, label='Train Accuracy')
rects2 = ax.bar(x + width/2, test_accs, width, label='Test Accuracy')

# One tick per model, labelled with the model key
ax.set_xticks(x)
ax.set_xticklabels(labels)

# Axis label, title and legend
ax.set_title('Train and Test Accuracies for Different Models')
ax.set_ylabel('Accuracy')
ax.legend()

plt.show()

## Baseline - Dummy Classifier

### Balanced dataset

#### Pipeline for initial model

In [None]:
## Train model - Initial
# Baseline pipeline fitted as-is, before any hyperparameter tuning

# Count-vectorize the 'review_stemmed_nostop' column
# (the TF-IDF alternative is kept commented out)
preprocessor = ColumnTransformer(transformers=[
    ('text_cv', CountVectorizer(), 'review_stemmed_nostop'),
    #('text_tfidf', TfidfVectorizer(), 'review_stemmed_nostop')
])

# Baseline estimator with its default settings
model = DummyClassifier()

# Preprocessing followed by the classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# Train on the balanced training split
pipeline.fit(X_train_balanced, y_train_balanced)

# Score on the balanced training split and on the test split
train_score = pipeline.score(X_train_balanced, y_train_balanced)
test_score = pipeline.score(X_test, y_test)

# Report both scores with four decimals
for split, value in (('Train', train_score), ('Test', test_score)):
    print(f'{split} score: {value:.4f}')

In [None]:
## Evaluation
# Predict the test split with the fitted pipeline
y_pred = pipeline.predict(X_test)

# Build the per-class report, then print it
report = classification_report(y_test, y_pred)
print(report)


### Imbalanced dataset

#### Initial model

In [None]:
# Initial
# TF-IDF features from the 'review_stemmed_nostop' column
text_step = ('text_tfidf', TfidfVectorizer(), 'review_stemmed_nostop')
preprocessor = ColumnTransformer([text_step])

# Baseline estimator with its default settings
model = DummyClassifier()

# Chain preprocessing and classifier into one estimator
pipeline = Pipeline([('preprocessor', preprocessor), ('classifier', model)])

# Train on the original (imbalanced) training split
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the training and the test split
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

# Report both scores with four decimals
print(f'Train score: {train_score:.4f}')
print(f'Test score: {test_score:.4f}')

In [None]:
## Evaluation

# Test-split predictions from the fitted pipeline
y_pred = pipeline.predict(X_test)

# Per-class summary of those predictions against the true labels
print(classification_report(y_true=y_test, y_pred=y_pred))


#### Hyperparameter tuning

In [None]:
# Grid search for the Dummy baseline on the original (imbalanced) training split

# TF-IDF features from the 'review_stemmed_nostop' column
preprocessor = ColumnTransformer([
    ('text_tfidf', TfidfVectorizer(), 'review_stemmed_nostop')
    ])

# Define the model to tune. It is bound to `model` (not `models`) so the
# `models` dict that the comparison plot reads is not overwritten, and this
# same instance is the one placed in the pipeline.
model = DummyClassifier()

# Define the pipeline with the desired preprocessing steps and model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# Define hyperparameters to test
params = {
    'preprocessor__text_tfidf__ngram_range': [(1,2), (1,3), (1,4), (1,5)],
    'classifier__strategy': ['stratified', 'most_frequent', 'prior', 'uniform']
}

# Define cross-validation settings
cv = 5
scoring = 'f1'

# Use GridSearchCV to find the best hyperparameters for the model
grid_search = GridSearchCV(pipeline, param_grid=params, cv=cv, scoring=scoring, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the name of the model and the best hyperparameters
print("Dummy Classifier:")
print("Best hyperparameters: ", grid_search.best_params_)

In [None]:
# Score the tuned search object on both splits. The test split is scored
# first, then the training split.
test_score, train_score = (
    grid_search.score(features, target)
    for features, target in ((X_test, y_test), (X_train, y_train))
)

# Print both scores with four decimals
print(f'Train score: {train_score:.4f}')
print(f'Test score: {test_score:.4f}')


In [None]:
# Predict the test split with the tuned search object, then print the
# confusion matrix followed by the classification report
y_pred = grid_search.predict(X_test)
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, y_pred))

## Hyperparameter tuning

### Balanced dataset

In [None]:
## GridsearchCV for hyperparameter tuning
# Grid search for the Dummy baseline on the balanced training split

# TF-IDF features from the 'review_stemmed_nostop' column
preprocessor = ColumnTransformer([
    ('text_tfidf', TfidfVectorizer(), 'review_stemmed_nostop')
    ])

# Define the model to tune. It is bound to `model` (not `models`) so the
# `models` dict that the comparison plot reads is not overwritten, and this
# same instance is the one placed in the pipeline.
model = DummyClassifier()

# Define the pipeline with the desired preprocessing steps and model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# Define hyperparameters to test
params = {
    'preprocessor__text_tfidf__ngram_range': [(1,2), (1,3), (1,4), (1,5)],
    'classifier__strategy': ['stratified', 'most_frequent', 'prior', 'uniform']
}

# Define cross-validation settings
cv = 5
scoring = 'f1'

# Use GridSearchCV to find the best hyperparameters for the model
grid_search = GridSearchCV(pipeline, param_grid=params, cv=cv, scoring=scoring, n_jobs=-1)
grid_search.fit(X_train_balanced, y_train_balanced)

# Print the name of the model and the best hyperparameters
print("Dummy Classifier:")
print("Best hyperparameters: ", grid_search.best_params_)

In [None]:
## Model evaluation
# Score the tuned search object: test split first, then the balanced
# training split it was fitted on
scores = {}
scores['test'] = grid_search.score(X_test, y_test)
scores['train'] = grid_search.score(X_train_balanced, y_train_balanced)
test_score = scores['test']
train_score = scores['train']

# Print both scores with four decimals
print(f'Train score: {train_score:.4f}')
print(f'Test score: {test_score:.4f}')

In [None]:
# Confusion matrix and classification report for the test-split predictions
y_pred = grid_search.predict(X_test)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)
report = classification_report(y_test, y_pred)
print(report)