## Text Classification for IMDB Movie Reviews
### Uses XGBoost on TF-IDF vector of the data
#### Steps:
1. Load Train and Test Data
2. Create TF-IDF Matrices. Includes fitting a count vectorizer and a TF-IDF transformer
3. Fit XGB Classifier
4. Evaluate Model Performance
5. Log Model Outcome

### Load Data

In [None]:
from datasets import load_dataset

# Load the "train" and "test" splits of the "imdb" dataset, in that order
imdb_train, imdb_test = (
    load_dataset("imdb", split=split_name) for split_name in ("train", "test")
)

#### Create TF-IDF Train and Test Values

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# Token-count vectorizer and the TF-IDF re-weighting applied on top of its counts
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

# Learn the vocabulary from the training text and build the training counts in
# a single pass (fit followed by transform would tokenize the corpus twice)
X_train_counts = count_vect.fit_transform(imdb_train['text'])

# The test text is only transformed, so it reuses the training vocabulary
X_test_counts = count_vect.transform(imdb_test['text'])

# Fit the IDF weights on the training counts only, then apply them to both sets
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

#### Fit XGB on TF-IDF

In [None]:
from xgboost import XGBClassifier

# Hyperparameters for the classifier, gathered in one place
xgb_params = {
    'n_estimators': 5,
    'max_depth': 8,
    'learning_rate': 1,
    'objective': 'binary:logistic',
}
bst = XGBClassifier(**xgb_params)

# Train on the TF-IDF training matrix against the training labels
bst.fit(X_train_tfidf, imdb_train['label'])

#### Compute Test Predictions

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
# Cutoff applied to the predicted probabilities to obtain hard predictions
threshold = 0.5
# Keep column 1 of predict_proba (the class-1 probability) for every test row
y_test_prob = list(bst.predict_proba(X_test_tfidf)[:, 1])
# A row is predicted as class 1 when its probability reaches the cutoff
y_test_pred = [prob >= threshold for prob in y_test_prob]

#### Evaluate Model Performance

In [None]:
def get_sklearn_metrics(y_test, predictions, probabilities):
    """Compute, print and return classification metrics.

    Args:
        y_test: True labels.
        predictions: Hard class predictions, scored with accuracy, F1,
            precision and recall.
        probabilities: Scores passed to ROC AUC.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision', 'recall' and
        'roc_auc'.
    """
    metrics = {
        'accuracy': accuracy_score(y_test, predictions),
        'f1': f1_score(y_test, predictions),
        'precision': precision_score(y_test, predictions),
        'recall': recall_score(y_test, predictions),
        'roc_auc': roc_auc_score(y_test, probabilities),
    }
    print(metrics)
    return metrics
# Score the test-set predictions and keep the metrics for logging
model_performance = get_sklearn_metrics(
    y_test=imdb_test['label'],
    predictions=y_test_pred,
    probabilities=y_test_prob,
)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay
# Plot the ROC curve from the test labels and the predicted class-1 probabilities
RocCurveDisplay.from_predictions(imdb_test['label'], y_test_prob)

# Render the figure
plt.show()

### Log Model Outcome

In [None]:
from datetime import datetime
def _is_xgb_classifier(model) -> bool:
    """Return True if *model* is an xgboost XGBClassifier or a subclass of it.

    The check compares fully qualified class names along the MRO, so it does
    not need xgboost to be importable here.
    """
    target = 'xgboost.sklearn.XGBClassifier'
    return any(
        f'{cls.__module__}.{cls.__qualname__}' == target
        for cls in type(model).__mro__
    )


def log_model(model, user: str, model_type: str, data_transform: str, model_params: dict, model_perf: dict, y_test_prob, y_test_pred):
    """Save the model file (XGBoost classifiers only) and build its log record.

    Args:
        model: Fitted model; only XGBClassifier instances are written to disk.
        user: Name of the person logging the model, used in the file name.
        model_type: Short model label, used in the file name and the record.
        data_transform: Label of the feature transformation that was applied.
        model_params: Model hyperparameters.
        model_perf: Model performance metrics.
        y_test_prob: Predicted probabilities (not persisted yet, see below).
        y_test_pred: Predicted labels (not persisted yet, see below).

    Returns:
        dict with the keys 'model_type', 'data_transform', 'model_params'
        and 'model_perf'.
    """
    # Save model file. Comparing type(model) to a string is always False, so
    # the class is identified by its qualified name instead.
    if _is_xgb_classifier(model):
        # Timestamp without spaces or colons so the name is valid on every OS
        stamp = datetime.now().strftime('%Y%m%dT%H%M%S_%f')
        model.save_model(f'{model_type}__{user}_{stamp}.json')

    # Create model dictionary
    model_record = {
        'model_type': model_type,
        'data_transform': data_transform,
        'model_params': model_params,
        'model_perf': model_perf,
    }

    # Append to model logs from somewhere

    # Save model predictions somewhere

    # Hand the record back so it is not lost while persistence is unimplemented
    return model_record
    
    
# Log the fitted classifier together with its parameters, metrics and predictions
log_model(
    model=bst,
    user='user1',
    model_type='xgboost',
    data_transform='tfidf',
    model_params=bst.get_params(),
    model_perf=model_performance,
    y_test_prob=y_test_prob,
    y_test_pred=y_test_pred,
)

### Analyze Feature Importance

In [None]:
# Feature indices ordered by ascending importance (argsort sorts low to high),
# so the most important features sit at the end of the array
sorted_idx = bst.feature_importances_.argsort()
# Display the index array
sorted_idx

In [None]:
# Display the feature names of the fitted count vectorizer
count_vect.get_feature_names_out()

In [None]:
# sorted_idx is in ascending order of importance, so the 25 most important
# features are the last 25 entries (the first 25 are the least important)
top_idx = sorted_idx[-25:]
plt.barh(count_vect.get_feature_names_out()[top_idx], bst.feature_importances_[top_idx])

#### Use Shapley Values to Estimate Feature Importance

In [None]:
import shap

In [None]:
# Build a SHAP tree explainer around the fitted XGBoost model
explainer = shap.TreeExplainer(bst)
# Compute SHAP values for the TF-IDF test matrix
shap_values = explainer.shap_values(X_test_tfidf)

In [None]:
# Plot against the same matrix the SHAP values were computed on, and pass the
# vocabulary so bars are labelled with words instead of generic feature indices
shap.summary_plot(
    shap_values,
    X_test_tfidf,
    feature_names=count_vect.get_feature_names_out(),
    plot_type="bar",
)