In [10]:
import numpy as np
import pandas as pd
import random

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, Lasso
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

from lime.lime_text import LimeTextExplainer

import time


## Binary Classification

Attempting GridSearchCV for hyperparameter selection. The penalty is elasticnet, and saga is the solver. We used a max iteration count of 2000 to ensure convergence due to the large dataset size.

In [16]:
num_splits = 5
cv_k = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

param_grid = {
    'C': [1, 3, 5, 10],
    'penalty': ['elasticnet'],
    'solver': ['saga'],
    'l1_ratio': [0.1, 0.5],
    'max_iter': [2000],
}

for i in range(num_splits):
    start_time = time.time()
    train_file_path = f'./F24_Proj3_data/split_{2}/train.csv'
    test_file_path = f'./F24_Proj3_data/split_{2}/test.csv'
    test_y_file_path = f'./F24_Proj3_data/split_{2}/test_y.csv'

    # Load data
    X_train = pd.read_csv(train_file_path).iloc[:, 3:]
    y_train = pd.read_csv(train_file_path).iloc[:, 1]

    X_test = pd.read_csv(test_file_path).iloc[:, 2:]
    y_test = pd.read_csv(test_y_file_path).iloc[:, 1]

    log_reg = LogisticRegression()
    
    grid_search = GridSearchCV(estimator=log_reg, 
                               param_grid = param_grid,
                               cv=cv_k,
                               n_jobs=-1,
                               verbose=1,
                               scoring='accuracy')
    
    grid_search.fit(X_train, y_train)

    print("\nFold ", i+1) 
    # Access the best parameters and best estimator
    print("Best hyperparameters:", grid_search.best_params_)
    print("Best cross-validation score:", grid_search.best_score_)
    
    # Get the best model
    best_model = grid_search.best_estimator_
    
    # Make predictions with the best model
    y_pred = best_model.predict(X_test)

    
    # Evaluate the model on the test set
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test set accuracy: {accuracy}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits

Fold  0
Best hyperparameters: {'C': 10, 'l1_ratio': 0.1, 'max_iter': 2000, 'penalty': 'elasticnet', 'solver': 'saga'}
Best cross-validation score: 0.94784
Test set accuracy: 0.94772
Fitting 5 folds for each of 8 candidates, totalling 40 fits

Fold  1
Best hyperparameters: {'C': 10, 'l1_ratio': 0.1, 'max_iter': 2000, 'penalty': 'elasticnet', 'solver': 'saga'}
Best cross-validation score: 0.94784
Test set accuracy: 0.94772
Fitting 5 folds for each of 8 candidates, totalling 40 fits

Fold  2
Best hyperparameters: {'C': 10, 'l1_ratio': 0.1, 'max_iter': 2000, 'penalty': 'elasticnet', 'solver': 'saga'}
Best cross-validation score: 0.94784
Test set accuracy: 0.94772
Fitting 5 folds for each of 8 candidates, totalling 40 fits

Fold  3
Best hyperparameters: {'C': 10, 'l1_ratio': 0.1, 'max_iter': 2000, 'penalty': 'elasticnet', 'solver': 'saga'}
Best cross-validation score: 0.94784
Test set accuracy: 0.94772
Fitting 5 folds for each of 

Now we'll train the logistic regression model using the best C and l1 ratio we found earlier. C = 10, l1 = 0.1.

In [17]:
num_splits = 5

log_reg = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.1,  
    C=10,
    max_iter=1000,
    n_jobs=-1,
    random_state=42
)

for i in range(num_splits):
    start_time = time.time()
    train_file_path = f'./F24_Proj3_data/split_{i+1}/train.csv'
    test_file_path = f'./F24_Proj3_data/split_{i+1}/test.csv'
    test_y_file_path = f'./F24_Proj3_data/split_{i+1}/test_y.csv'

    # Load data
    X_train = pd.read_csv(train_file_path).iloc[:, 3:]
    y_train = pd.read_csv(train_file_path).iloc[:, 1]

    X_test = pd.read_csv(test_file_path).iloc[:, 2:]
    y_test = pd.read_csv(test_y_file_path).iloc[:, 1]

    log_reg.fit(X_train, y_train)

    y_pred_proba = log_reg.predict_proba(X_test)[:, 1]

    auc_score = roc_auc_score(y_test, y_pred_proba)
    print(f'Split {i+1}: AUC Score for LogisticRegression: {auc_score:.7f}')
    print(f'| Execution time: {round(time.time() - start_time, 4)} seconds')

Split 1: AUC Score for LogisticRegression: 0.9870942
| Execution time: 30.1842 seconds
Split 2: AUC Score for LogisticRegression: 0.9867907
| Execution time: 30.1352 seconds
Split 3: AUC Score for LogisticRegression: 0.9864187
| Execution time: 30.8156 seconds
Split 4: AUC Score for LogisticRegression: 0.9869783
| Execution time: 31.2468 seconds
Split 5: AUC Score for LogisticRegression: 0.9862662
| Execution time: 31.1201 seconds


## Interpretability Analysis

Using split 1 and the corresponding trained model, implement an interpretability approach to identify which parts of each review have an impact on the sentiment prediction. Apply your method to 5 randomly selected positive reviews and 5 randomly selected negative reviews from the split 1 test data.

In [18]:
train_file_path = f'./F24_Proj3_data/split_1/train.csv'
test_file_path = f'./F24_Proj3_data/split_1/test.csv'
test_y_file_path = f'./F24_Proj3_data/split_1/test_y.csv'

train = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)
test_y = pd.read_csv(test_y_file_path)

stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
             'you', 'your', 'yours', 'their', 'they', 'his', 'her', 'she',
             'he', 'a', 'an', 'and', 'is', 'was', 'are', 'were', 'him',
             'himself', 'has', 'have', 'it', 'its', 'the', 'us']

We are doing some preprocessing:
- Replacing HTML tags from reviews with a space character
- Removing stop words
- Convert to lowercase
- remove rarely used words
Note: The token pattern below treats words separated by apostrophes as a single token rather two splitting it into two tokens. 

In [19]:
train['review'] = train['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)
test['review'] = test['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

vectorizer = CountVectorizer(
    preprocessor=lambda x: x.lower(),  # Convert to lowercase
    stop_words=stop_words,             # Remove stop words
    ngram_range=(1, 4),               # Use 1- to 4-grams
    min_df=0.001,                        # Minimum term frequency
    max_df=0.5,                       # Maximum document frequency
    token_pattern=r"\b[\w+\|']+\b" # Use word tokenizer
)

dtm_train = vectorizer.fit_transform(train['review'])
# dtm_test = vectorizer.fit_transform(test['review'])

# log_reg.fit(dtm_train, train['sentiment'])

# Make predictions on the test data
# preds = log_reg.predict_proba(dtm_test)[:, 1]  # Get the probabilities for class 1

# print(preds)

In [None]:
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(dtm_train)

lasso_model = Lasso(alpha=0.00623)

lasso_model.fit(X_train_scaled, train['sentiment'])

# Get the coefficients
lasso_coefs = lasso_model.coef_
print('Lasso Coefficients:', lasso_coefs.sum())

# Create a DataFrame with feature names and their coefficients
feature_coef_df = pd.DataFrame(
    {'Feature': np.array(vectorizer.get_feature_names_out()),
     'Coefficient': lasso_coefs})

selected_features = feature_coef_df[feature_coef_df['Coefficient'] != 0]
vocabulary = list(selected_features['Feature'].values)

with open('./myvocab.txt', 'w') as file:
    # Iterate through the list and write each word to a new line
    for word in vocabulary:
        file.write(word + "\n")

len(selected_features)

Setting a seed for consistency in random selection.

In [None]:
random.seed(42)

In [None]:
myvocab = open("myvocab.txt", "r").read().splitlines()
tfidf_vectorizer = TfidfVectorizer(vocabulary=myvocab, ngram_range=(1, 4))
dtm_test = tfidf_vectorizer.fit_transform(dtm_train['review'])
# log_reg.fit(dtm_test, test_y['sentiment'])
# print(myvocab)
# print(test_y)
# print(dtm_test)

# Need to set a seed before selecting samples

positive_reviews = test[test_y['sentiment'] == 1].sample(5, random_state=42)
negative_reviews = test[test_y['sentiment'] == 0].sample(5, random_state=42)
selected_reviews = pd.concat([positive_reviews, negative_reviews])

explainer = LimeTextExplainer(class_names=['Negative', 'Positive'])

def explain_review(review_text):
    print(review_text)
    explanation = explainer.explain_instance(review_text, log_reg.predict_proba, num_features=10)
    
    explanation.show_in_notebook(text=True)

for i, row in selected_reviews.iterrows():
    explain_review(row['review'])
    
