In [13]:
import numpy as np
import pandas as pd
import random

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, Lasso
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

from lime.lime_text import LimeTextExplainer

import time


## Binary Classification

Attempting GridSearchCV for hyperparameter selection. The penalty is elasticnet, and saga is the solver. We used a max iteration count of 2000 to ensure convergence due to the large dataset size.

In [16]:
num_splits = 5
cv_k = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

param_grid = {
    'C': [1, 3, 5, 10],
    'penalty': ['elasticnet'],
    'solver': ['saga'],
    'l1_ratio': [0.1, 0.5],
    'max_iter': [2000],
}

for i in range(num_splits):
    start_time = time.time()
    train_file_path = f'./F24_Proj3_data/split_{2}/train.csv'
    test_file_path = f'./F24_Proj3_data/split_{2}/test.csv'
    test_y_file_path = f'./F24_Proj3_data/split_{2}/test_y.csv'

    # Load data
    X_train = pd.read_csv(train_file_path).iloc[:, 3:]
    y_train = pd.read_csv(train_file_path).iloc[:, 1]

    X_test = pd.read_csv(test_file_path).iloc[:, 2:]
    y_test = pd.read_csv(test_y_file_path).iloc[:, 1]

    log_reg = LogisticRegression()
    
    grid_search = GridSearchCV(estimator=log_reg, 
                               param_grid = param_grid,
                               cv=cv_k,
                               n_jobs=-1,
                               verbose=1,
                               scoring='accuracy')
    
    grid_search.fit(X_train, y_train)

    print("\nFold ", i+1) 
    # Access the best parameters and best estimator
    print("Best hyperparameters:", grid_search.best_params_)
    print("Best cross-validation score:", grid_search.best_score_)
    
    # Get the best model
    best_model = grid_search.best_estimator_
    
    # Make predictions with the best model
    y_pred = best_model.predict(X_test)

    
    # Evaluate the model on the test set
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test set accuracy: {accuracy}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits

Fold  0
Best hyperparameters: {'C': 10, 'l1_ratio': 0.1, 'max_iter': 2000, 'penalty': 'elasticnet', 'solver': 'saga'}
Best cross-validation score: 0.94784
Test set accuracy: 0.94772
Fitting 5 folds for each of 8 candidates, totalling 40 fits

Fold  1
Best hyperparameters: {'C': 10, 'l1_ratio': 0.1, 'max_iter': 2000, 'penalty': 'elasticnet', 'solver': 'saga'}
Best cross-validation score: 0.94784
Test set accuracy: 0.94772
Fitting 5 folds for each of 8 candidates, totalling 40 fits

Fold  2
Best hyperparameters: {'C': 10, 'l1_ratio': 0.1, 'max_iter': 2000, 'penalty': 'elasticnet', 'solver': 'saga'}
Best cross-validation score: 0.94784
Test set accuracy: 0.94772
Fitting 5 folds for each of 8 candidates, totalling 40 fits

Fold  3
Best hyperparameters: {'C': 10, 'l1_ratio': 0.1, 'max_iter': 2000, 'penalty': 'elasticnet', 'solver': 'saga'}
Best cross-validation score: 0.94784
Test set accuracy: 0.94772
Fitting 5 folds for each of 

Now we'll train the logistic regression model using the best C and l1 ratio we found earlier. C = 10, l1 = 0.1.

In [None]:
num_splits = 5

log_reg = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.1,  
    C=10,
    max_iter=1000,
    n_jobs=-1,
    random_state=42
)

for i in range(num_splits):
    start_time = time.time()
    train_file_path = f'./F24_Proj3_data/split_{i+1}/train.csv'
    test_file_path = f'./F24_Proj3_data/split_{i+1}/test.csv'
    test_y_file_path = f'./F24_Proj3_data/split_{i+1}/test_y.csv'

    # Load data
    X_train = pd.read_csv(train_file_path).iloc[:, 3:]
    y_train = pd.read_csv(train_file_path).iloc[:, 1]

    X_test = pd.read_csv(test_file_path).iloc[:, 2:]
    y_test = pd.read_csv(test_y_file_path).iloc[:, 1]

    log_reg.fit(X_train, y_train)

    y_pred_proba = log_reg.predict_proba(X_test)[:, 1]

    auc_score = roc_auc_score(y_test, y_pred_proba)
    print(f'Split {i+1}: AUC Score for LogisticRegression: {auc_score:.7f}')
    print(f'| Execution time: {round(time.time() - start_time, 4)} seconds')

Split 1: AUC Score for LogisticRegression: 0.9870942
| Execution time: 29.1046 seconds
Split 2: AUC Score for LogisticRegression: 0.9867907
| Execution time: 29.5183 seconds
Split 3: AUC Score for LogisticRegression: 0.9864187
| Execution time: 29.9895 seconds
Split 4: AUC Score for LogisticRegression: 0.9869783
| Execution time: 29.8482 seconds


In [12]:
import joblib

In [None]:
joblib.dump(model, 'logistic_model.pkl')

## Interpretability Analysis

Using split 1 and the corresponding trained model, implement an interpretability approach to identify which parts of each review have an impact on the sentiment prediction. Apply your method to 5 randomly selected positive reviews and 5 randomly selected negative reviews from the split 1 test data.

We will use the bag of words approach, which disregards word order and context, representing each review by presence or absence of words and their frequency.

In [8]:
train_file_path = f'./F24_Proj3_data/split_1/train.csv'
test_file_path = f'./F24_Proj3_data/split_1/test.csv'
test_y_file_path = f'./F24_Proj3_data/split_1/test_y.csv'

train = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)
test_y = pd.read_csv(test_y_file_path)

stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
             'you', 'your', 'yours', 'their', 'they', 'his', 'her', 'she',
             'he', 'a', 'an', 'and', 'is', 'was', 'are', 'were', 'him',
             'himself', 'has', 'have', 'it', 'its', 'the', 'us']

We are doing some preprocessing:
- Replacing HTML tags from reviews with a space character
- Removing stop words
- Convert to lowercase
- remove rarely used words
Note: The token pattern below treats words separated by apostrophes as a single token rather two splitting it into two tokens.

CountVectorizer converts the dataset into a matrix of token counts, which is the bag of words model. Each row is a review, and each column is a unique token in the entire dataset. The value represents the token count in the document.

In [22]:
train['review'] = train['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)
test['review'] = test['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

vectorizer = CountVectorizer(
    preprocessor=lambda x: x.lower(),  # Convert to lowercase
    stop_words=stop_words,             # Remove stop words
    ngram_range=(1, 4),               # Use 1- to 4-grams
    min_df=0.001,                        # Minimum term frequency
    max_df=0.5,                       # Maximum document frequency
    token_pattern=r"\b[\w+\|']+\b" # Use word tokenizer
)

dtm_train = vectorizer.fit_transform(train['review'])

For sentiment analysis, we will use LIME. 

In [34]:
random.seed(42)

positive_reviews = test[test_y['sentiment'] == 1].sample(5, random_state=42)
negative_reviews = test[test_y['sentiment'] == 0].sample(5, random_state=42)
selected_reviews = pd.concat([positive_reviews, negative_reviews])

In [38]:
explainer = LimeTextExplainer(class_names=['Negative', 'Positive'])

def explain_review(review_text):
    #print(review_text)
    explanation = explainer.explain_instance(review_text, log_reg.predict_proba)
    
    explanation.show_in_notebook(text=True)

for i, row in selected_reviews.iterrows():
    explain_review(row['review'])




ValueError: Expected 2D array, got 1D array instead:
array=['Well, I fear that my review of this special won\'t heed much different observation than the others before me, but I literally just watched it- during a PBS membership drive- and frankly I\'m too excited NOT to say anything. To really appreciate the enigma that is Barbra Streisand, you have to look back before the movies. Before the Broadway phenomenon of the mid-60\'s. When television was still a young medium, there was a form of entertainment very prominent on the air that is but a memory today: musical variety. Some musical shows were weekly series, but others were single, one-time specials, usually showcasing the special talent of the individual performer. This is where we get the raw, uninhibited first looks at Streisand. She had already been a guest performer on other variety shows including Garry Moore, Ed Sullivan, and scored a major coup in a one-time only tandem appearance with the woman who would pass her the baton of belter extraordinary: Judy Garland. In 1966, COLOR ME BARBRA introduced Barbra Streisand in color (hence the title), but copied the format of her first special a year earlier almost to the letter. In 3 distinct acts, we get an abstract Streisand (in an after-hours art museum looking at and sometimes becoming the works of art), a comic Streisand working an already adoring audience in a studio circus (populated with many fuzzy and furry animals), and best of all, a singing Streisand in mini-concert format just-- well, frankly, just doing it. <br /><br />It amazes me that she still had the film debut of FUNNY GIRL yet to come, as well as turns as songwriter, director, and political activist. Here, she is barely 24 years old, doing extraordinary things because, as she puts it in her own on-camera introduction, \'we didn\'t know we couldn\'t, so we did.\' The art museum sequence is shot in Philadelphia over one weekend immediately after the museum closed to the public on Saturday evening, and apparently done with only ONE color camera. Yet there are cuts, dissolves, and tracking shots galore, resulting in one rather spectacular peak moment-- the modern, slightly beatnik-flavored, \\Gotta Move.\\" After getting lost amongst the modern abstracts, jazz-club bongos begin, with Streisand emerging in a psychedelic gown and glittering eye makeup, doing the catchy staccato tune with almost androgynous sex appeal. It is not until Act 3, believe it or not, that the moment is matched or bettered by another feat: in the concert sequence, in a white gown and pearl earrings, Streisand recites the torchy \\"Any Place I Hang My Hat is Home,\\" tearing into the final notes and revealing one of those climactic belts that makes you scream like a little girl even if you\'re 44 years old...and a guy. Just plain old great television. Check it out."'
 ',    my     \'         , but     - during a PBS  -   \'   NOT   .     enigma    ,        .       -\'.     a young ,   a           but a  today:  .      , but   , - ,         performer.    we   ,     .     a  performer       Moore,  ,   a    a -    with     pass      :  .  ,         (  ), but        a      .    , we     (  -         works  ), a comic        a   ( with many   furry ),    , a    -  -- , ,   . < />< />             yet  , as  as  as , ,   . ,     years ,    , as  puts     - , \'we \' know we \',  we .\'                      ,    with    .    , ,   shots ,       --  ,  -, \\ .\\"       , -  begin, with    a      ,     tune with    .      ,    ,          :    ,  a white    ,  recites   \\"       ,\\" tearing      revealing           a little    \'  years ... a .     .   ."'
 'Well, I fear that  review  this special won\' heed  different observation  the others before me, but I literally just  it- during   membership drive- and frankly I\'m too excited  to say anything.  really appreciate the  that is Barbra Streisand,   to  back before the movies. Before the Broadway phenomenon  the mid-60\'s.    still  young medium,    form   very  on the air that is but  memory today: musical variety. Some musical   weekly series, but others  , one- specials, usually  the special   the individual performer. This is where we get the ,  first looks  Streisand. She  already been  guest performer on other variety  including Garry Moore, Ed Sullivan, and    coup   one- only tandem  with the woman who  pass her the baton   : Judy . In 1966, COLOR ME BARBRA introduced Barbra Streisand  color (hence the title), but copied the format  her first special  year earlier almost to the letter. In 3 distinct acts, we get an abstract Streisand ( an after-hours art  looking  and sometimes becoming the works  art),  comic Streisand  an already adoring audience   studio circus (populated with many fuzzy and furry animals), and   all,  singing Streisand  mini-concert format just-- well, frankly, just  it. <br /><br />It amazes me that  still  the film debut  FUNNY GIRL  to come, as well as turns as , director, and political activist. Here,  is barely 24 years old,   things because, as  puts it  her own on-camera introduction, \'we didn\' know we couldn\', so we did.\' The art  sequence is shot  Philadelphia over one  immediately after the  closed to the public on Saturday evening, and apparently done with only ONE color camera.   are , dissolves, and tracking  galore,   one  spectacular  moment-- the modern, slightly -flavored, \\Gotta Move.\\" After getting  amongst the modern abstracts, jazz- bongos begin, with Streisand     gown and glittering eye makeup,  the catchy staccato tune with almost  sex . It is not   3, believe it or not, that the moment is matched or bettered  another feat:  the concert sequence,   white gown and pearl earrings, Streisand recites the  \\"Any  I Hang My Hat is Home,\\"  into the final notes and  one  those climactic  that makes  scream like   girl even if \'re 44 years old...and  guy. Just plain old great . Check it ."'
 ...
 ', I   my  of  special won\' heed    than  others before ,  I literally   - during a PBS membership -   I\'m    to  .     enigma   Barbra , you  to look  before  . Before  Broadway  of  mid-\'. When   still a young ,   a  of      air    a memory : musical variety.  musical shows  weekly series,  others  single, -time specials,    special  of   . This   we get  , uninhibited first looks  . She  already  a    other variety shows including  ,  Sullivan,  scored a  coup in a -time      woman  would pass   baton of  extraordinary: Judy Garland. In , COLOR   introduced Barbra  in  (hence  ),     of  first special a year earlier almost to  letter. In   , we get an abstract  (in an - art museum looking   sometimes becoming  works of art), a    an already  audience in a  circus (populated     furry animals),  best of all, a singing  in mini-  -- , ,  doing . <br /><br /> amazes   she still   film debut of FUNNY  yet to come,    turns  songwriter, ,   activist. , she   24  , doing extraordinary  because,  she   in   -camera introduction, \'we didn\'  we \',  we did.\' The art museum    in Philadelphia       museum closed to  public   evening,  apparently done     camera. Yet  are cuts, dissolves,  tracking  ,  in  rather spectacular peak moment--  ,  -flavored, \\ Move.\\" After getting lost    , jazz-club bongos begin,    in a  gown   eye , doing   staccato   almost  sex appeal.   not until Act , believe  or not,   moment   or   another feat: in   , in a  gown   earrings,  recites   \\" Place I   Hat  Home,\\"    final notes  revealing  of  climactic   makes you   a   even  you\' 44  ... a guy.  plain  great .   out."'
 ',         \'         ,      -     -   \'      .         ,        .       -\'.      young ,                 :  .      ,    , -time ,         performer.       ,     .       performer       ,  Sullivan,        -time               :  .  ,         (  ),               .    ,      (  -           ),             (      ),    ,     -  -- , ,   . < />< />               ,      , ,   . ,      ,    ,        - , \' \'   \',   .\'                      ,        .    , ,    ,       --  ,  -, \\ .\\"       , -  ,           ,          .      ,    ,          :    ,       ,     \\"       ,\\"       revealing                \'   ...  .     .   ."'
 'Well, I  that my review of  special \'t heed    than the others before me, but I literally just watched - during a PBS membership drive-   I\' too   to  . To really appreciate the enigma that is  Streisand,  have to look  before the movies.  the Broadway phenomenon of the -\'s. When  was still a young medium, there was a form of entertainment very prominent  the  that is but a memory today:  variety. Some  shows were  , but others were single, -time , usually  the special  of the individual . This is where  get the ,  first  at Streisand.  had already been a guest    variety shows including  Moore,  Sullivan,  scored a major coup in a -time only tandem   the woman who would pass  the  of belter :  Garland.  1966, COLOR ME  introduced  Streisand in color (hence the title), but copied the  of  first special a    to the letter.  3  ,  get  abstract Streisand (in  -hours  museum  at    the works of ), a comic Streisand working  already adoring  in a studio circus (   fuzzy  furry ),  best of all, a singing Streisand in mini-concert  just-- well, , just doing . <br /><br />It amazes me that she still had the film debut of FUNNY GIRL  to come, as well as turns as , director,  political . , she is barely 24  , doing   because, as she puts  in  own - , \' didn\'t   \'t,   .\' The  museum sequence is shot in Philadelphia over  weekend   the museum  to the    evening,  apparently done  only  color . Yet there are cuts, dissolves,  tracking  ,  in  rather spectacular peak -- the modern,  -flavored, \\Gotta Move.\\"  getting lost amongst the modern abstracts, -club bongos ,  Streisand  in a psychedelic   glittering eye makeup, doing the catchy  tune   androgynous  . It is not   3, believe   not, that the  is matched   by  feat: in the concert sequence, in a white   pearl , Streisand recites the torchy \\"Any  I    is Home,\\" tearing into the final notes    of   belts that   scream  a little girl even  \' 44  ... a guy.  plain  great . Check  out."'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [27]:
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(dtm_train)

lasso_model = Lasso(alpha=0.00623)

lasso_model.fit(X_train_scaled, train['sentiment'])

# Get the coefficients
lasso_coefs = lasso_model.coef_
print('Lasso Coefficients:', lasso_coefs.sum())

# Create a DataFrame with feature names and their coefficients
feature_coef_df = pd.DataFrame(
    {'Feature': np.array(vectorizer.get_feature_names_out()),
     'Coefficient': lasso_coefs})

selected_features = feature_coef_df[feature_coef_df['Coefficient'] != 0]
vocabulary = list(selected_features['Feature'].values)

with open('./myvocab.txt', 'w') as file:
    # Iterate through the list and write each word to a new line
    for word in vocabulary:
        file.write(word + "\n")

len(selected_features)

Lasso Coefficients: -0.2245395020328098


1206

Setting a seed for consistency in random selection.

In [None]:
random.seed(42)

In [None]:
myvocab = open("myvocab.txt", "r").read().splitlines()
tfidf_vectorizer = TfidfVectorizer(vocabulary=myvocab, ngram_range=(1, 4))
dtm_test = tfidf_vectorizer.fit_transform(dtm_train['review'])
# log_reg.fit(dtm_test, test_y['sentiment'])
# print(myvocab)
# print(test_y)
# print(dtm_test)

# Need to set a seed before selecting samples

positive_reviews = test[test_y['sentiment'] == 1].sample(5, random_state=42)
negative_reviews = test[test_y['sentiment'] == 0].sample(5, random_state=42)
selected_reviews = pd.concat([positive_reviews, negative_reviews])

explainer = LimeTextExplainer(class_names=['Negative', 'Positive'])

def explain_review(review_text):
    print(review_text)
    explanation = explainer.explain_instance(review_text, log_reg.predict_proba, num_features=10)
    
    explanation.show_in_notebook(text=True)

for i, row in selected_reviews.iterrows():
    explain_review(row['review'])
    
