In [1]:
import pandas as pd
import pyarrow
import os
import pickle
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
import numpy as np

In [2]:
data_dir = os.path.join('data_reviews/') 
x_train = pd.read_csv(data_dir+'x_train.csv')
y_train = pd.read_csv(data_dir+'y_train.csv')['is_positive_sentiment']

In [3]:
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import MaxAbsScaler

def extract_BoW_features2(texts):
    processed_texts = [text[1] for text in texts]  
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    model = BertModel.from_pretrained('bert-base-uncased')

    inputs = tokenizer(processed_texts, padding=True, truncation=True, return_tensors='pt', max_length=1024)

    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state
    features = torch.mean(embeddings, dim=1)
    features_numpy = features.numpy()
    scaler = MaxAbsScaler()
    features_numpy = scaler.fit_transform(features_numpy)
    return features_numpy



In [4]:
x_train_features = extract_BoW_features2(x_train.values.tolist())


In [5]:
x_train_features.shape

(2400, 768)

In [6]:
param_grid = {
    'solver': ['liblinear', 'lbfgs', 'newton-cholesky'],
    'penalty': ['l2', 'l1'],
    'C' : np.logspace(-3, 2, 20),
    'tol': [1e-10, 1e-8, 1e-6, 1e-5, 1e-4, 1e-3],
}

In [7]:
from sklearn.model_selection import StratifiedKFold
n_splits = 10
stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [8]:
from sklearn.utils import parallel_backend

model1 = LogisticRegression(max_iter=1000)

random_search = RandomizedSearchCV(model1,
                                   param_distributions=param_grid,
                                   cv=stratified_kfold,
                                   n_iter=100,
                                   scoring='roc_auc')
with parallel_backend('threading'):
    random_search.fit(x_train_features, y_train)

  self.coef_newton = scipy.linalg.solve(
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=3.03903e-08): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=3.05521e-08): result may not be accurate.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=3.04587e-08): result may not be accurate.
  self.coef_newton = scipy.linalg.solve(
255 fits failed out o

In [9]:
print(' ')
print("Best Hyperparameters:", random_search.best_params_)
print(' ')
print("Best Score:", random_search.best_score_)

 
Best Hyperparameters: {'tol': 0.001, 'solver': 'liblinear', 'penalty': 'l2', 'C': 0.0379269019073225}
 
Best Score: 0.9689861111111112


In [10]:
best_model = Pipeline([
        ('classifer', LogisticRegression(**random_search.best_params_))
])

best_model.fit(x_train_features, y_train)

filename = 'best_model2.pkl'
with open(filename, 'wb') as file:
    pickle.dump(best_model, file)
