In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
import os
import pickle
import string
import numpy as np
import re
import torch
# import spacy
# from spacy.matcher import Matcher


In [2]:
# Read the training inputs and the 'is_positive_sentiment' label column from
# the data_reviews/ directory (path is relative to the working directory).
data_dir = 'data_reviews/'
x_train = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
y_train = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))['is_positive_sentiment']

In [19]:

import nltk
from nltk.stem import PorterStemmer

# NOTE(review): the only nltk object used in the cells shown here is
# PorterStemmer; it is unclear that it needs the WordNet corpus, so this
# download may be unnecessary — confirm before keeping it.
nltk.download('wordnet')
def fitCountVectorizer(texts):
    """Fit a ``TfidfVectorizer`` on cleaned review texts and return it.

    Args:
        texts: sequence of rows; the string at index 1 of each row is used.

    Returns:
        The fitted ``TfidfVectorizer`` (English stop words, ``max_df=0.7``,
        ``min_df=1``, non-binary).

    Cleaning applied to every string before fitting: lower-case it, delete
    every ``string.punctuation`` character, then run the stemming pass.

    NOTE(review): the stemming pass iterates over each cleaned *string*, so
    ``stem`` receives one character at a time and the pieces are re-joined
    with an empty separator. Word-level stemming was presumably intended
    (TODO confirm). It is kept as-is because ``extract_BoW_features`` applies
    the identical steps before ``transform``; the two must change together.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    porter = PorterStemmer()

    corpus = []
    for row in texts:
        cleaned = row[1].lower().translate(strip_punct)
        # Character-by-character on purpose: mirrors the original behaviour.
        corpus.append("".join(porter.stem(ch) for ch in cleaned))

    tfidf = TfidfVectorizer(stop_words='english', max_df=0.7, min_df=1, binary=False)
    tfidf.fit(corpus)
    return tfidf

# Fit on every row of the training frame, then pickle the fitted vectorizer so
# that extract_BoW_features can reload it from 'vectorizer.pkl'.
filename = 'vectorizer.pkl'
texts = x_train.values.tolist()
vectorizer = fitCountVectorizer(texts)

with open(filename, 'wb') as file:
    pickle.dump(vectorizer, file)






[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/johannzhang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:

from sklearn.preprocessing import MaxAbsScaler

def extract_BoW_features(texts):
    """Convert review rows into a dense feature array.

    Args:
        texts: sequence of rows; the string at index 1 of each row is used.

    Returns:
        ``vectorizer.transform(...)`` of the cleaned strings, converted with
        ``.toarray()``. The vectorizer is unpickled from ``./vectorizer.pkl``.

    The cleaning repeats the steps in ``fitCountVectorizer``: lower-case,
    delete ``string.punctuation`` characters, then the stemming pass.

    NOTE(review): as in ``fitCountVectorizer``, the stemming pass walks each
    string character by character and re-joins with an empty separator;
    word-level stemming was presumably intended (TODO confirm). Any change
    must be made in both functions so fit and transform see the same text.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    porter = PorterStemmer()

    cleaned_texts = []
    for row in texts:
        cleaned = row[1].lower().translate(strip_punct)
        cleaned_texts.append("".join(map(porter.stem, cleaned)))

    # The vectorizer is re-read from disk on every call. pickle.load executes
    # arbitrary code from the file, so only load a pickle this notebook wrote.
    with open('./vectorizer.pkl', 'rb') as fh:
        fitted_vectorizer = pickle.load(fh)

    return fitted_vectorizer.transform(cleaned_texts).toarray()


In [21]:
# Build the training feature array from every row of x_train; this relies on
# vectorizer.pkl having been written by the earlier fitting cell.
x_train_features = extract_BoW_features(x_train.values.tolist())

In [22]:
# Inspect the dimensions of the extracted training feature array.
x_train_features.shape




(2400, 4424)

In [26]:
# Candidate settings sampled by the randomized search: liblinear solver only,
# both L1 and L2 penalties, 20 log-spaced C values from 1e-3 to 1e2, and six
# stopping tolerances.
param_grid = dict(
    solver=['liblinear'],
    penalty=['l2', 'l1'],
    C=np.logspace(-3, 2, 20),
    tol=[1e-10, 1e-8, 1e-6, 1e-5, 1e-4, 1e-3],
)

In [27]:

from sklearn.model_selection import StratifiedKFold
# 10-fold stratified splitter; shuffling is seeded so the folds are the same
# on every run.
n_splits = 10
stratified_kfold = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)


In [29]:
from sklearn.utils import parallel_backend

# Base estimator for the search; max_iter is fixed here and is not part of
# param_grid.
model1 = LogisticRegression(max_iter=1000)

# Randomized search: n_iter=100 candidates are drawn from param_grid and
# scored by ROC AUC on the stratified folds. random_state pins which
# candidates are drawn; without it every run samples a different subset, so
# best_params_ / best_score_ are not reproducible even though the CV splitter
# is seeded.
random_search = RandomizedSearchCV(model1,
                                   param_distributions=param_grid,
                                   cv=stratified_kfold,
                                   n_iter=100,
                                   scoring='roc_auc',
                                   random_state=42)
# Run the candidate fits under joblib's threading backend.
with parallel_backend('threading'):
    random_search.fit(x_train_features, y_train)



In [30]:
# Print the best sampled configuration and its score, each preceded by a
# spacer line.
for label, value in (
    ("Best Hyperparameters:", random_search.best_params_),
    ("Best Score:", random_search.best_score_),
):
    print(' ')
    print(label, value)

 
Best Hyperparameters: {'tol': 0.001, 'solver': 'liblinear', 'penalty': 'l2', 'C': 2.636650898730358}
 
Best Score: 0.8852187499999999


In [31]:
# Inspect the dimensions of the raw training frame.
x_train.shape

(2400, 2)

In [170]:
# NOTE(review): the output recorded below, (2400, 2573), comes from execution
# count 170 and disagrees with the (2400, 4424) recorded for the same variable
# after In [22]; the saved outputs come from different kernel states. Re-run
# the notebook top to bottom on a fresh kernel before trusting either number.
x_train_features.shape

(2400, 2573)

In [171]:
# NOTE(review): x_test_df is not assigned in any cell visible in this
# notebook view; unless it is defined elsewhere, this cell raises NameError on
# a fresh kernel.
x_test_df.shape

(600, 2)

In [172]:
# NOTE(review): te_text_features is not assigned in any cell visible in this
# notebook view; unless it is defined elsewhere, this cell raises NameError on
# a fresh kernel.
te_text_features.shape

(600, 2573)

In [32]:
# Rebuild the winning configuration and fit it on the full training set.
#
# The parameters come from best_estimator_ rather than best_params_ alone:
# best_params_ only holds the keys sampled from param_grid, so building
# LogisticRegression(**best_params_) silently drops settings fixed on the
# searched estimator (max_iter=1000) and would persist a model that differs
# from the one that was cross-validated. best_estimator_ is available because
# the search uses the default refit behaviour.
best_model = Pipeline([
    ('classifer', LogisticRegression(**random_search.best_estimator_.get_params())),
])

best_model.fit(x_train_features, y_train)

# y_hat = best_model.predict(te_text_features)

# y_hat

# Persist the fitted pipeline for later use.
filename = 'best_model1.pkl'
with open(filename, 'wb') as file:
    pickle.dump(best_model, file)




# accuracy = best_model.score(X_test, y_test)
