# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
import os
import pickle
import string
import numpy as np
import re
import spacy
from spacy.matcher import Matcher

# Directory containing the review CSV files (the trailing slash is part of
# the stored value).
data_dir = os.path.join('data_reviews/')

# Training inputs: the full table from x_train.csv.
x_train = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))

# Training labels: only the 'is_positive_sentiment' column of y_train.csv.
y_train = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))['is_positive_sentiment']

# spaCy pipeline shared by the text-processing helpers in this file.
nlp = spacy.load('en_core_web_sm')

# Additional words to flag as stop words on the spaCy vocabulary. Order and
# duplicates are irrelevant to the result: each entry just sets a flag.
custom_stop_words = [
    "a", "an", "the",
    "I", "you", "he", "she", "it", "we", "they",
    "and", "but", "or", "so", "yet",
    "in", "on", "at", "of", "to", "from", "with",
    "is", "are", "was", "were", "be", "been", "being",
    "because", "as", "until", "while", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below",
    "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
    "again", "further", "then", "once", "here", "there",
    "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some",
    "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "s", "t", "can", "will", "just", "don", "should", "now",
]
for word in custom_stop_words:
    # Mark the vocabulary entry so `token.is_stop` is True for this word.
    nlp.vocab[word].is_stop = True

def processed_reviews(reviews):
    """Turn each review into a space-joined string of selected lemmas.

    For every review the text is lower-cased, every character other than an
    ASCII letter, apostrophe or whitespace is removed, and the result is
    passed through ``clean_negation`` before being parsed with the
    module-level spaCy pipeline ``nlp``. Only lemmas of tokens that are not
    stop words and whose part-of-speech tag is NOUN or ADJ are kept; lemmas
    that are empty or whitespace-only are dropped.

    Args:
        reviews: iterable of review strings.

    Returns:
        A list with one processed string per input review, in input order.
    """
    kept_pos = ('NOUN', 'ADJ')

    def _normalize(raw):
        # Lower-case first, then strip everything but letters/apostrophes/space.
        letters_only = re.sub(r"[^a-zA-Z'\s]", '', raw.lower())
        parsed = nlp(clean_negation(letters_only))
        lemmas = (
            tok.lemma_
            for tok in parsed
            if not tok.is_stop and tok.pos_ in kept_pos
        )
        return ' '.join(lemma for lemma in lemmas if lemma.strip())

    return [_normalize(raw) for raw in reviews]

def clean_negation(text):
    """Strip negation tokens from *text* and flag an odd negation count.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Every
    token whose dependency label is ``'neg'`` is cut out of the string. If an
    odd number of such tokens was found, the marker ``' negation'`` is
    appended to the result.

    Tokens are removed by character offset rather than with ``str.replace``:
    a substring replace would also delete the same letters inside unrelated
    words (for example ``'not'`` inside ``'another'`` or ``'nothing'``).
    Whitespace around a removed token is left in place.

    Args:
        text: the review text to process.

    Returns:
        The text without its negation tokens, with ``' negation'`` appended
        when the number of negation tokens is odd.
    """
    doc = nlp(text)
    pieces = []
    cursor = 0
    negation_count = 0
    for token in doc:
        if token.dep_ != 'neg':
            continue
        # token.idx is the token's character offset in the parsed text, so
        # this slice keeps everything between the previous cut and this token.
        pieces.append(text[cursor:token.idx])
        cursor = token.idx + len(token.text)
        negation_count += 1
    pieces.append(text[cursor:])
    cleaned = ''.join(pieces)

    # An even number of negations is treated as cancelling out.
    if negation_count % 2 == 1:
        cleaned += ' negation'
    return cleaned

def extract_BoW_features(texts):
    """Return, per document, the vocabulary terms present in it as one string.

    The element at index 1 of each entry in *texts* is taken as the raw text
    (presumably entries are rows whose second field is the review; confirm
    against the caller). The texts are cleaned with ``processed_reviews`` and
    a binary ``CountVectorizer`` is fitted on them inside this call, so the
    vocabulary depends on the texts passed in.

    Args:
        texts: iterable of indexable entries; only ``entry[1]`` is used.

    Returns:
        A numpy array with one string per document: the fitted vocabulary
        terms stored for that document's row, joined by single spaces.
    """
    second_fields = [entry[1] for entry in texts]
    cleaned = processed_reviews(second_fields)

    bow = CountVectorizer(stop_words='english', max_df=0.5, min_df=1, binary=True)
    matrix = bow.fit_transform(cleaned)
    vocab = bow.get_feature_names_out()

    def _row_terms(row_vector):
        # `indices` holds the column positions stored for this sparse row.
        return " ".join([vocab[col] for col in row_vector.indices])

    return np.array([_row_terms(row_vector) for row_vector in matrix])

# Bag-of-words counts feeding a logistic-regression classifier.
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression(max_iter=100000)),
])

# Search space, keyed as '<step name>__<parameter>'.
# 4 * 2 * 2 * 5 * 6 * 5 * 4 = 9600 combinations in total.
param_grid = {
    'vectorizer__max_features': [1000, 2000, 3000, 4000],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__binary': [True, False],
    'vectorizer__min_df': [1, 2, 3, 4, 5],
    'vectorizer__max_df': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__solver': ['liblinear', 'lbfgs', 'sag', 'saga'],
}

# Despite its name, `random_search` is a GridSearchCV: every combination in
# `param_grid` is evaluated with 5-fold cross-validation.
random_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
random_search.fit(x_train['text'], y_train)

best_params = random_search.best_params_
print(best_params)

# Split the winning settings by pipeline step, dropping the '<step>__' prefix
# so they can be passed straight to each constructor.
vectorizer_kwargs = {
    key.split('__', 1)[1]: value
    for key, value in best_params.items()
    if key.startswith('vectorizer__')
}
classifier_kwargs = {
    key.split('__', 1)[1]: value
    for key, value in best_params.items()
    if key.startswith('classifier__')
}

# Fresh pipeline configured with the best parameters found by the search.
classifier1 = Pipeline([
    ('vectorizer', CountVectorizer(**vectorizer_kwargs)),
    ('classifier', LogisticRegression(max_iter=100000, **classifier_kwargs)),
])

classifier1.fit(x_train['text'], y_train)

def predict_proba(features):
    """Return the module-level ``classifier1``'s probability estimates.

    *features* is handed unchanged to ``classifier1.predict_proba``. The
    pipeline's first step is a ``CountVectorizer``, so this presumably
    expects raw text strings rather than numeric features; confirm against
    callers.
    """
    probabilities = classifier1.predict_proba(features)
    return probabilities

# Keep a handle on the complete label series: `y_train` is rebound to the
# 70% split just below, and the full labels are needed for the final refit.
y_full = y_train

# Hold out 30% of the reviews to estimate accuracy. From here on `y_train`
# refers to the split labels that pair with `X_train`.
# NOTE: the hyperparameters were selected by the earlier search on all of
# x_train, held-out rows included, so this accuracy is somewhat optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    x_train['text'], y_full, test_size=0.3, random_state=42
)
classifier1.fit(X_train, y_train)

accuracy = classifier1.score(X_test, y_test)
print(f'Accuracy: {accuracy}')

# Refit on every labelled review before persisting. Without this the pickled
# model (and `predict_proba`) would use a classifier trained on only 70% of
# the data, silently discarding the earlier full-data fit.
classifier1.fit(x_train['text'], y_full)

filename = 'best_model1.pkl'
with open(filename, 'wb') as file:
    pickle.dump(classifier1, file)
