In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import os 

def _parse_review_score(file_name):
    """Return the integer rating encoded in a ``<id>_<rating>.txt`` file name.

    Returns None when the name does not follow that pattern (wrong extension,
    no underscore-separated rating, or a non-integer rating), so that stray
    directory entries can be skipped instead of crashing the load.
    """
    stem, extension = os.path.splitext(file_name)
    if extension.lower() != '.txt':
        return None

    parts = stem.split('_')
    if len(parts) < 2:
        return None

    try:
        return int(parts[1])
    except ValueError:
        return None


def create_bow(directories=None, vectorizer=None):
    """Load rated review files and vectorize them into a bag-of-words matrix.

    Each directory is expected to contain files named ``<id>_<rating>.txt``.
    Ratings >= 7 are labelled 1, ratings <= 4 are labelled 0, and anything in
    between is treated as neutral and left out entirely.

    Parameters
    ----------
    directories : iterable of str, optional
        Directories to read reviews from. Defaults to
        ``['train/pos', 'train/neg', 'test/pos', 'test/neg']``.
    vectorizer : object with ``fit_transform``, optional
        Vectorizer applied to the collected texts. Defaults to
        ``CountVectorizer(stop_words='english', ngram_range=(1, 1))``.

    Returns
    -------
    total_words
        Result of ``vectorizer.fit_transform(text)``.
    score : list of int
        Binary label (1 or 0) for every kept review.
    original_score : list of int
        Rating parsed from the file name for every kept review.
    text : list of str
        Full content of every kept review.

    ``score``, ``original_score`` and ``text`` are index-aligned with the rows
    of ``total_words``.

    Raises
    ------
    ValueError
        If no usable review was found in any directory.
    """
    if directories is None:
        directories = ['train/pos', 'train/neg', 'test/pos', 'test/neg']
    if vectorizer is None:
        vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1))

    text = []
    score = []
    original_score = []

    for directory in directories:
        print(f'Current directory: {directory}')
        # os.listdir returns entries in arbitrary order; sort so that the row
        # order (and any seeded split done later) is reproducible.
        train_contents = sorted(os.listdir(directory))

        print(f'Total reviews in {directory}: {len(train_contents)}')

        skipped = 0
        for entry in train_contents:
            rating = _parse_review_score(entry)
            if rating is None:
                skipped += 1
                continue

            if rating >= 7:
                label = 1
            elif rating <= 4:
                label = 0
            else:
                continue  # don't consider reviews that are neutral rated

            text_file_path = os.path.join(directory, entry)
            with open(text_file_path, 'r', encoding='utf-8') as f:
                # read() rather than readline(): keep reviews that span
                # several lines intact.
                text_review = f.read()

            # Append to all three lists together so they stay aligned.
            text.append(text_review)
            score.append(label)
            original_score.append(rating)

        if skipped:
            print(f'Skipped {skipped} entries not named <id>_<rating>.txt')
        print(f'Finished {directory}\n')

    if not text:
        raise ValueError('No usable reviews found in the given directories.')

    total_words = vectorizer.fit_transform(text)

    print(f'Total data shape: {total_words.shape}')
    print('Finished running.')

    return total_words, score, original_score, text

X, y, score, text = create_bow() 

# X     : bag-of-words count matrix (English stop words removed, unigrams only)
# y     : binary labels derived from the ratings (1 if rating >= 7, 0 if rating <= 4)
# score : original integer ratings parsed from the review file names
# text  : list of the review texts that were vectorized into X

# Dataset sizes noted by the author:
# TRAIN DATA : total 25_000 -> 12_500 pos and 12_500 neg
# TEST DATA  : total 25_000 -> 12_500 pos and 12_500 neg

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Logistic Classifier

# Pools the original train and test reviews, then re-splits them 75/25 into new train and test sets.

def logistic_classifier(X, y, seed=42, test_size=0.25):
    """Fit an L1-penalized logistic regression and report its accuracy.

    The data is split into train and test parts, the model is fitted on the
    train part, and both accuracies are printed as percentages.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` / ``LogisticRegression``
    y : labels, one per row of ``X``
    seed : int, default 42
        Seeds both the train/test split and the liblinear solver.
    test_size : float, default 0.25
        Fraction of the data held out for testing.

    Returns
    -------
    LogisticRegression
        The fitted classifier.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed
    )

    # liblinear shuffles the data internally, so the fit is only reproducible
    # when random_state is pinned as well as the split.
    classifier = LogisticRegression(
        penalty='l1', solver='liblinear', fit_intercept=True, random_state=seed
    )
    classifier.fit(X_train, y_train)

    test_accuracy = classifier.score(X_test, y_test)
    train_accuracy = classifier.score(X_train, y_train)

    print(f'\nTesting accuracy: {100 * test_accuracy:.2f}')
    print(f'\nTraining accuracy: {100 * train_accuracy:.2f}')

    return classifier

# Train and evaluate on the bag-of-words matrix X and labels y returned by create_bow().
logistic_classifier(X, y) 

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Accuracy vs Regularization of Logistic Classifier

# Evaluate test and train accuracy at different regularization strengths.
# Module-level seed; accuracy_vs_regular() reads it for its train/test split.
seed = 42

def accuracy_vs_regular(X, y):
    """Sweep the logistic-regression ``C`` parameter and plot the accuracies.

    The data is split 75/25 using the module-level ``seed``. For every value
    in the sweep a classifier is fitted on the train part and scored on both
    parts. ``C == 0`` is used as a sentinel for "no penalty at all"; every
    other value is passed as ``C`` to an L1-penalized liblinear model.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` / ``LogisticRegression``
    y : labels, one per row of ``X``

    Returns
    -------
    train_acc : list of float
        Training accuracy for each value in the sweep.
    test_acc : list of float
        Testing accuracy for each value in the sweep.
    coefs : list
        ``coef_`` of each fitted classifier, in sweep order.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=seed
    )

    coefs = []
    train_acc = []
    test_acc = []
    cs = [0, 0.1, 1, 10, 50, 100, 1000]

    for c in cs:
        if c == 0:
            # Sentinel value: fit without any penalty.
            classifier = LogisticRegression(
                penalty=None, fit_intercept=True, max_iter=1000
            )
        else:
            # liblinear shuffles internally; seed it so the sweep is repeatable.
            classifier = LogisticRegression(
                penalty='l1',
                solver='liblinear',
                C=c,
                fit_intercept=True,
                max_iter=1000,
                random_state=seed,
            )

        classifier.fit(X_train, y_train)

        coefs.append(classifier.coef_)
        train_acc.append(classifier.score(X_train, y_train))
        test_acc.append(classifier.score(X_test, y_test))

    fig, axes = plt.subplots()

    # A pure log axis cannot place the c == 0 point, so it would vanish from
    # the plot. symlog is linear up to the smallest positive C and logarithmic
    # beyond it, which keeps every point of the sweep visible.
    axes.set_xscale('symlog', linthresh=min(c for c in cs if c > 0))
    axes.plot(cs, train_acc, color='red', marker='o', label='training accuracy')
    axes.plot(cs, test_acc, color='blue', marker='o', label='testing accuracy')

    # C is the inverse of the regularization strength, so label it as such.
    axes.set_xlabel('C (inverse regularization strength; 0 = no penalty)', fontsize=13)
    axes.set_ylabel('Accuracy', fontsize=13)

    axes.legend()
    axes.grid(True)

    return train_acc, test_acc, coefs
    
# Run the regularization sweep on X and y; keeps the per-C train/test accuracies and fitted coefficients.
train_acc, test_acc, coefs = accuracy_vs_regular(X, y)
    