In [31]:
import csv
from sklearn.feature_extraction.text import *
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

In [32]:
def create_bow_from_reviews(filename):
    text = []
    Y = []

    with open(filename, 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            review = row[1]
            stars = row[0]
            text.append(review)
            if stars == '1':
                Y.append(0)
            else:
                Y.append(1)

    print('Creating Vectorizer....')
    # create an instance of a TF-IDFVectorizer, using
    # (1) the standard 'english' stopword set
    # (2) only keeping terms in the vocabulary that occur in at least 1% of documents
    # (3) allowing both unigrams and bigrams in the vocabulary (use "ngram_range=(1,2)" to do this)
    vectorizer = TfidfVectorizer()

    # create a sparse BOW array from 'text' using vectorizer
    X = vectorizer.fit_transform(text)

    print('Data shape: ', X.shape)

    # you can uncomment this next line if you want to see the full list of tokens in the vocabulary
    # print('Vocabulary: ', vectorizer.get_feature_names())
    return X, Y, vectorizer

In [33]:
def logistic_classification(X, Y, test_fraction):
    print('\nLogistic Classification:')
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_fraction, random_state=42)
    #  set the state of the random number generator so that we get the same results across runs when testing our code

    print('Number of training examples: ', X_train.shape[0])
    print('Number of testing examples: ', X_test.shape[0])
    print('Vocabulary size: ', X_train.shape[1])

    # Specify the logistic classifier model
    classifier = linear_model.LogisticRegression()

    # Train a logistic regression classifier and evaluate accuracy on the training data
    print('Training a model with', X_train.shape[0], 'examples.....')
    classifier.fit(X_train, Y_train)
    print('\nTraining:')
    train_accuracy = classifier.score(X_train, Y_train)
    print(' accuracy:', format(100 * train_accuracy, '.2f'))

    # Compute and print accuracy and AUC on the test data
    print('\nTesting: ')
    test_accuracy = classifier.score(X_test, Y_test)
    print(' accuracy:', format(100 * test_accuracy, '.2f'))

    class_probabilities = classifier.predict_proba(X_test)
    test_auc_score = metrics.roc_auc_score(Y_test, class_probabilities[:, 1]);
    print(' AUC value:', format(100 * test_auc_score, '.2f'))

    return (classifier)


In [34]:
def support_vector_machine(X, Y, test_fraction):
    print('\nSupport Vector Machine:')
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_fraction, random_state=42)
    #  set the state of the random number generator so that we get the same results across runs when testing our code

    # Specify the logistic classifier model
    classifier = SVC(probability=True)

    # Train a logistic regression classifier and evaluate accuracy on the training data
    print('Training a model with', X_train.shape[0], 'examples.....')
    classifier.fit(X_train, Y_train)
    print('\nTraining:')
    train_accuracy = classifier.score(X_train, Y_train)
    print(' accuracy:', format(100 * train_accuracy, '.2f'))

    # Compute and print accuracy and AUC on the test data
    print('\nTesting: ')
    test_accuracy = classifier.score(X_test, Y_test)
    print(' accuracy:', format(100 * test_accuracy, '.2f'))

    class_probabilities = classifier.predict_proba(X_test)
    test_auc_score = metrics.roc_auc_score(Y_test, class_probabilities[:, 1]);
    print(' AUC value:', format(100 * test_auc_score, '.2f'))

    return (classifier)

In [35]:
def linear_support_vector_machine(X, Y, test_fraction):
    print('\nLinear Support Vector Machine:')
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_fraction, random_state=42)
    #  set the state of the random number generator so that we get the same results across runs when testing our code

    # Specify the logistic classifier model
    classifier = LinearSVC()

    # Train a logistic regression classifier and evaluate accuracy on the training data
    print('Training a model with', X_train.shape[0], 'examples.....')
    classifier.fit(X_train, Y_train)
    print('\nTraining:')
    train_accuracy = classifier.score(X_train, Y_train)
    print(' accuracy:', format(100 * train_accuracy, '.2f'))

    # Compute and print accuracy and AUC on the test data
    print('\nTesting: ')
    test_accuracy = classifier.score(X_test, Y_test)
    print(' accuracy:', format(100 * test_accuracy, '.2f'))

    # class_probabilities = classifier.predict_proba(X_test)
    # test_auc_score = metrics.roc_auc_score(Y_test, class_probabilities[:, 1]);
    # print(' AUC value:', format(100 * test_auc_score, '.2f'))

    return (classifier)


In [36]:
def DecisionTree_classification(X, Y, test_fraction):
    # should add comments defining what the inputs are what the function does
    print('\nDecisionTree:')
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_fraction, random_state=42)
    #  set the state of the random number generator so that we get the same results across runs when testing our code

    print('Number of training examples: ', X_train.shape[0])
    print('Number of testing examples: ', X_test.shape[0])
    print('Vocabulary size: ', X_train.shape[1])

    # Specify the logistic classifier model with an l2 penalty for regularization and with fit_intercept turned on
    classifier = DecisionTreeClassifier(criterion="entropy",random_state=0)

    # Train a logistic regression classifier and evaluate accuracy on the training data
    print('\nTraining a model with', X_train.shape[0], 'examples.....')
    classifier.fit(X_train, Y_train)
    train_predictions = classifier.predict(X_train)
    train_accuracy = classifier.score(X_train, Y_train)
    print('\nTraining:')
    print(' accuracy:', format(100 * train_accuracy, '.2f'))

    # Compute and print accuracy and AUC on the test data
    print('\nTesting: ')
    test_predictions = classifier.predict(X_test)
    test_accuracy = classifier.score(X_test, Y_test)
    print(' accuracy:', format(100 * test_accuracy, '.2f'))

    class_probabilities = classifier.predict_proba(X_test)
    test_auc_score = metrics.roc_auc_score(Y_test, class_probabilities[:, 1])
    print(' AUC value:', format(100 * test_auc_score, '.2f'))

    return (classifier)

In [37]:
def Random_Forest_Classifier(X, Y, test_fraction):
    print('\nRandom_Forest_Classifier:')
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_fraction, random_state=42)
    #  set the state of the random number generator so that we get the same results across runs when testing our code

    # Specify the logistic classifier model
    classifier = RandomForestClassifier(n_estimators=13)

    # Train a logistic regression classifier and evaluate accuracy on the training data
    print('Training a model with', X_train.shape[0], 'examples.....')
    classifier.fit(X_train, Y_train)
    print('\nTraining:')
    train_accuracy = classifier.score(X_train, Y_train)
    print(' accuracy:', format(100 * train_accuracy, '.2f'))

    # Compute and print accuracy and AUC on the test data
    print('\nTesting: ')
    test_accuracy = classifier.score(X_test, Y_test)
    print(' accuracy:', format(100 * test_accuracy, '.2f'))

    # class_probabilities = classifier.predict_proba(X_test)
    # test_auc_score = metrics.roc_auc_score(Y_test, class_probabilities[:, 1]);
    # print(' AUC value:', format(100 * test_auc_score, '.2f'))

    return (classifier)

In [38]:
def Naive_Bayes_Classifier(X, Y, test_fraction):
    print('\nNaive_Bayes_Classifier:')
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_fraction, random_state=42)
    #  set the state of the random number generator so that we get the same results across runs when testing our code

    # Specify the logistic classifier model
    classifier = MultinomialNB()

    # Train a logistic regression classifier and evaluate accuracy on the training data
    print('Training a model with', X_train.shape[0], 'examples.....')
    classifier.fit(X_train, Y_train)
    print('\nTraining:')
    train_accuracy = classifier.score(X_train, Y_train)
    print(' accuracy:', format(100 * train_accuracy, '.2f'))

    # Compute and print accuracy and AUC on the test data
    print('\nTesting: ')
    test_accuracy = classifier.score(X_test, Y_test)
    print(' accuracy:', format(100 * test_accuracy, '.2f'))

    # class_probabilities = classifier.predict_proba(X_test)
    # test_auc_score = metrics.roc_auc_score(Y_test, class_probabilities[:, 1]);
    # print(' AUC value:', format(100 * test_auc_score, '.2f'))

    return (classifier)

In [39]:
def most_significant_terms(classifier, vectorizer, K):
    # find the largest K positive/negative weights' INDICES!
    coefs = classifier.coef_[0]
    topK_pos_indices = np.argsort(coefs)[-K:];
    topK_neg_indices = np.argsort(coefs)[0:K];

    topK_pos_weights = []
    topK_neg_weights = []
    topK_pos_terms = []
    topK_neg_terms = []

    # cycle through the indices, in the order of largest weight first
    # 1) append the weight and term to lists
    # 2) print K lines:
    #     (a) the term corresponding to the weight (a string)
    #     (b) the weight value itself (a scalar printed to 3 decimal places)
    print('Most significant positive terms & weight:')
    for i in topK_pos_indices[::-1]:
        weight = coefs[i]
        term = vectorizer.get_feature_names()[i]
        topK_pos_weights.append(weight)
        topK_pos_terms.append(term)
        print('term: {:<15}, weight = {:.4f}'.format(term, weight))

    print('Most significant negative terms & weight:')
    for i in topK_neg_indices:
        weight = coefs[i]
        term = vectorizer.get_feature_names()[i]
        topK_neg_weights.append(weight)
        topK_neg_terms.append(term)
        print('term: {:<15}, weight = {:.4f}'.format(term, weight))

    return (topK_pos_weights, topK_neg_weights, topK_pos_terms, topK_neg_terms)

In [40]:
X, Y, vectorizer_BOW = create_bow_from_reviews('train.csv')
test_fraction = 0.5
logistic_classifier = logistic_classification(X, Y, test_fraction)

Creating Vectorizer....
Data shape:  (560000, 224898)

Logistic Classification:
Number of training examples:  280000
Number of testing examples:  280000
Vocabulary size:  224898
Training a model with 280000 examples.....

Training:
 accuracy: 94.40

Testing: 
 accuracy: 93.21
 AUC value: 98.18


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
DecisionTree_classifier = DecisionTree_classification(X, Y,test_fraction)


DecisionTree:
Number of training examples:  280000
Number of testing examples:  280000
Vocabulary size:  224898

Training a model with 280000 examples.....


In [None]:
LSVC = linear_support_vector_machine(X, Y, test_fraction)

In [None]:
RandomForestClassifier = Random_Forest_Classifier(X, Y,test_fraction)

In [None]:
NaiveBayesClassifier = Naive_Bayes_Classifier(X, Y,test_fraction)