# Baseline Model

## Step 1: Import packages

In [16]:
import pandas as pd
import numpy as np
import csv
import copy
from datasets import load_dataset

# data preprocessing
from sklearn import preprocessing
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import IncrementalPCA
from numpy.random.mtrand import binomial
import random
import string
from nltk.corpus import stopwords
from contractions import fix

# exploratory analysis
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import mlxtend
from mlxtend.plotting import scatterplotmatrix
from mlxtend.plotting import heatmap
import seaborn as sns
from IPython.display import Image
from textblob import TextBlob
from wordcloud import WordCloud

# model fit
import statsmodels.api as sm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
#import tensorflow as tf
#from tensorflow import keras
#from keras import metrics
#from tensorflow.keras import initializers


# Silence all warnings (the libraries used here change quickly and are noisy),
# plus an explicit filter for UserWarnings raised from matplotlib.
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")

# pandas display options: show up to 1000 rows, never truncate columns,
# and render floats with thousands separators and two decimals.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:,.2f}'.format)
# Turn off the chained-assignment check (pandas' default setting is 'warn').
pd.set_option('mode.chained_assignment', None)

## Step 2: Read Data

In [17]:
# Location of the sampled dataset, relative to this notebook.
file_path = '../EDA/binary_sampled.csv'
# Load the CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

In [18]:
# Columns are selected by position: column 0 becomes the target `y`,
# column 1 becomes the model input `X`.
y, X = df.iloc[:, 0].values, df.iloc[:, 1].values

## Step 3: Create and Train Model

In [22]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Convert the split inputs from arrays to plain Python lists before vectorising.
X_train, X_test = X_train.tolist(), X_test.tolist()

# Bag-of-words features: the vocabulary (at most 5000 terms) is learned from
# the training inputs only and then re-used unchanged on the test inputs.
vectorizer = CountVectorizer(max_features=5000)
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Fit the multinomial Naive Bayes baseline, then predict the held-out labels.
clf = MultinomialNB().fit(X_train_bow, y_train)
y_pred = clf.predict(X_test_bow)

# Report overall accuracy followed by the per-class metrics table.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

Accuracy: 0.87
              precision    recall  f1-score   support

           0       0.88      0.86      0.87      3275
           1       0.87      0.88      0.87      3286

    accuracy                           0.87      6561
   macro avg       0.87      0.87      0.87      6561
weighted avg       0.87      0.87      0.87      6561



In [20]:
# Shape of the vectorised training matrix: (n_training_documents, vocabulary_size).
print(X_train_bow.shape)

# Preview only the first few test predictions. Printing every test document
# floods the notebook output and bloats the saved file; raise N_PREVIEW to
# inspect more rows.
N_PREVIEW = 10
for text, prediction, actual in zip(
    X_test[:N_PREVIEW], y_pred[:N_PREVIEW], y_test[:N_PREVIEW]
):
    print(f'Text: "{text}" -> Prediction: {prediction} -> Actual: {actual}')

(26244, 10000)
Text: "do not go here thinking it will be like the movie i went on a friday and could hardly walk because they let so many people in the place is really small people will be bumping into you the whole time and you cant really move around i am all for loud music but they have it so loud that you cant even here your buddy next to you talk drinks are about average for the strip around but thats if you can get to the bar the girls are cute but definitely second rate by las vegas standards also if you throw a or dollar bill on the bar dont expect change might be fun for tourists on like a tuesday or something locals dont bother" -> Prediction: 0 -> Actual: 0
Text: "this place is the opposite of fast food drive thru restaurant the food is good thought but everything else everything else sucks here including the furniture and lousy employees are they even food handler certifiednthis place manager or owner needs to take note of how slow they are to serve you the food nthey have 