# Naive Bayes

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import joblib
from sklearn.feature_extraction.text import CountVectorizer
import string
from numpy.random import permutation
from sklearn.naive_bayes import MultinomialNB

In [2]:
df = pd.read_csv("dataset/ParlVote/ParlVote_concat.csv")

In [3]:
df = df.drop(list(set(df.columns) - set(["speech", "vote"])), axis = 1)

In [4]:
def preprocess_data(data):
    # remove punctuation
    data["speech"] = data["speech"].str.translate(str.maketrans('', '', string.punctuation))
    return data

In [5]:
# because the entire dataset didn't fit into memory
def sample_small_balanced_dataset(n=10000):
    global data
    data_0 = data[data["vote"] == 0]
    data_1 = data[data["vote"] == 1]
    data_0 = data_0.iloc[permutation(len(data_0))[:int(n/2)]]
    data_1 = data_1.iloc[permutation(len(data_1))[:int(n/2)]]
    data = pd.concat([data_0, data_1])
    return data

In [6]:
data = preprocess_data(df)

In [15]:
small_dataset_size = 20000
data = sample_small_balanced_dataset(small_dataset_size)

x_train, x_test, y_train, y_test = train_test_split(data["speech"], data["vote"], stratify=data["vote"], test_size=0.25, random_state=42)

# Vectorize text reviews to numbers
vec = CountVectorizer(stop_words='english')
x_train = vec.fit_transform(x_train).toarray()
x_test = vec.transform(x_test).toarray()

model = MultinomialNB()
model.fit(x_train, y_train)

print("Dataset size:", small_dataset_size, "Test Accuracy:", model.score(x_test, y_test))

Dataset size: 20000 Test Accuracy: 0.5672
