In [None]:
import pandas as pd

# Read the grouped training and validation splits from CSV.
# `src` is left pointing at the validation file, the last path that is read.
src = 'data/validation_data_grouped.csv'
training_data_raw, validation_data_raw = (
    pd.read_csv('data/training_data_grouped.csv'),
    pd.read_csv(src),
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, HashingVectorizer

# select the text column used as model input (values stay pandas Series)
col = 'content_stopword'
X_train = training_data_raw[col]
X_val = validation_data_raw[col]

# hashing
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.HashingVectorizer.html
# hv = HashingVectorizer(n_features=2**26)
# X_train_hv = hv.fit_transform(X_train)

# bag of words
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer
# The vocabulary is learned from the training text only and then reused for the
# validation text. Re-fitting on validation would select a different set of
# 100 terms, so column i would stand for different words in the two matrices.
vectorizer = CountVectorizer(max_features=100)
X_train_cnts = vectorizer.fit_transform(X_train)
X_val_cnts = vectorizer.transform(X_val)

# tf-idf
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer
# IDF weights are likewise fitted on the training counts only and applied
# unchanged to the validation counts, so no validation statistics leak in.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_cnts)
X_val_tfidf = tfidf_transformer.transform(X_val_cnts)

In [None]:
# Dummy-encode the 'type' label for both splits, dropping the first level.
# Both splits are encoded against one shared category list taken from the
# training labels, so a given column means the same class in train and
# validation even when a class is absent from one of the splits.
# NOTE: a validation label that never occurs in training becomes NaN under this
# dtype and is therefore encoded as an all-zero row.
label_dtype = pd.CategoricalDtype(sorted(training_data_raw['type'].dropna().unique()))
Y_train = pd.get_dummies(training_data_raw['type'].astype(label_dtype), drop_first=True)
Y_val = pd.get_dummies(validation_data_raw['type'].astype(label_dtype), drop_first=True)

In [None]:
# NAIVE BAYES MODEL
# REF: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

import numpy as np
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

# features: densify the tf-idf matrices for both splits
x_train, x_val = X_train_tfidf.toarray(), X_val_tfidf.toarray()

# labels: flatten the dummy-encoded frames into 1-D vectors
y_train, y_val = np.array(Y_train).ravel(), np.array(Y_val).ravel()

# build the multinomial naive bayes classifier, then fit it on the training split
clf = MultinomialNB()
clf.fit(x_train, y_train)

# score the validation split and print the per-class report
y_pred = clf.predict(x_val)
print(classification_report(y_val, y_pred))


In [None]:
# LOGISTIC REGRESSION MODEL

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# training split: dense tf-idf features and flattened labels
x_train = X_train_tfidf.toarray()
y_train = np.array(Y_train).ravel()

# validation split: same preparation
x_val = X_val_tfidf.toarray()
y_val = np.array(Y_val).ravel()

# fit() returns the estimator itself, so construction and training are chained
model = LogisticRegression().fit(x_train, y_train)

# evaluate on the validation split and print the per-class report
predictions = model.predict(x_val)
print(classification_report(y_val, predictions))