# 1. Vectorizing raw data: TF-IDF

**TF-IDF**

Creates a document-term matrix in which each column corresponds to a single unique term (unigram) and each cell holds a weight that reflects how important that term is to the document.

In [0]:
import pandas as pd
import numpy
# Allow up to 100 characters per column when frames are displayed, so long
# comment text is not cut off too early in previews.
pd.set_option("display.max_colwidth", 100)

# Read the pre-processed comments and show the first rows as the cell output.
comments_csv = "pre_processed_comments_concise.csv"
data = pd.read_csv(comments_csv)
data.head()

In [0]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vectorizer on the lemmatized comments, persist it to disk, and
# build the document-term matrix from the reloaded copy so that the saved
# artifact is the one actually exercised.
tfidf_vect = TfidfVectorizer()

# Empty text fields come back from read_csv as NaN, and TfidfVectorizer rejects
# non-string documents. Replace any missing text with "" (a document with no
# terms). fillna keeps row count and order, so X_tfidf stays aligned row-for-row
# with `data`; when nothing is missing this is a no-op.
comments = data['comment_text_lemmatized'].fillna("")
tfidf_vect.fit(comments)

filename = "tfidf.sav"
joblib.dump(tfidf_vect, filename)

# Transform with the reloaded vectorizer rather than the in-memory one.
tfidf_vect_loaded = joblib.load(filename)
X_tfidf = tfidf_vect_loaded.transform(comments)
print(X_tfidf.shape)

# 2. Building Machine Learning Model

## Naive Bayes

**Hyperparameters:** none are tuned here; `GaussianNB` is fitted with its default settings.




In [0]:
import joblib
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Convert the TF-IDF matrix to a dense array before splitting.
X_tfidf_dense = X_tfidf.toarray()

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
# NOTE(review): X_tfidf comes from a vectorizer fitted on every row, so the
# held-out rows have already influenced the IDF weights — consider fitting the
# vectorizer on the training split only.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf_dense,
    data["target"],
    test_size=0.2,
    random_state=123,
)

# fit() returns the estimator itself, so construction and training are chained.
gnb = GaussianNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Root-mean-squared error between the held-out targets and the predictions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE: {}".format(rmse))

# The estimator's own score() on the held-out rows.
score = gnb.score(X_test, y_test)
print("Score: {}".format(score))

# Persist the fitted model; the dump call stays last so its return value is
# the cell's displayed output.
filename_model = "model_NB.sav"
joblib.dump(gnb, filename_model)
