# 1. Vectorizing raw data: TF-IDF

**TF-IDF**

Creates a document-term matrix where the columns represent single unique terms (unigrams) but the cell represents a weighting meant to represent how important a word is to a document.

In [0]:
# how to upload files in Google drive and load in Colab Notebook
from google.colab import drive
drive.mount("pre_processed_comments.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
import numpy
pd.set_option('display.max_colwidth', 100)
data = pd.read_csv("pre_processed_comments.csv")
data.head()

Unnamed: 0,target,comment_text_lemmatized
0,0.0,"['cool', 'like', 'would', 'want', 'mother', 'read', 'really', 'great', 'idea', 'well', 'done']"
1,0.0,"['thank', 'would', 'make', 'life', 'lot', 'le', 'anxietyinducing', 'keep', 'dont', 'let', 'anyon..."
2,0.0,"['urgent', 'design', 'problem', 'kudos', 'taking', 'impressive']"
3,0.0,"['something', 'ill', 'able', 'install', 'site', 'releasing']"
4,0.893617,"['haha', 'guy', 'bunch', 'loser']"


In [0]:
# done! don't do it again

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer()
X_tfidf = tfidf_vect.fit_transform(data['comment_text_lemmatized'])
print(X_tfidf.shape)
#print(tfidf_vect.get_feature_names())

(1804874, 606027)


In [0]:
# done! don't do it again

import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(data['comment_text_lemmatized'])

filename = "drive/My Drive/Colab Notebooks/tfidf.sav"
joblib.dump(tfidf_vect, filename)

['drive/My Drive/Colab Notebooks/tfidf.sav']

In [0]:

import joblib
filename = "drive/My Drive/Colab Notebooks/tfidf.sav"
tfidf_vect_loaded = joblib.load(filename)
X_tfidf = tfidf_vect_loaded.transform(data['comment_text_lemmatized'])
print(X_tfidf.shape)



(1804874, 606027)


## What TF-IDF did... (test with small sample)

In [0]:
# don't execute. This is just demonstration

from sklearn.feature_extraction.text import TfidfVectorizer
data_sample = data[0:10]

tfidf_vect_sample = TfidfVectorizer()
X_tfidf_sample = tfidf_vect_sample.fit_transform(data_sample['comment_text_lemmatized'])
print(X_tfidf_sample.shape)
print(tfidf_vect_sample.get_feature_names())

X_tfidf_df = pd.DataFrame(X_tfidf_sample.toarray())
X_tfidf_df.columns = tfidf_vect_sample.get_feature_names()
X_tfidf_df

(10, 63)
['able', 'allow', 'animal', 'anxietyinducing', 'anyone', 'bunch', 'combo', 'comment', 'cool', 'design', 'destroy', 'done', 'dont', 'expected', 'ffffuuuuuuuuuuuuuuu', 'get', 'good', 'great', 'greed', 'guy', 'haha', 'hahahahahahahahhha', 'id', 'idea', 'ill', 'impressive', 'install', 'keep', 'kudos', 'land', 'le', 'let', 'life', 'like', 'loser', 'lot', 'make', 'mostly', 'mother', 'motivated', 'one', 'problem', 'public', 'rancher', 'read', 'really', 'releasing', 'right', 'seem', 'show', 'shtty', 'site', 'something', 'suck', 'taking', 'thank', 'together', 'ur', 'urgent', 'want', 'way', 'well', 'would']


Unnamed: 0,able,allow,animal,anxietyinducing,anyone,bunch,combo,comment,cool,design,...,suck,taking,thank,together,ur,urgent,want,way,well,would
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.309414,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.309414,0.0,0.309414,0.26303
1,0.0,0.0,0.0,0.280357,0.280357,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.280357,0.0,0.0,0.0,0.0,0.280357,0.0,0.238329
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408248,...,0.0,0.408248,0.0,0.0,0.0,0.408248,0.0,0.0,0.0,0.0
3,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,...,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.288675,0.288675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.385682,0.0,0.0,0.0,...,0.0,0.0,0.0,0.385682,0.0,0.0,0.0,0.0,0.0,0.0


## 2. Building Machine Learning Model

## Naive Bayes

hyperparameters




In [0]:
import joblib
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X_tfidf_dense = X_tfidf.toarray()

X_train, X_test, y_train, y_test = train_test_split(X_tfidf_dense, data["target"], test_size=0.2, random_state=123)


gnb = GaussianNB()
gnb.fit(X_train,y_train)
y_pred = gnb.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: {}".format(rmse))

score = gnb.score(X_test, y_test)
print("Score: {}".format(score))

filename = "drive/My Drive/Colab Notebooks/model_NB.sav"
joblib.dump(gnb, filename)


### Grid search

In [0]:
import joblib

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, data["target"], test_size=0.2, random_state=123)

max_depth = [30, 80, 120]
n_estimators = [50, 100, 150]

for depth in max_depth:
  for n_estimator in  n_estimators:

    xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.2, max_depth = depth, alpha = 10, n_estimators = n_estimator)

    xg_reg.fit(X_train,y_train)
    y_pred = xg_reg.predict(X_test)

    print("parameters: colsample_bytree = 0.3, learning_rate = 0.2, max_depth = {}, alpha = 10, n_estimators = {}".format(depth, n_estimator))

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("RMSE: {}".format(rmse))

    score = xg_reg.score(X_test, y_test)
    print("Score: {}".format(score))
    
    filename = "drive/My Drive/Colab Notebooks/model_" + "depth_"+ str(depth) + "n_estimator_"+ str(n_estimator) +".sav"
    joblib.dump(xg_reg, filename)

NameError: ignored

## Real Test