# 1. Vectorizing raw data: TF-IDF

**TF-IDF**

Creates a document-term matrix in which each column corresponds to a single unique term (unigram) and each cell holds a weight indicating how important that term is to the document.

In [0]:
import pandas as pd
# Allow up to 100 characters per cell so the long token lists stay readable in previews.
pd.options.display.max_colwidth = 100

# Load the pre-processed comments and preview the first rows.
data = pd.read_csv("pre_processed_comments_concise.csv")
data.head()

Unnamed: 0,target,comment_text_lemmatized
0,0.0,"['cool', 'like', 'would', 'want', 'mother', 'read', 'really', 'great', 'idea', 'well', 'done']"
1,0.0,"['thank', 'would', 'make', 'life', 'lot', 'le', 'anxietyinducing', 'keep', 'dont', 'let', 'anyon..."
2,0.0,"['urgent', 'design', 'problem', 'kudos', 'taking', 'impressive']"
3,0.0,"['something', 'ill', 'able', 'install', 'site', 'releasing']"
4,0.893617,"['haha', 'guy', 'bunch', 'loser']"


In [0]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn the vocabulary and IDF weights from the lemmatized comments and build
# the document-term matrix in a single pass.
tfidf_vect = TfidfVectorizer()
X_tfidf = tfidf_vect.fit_transform(data['comment_text_lemmatized'])

# Report (n_documents, n_terms) so the size of the feature space is visible.
print(X_tfidf.shape)

# Persist the fitted vectorizer to disk.
filename = "tfidf.sav"
joblib.dump(tfidf_vect, filename)

(1804874, 606027)


## 2. Building Machine Learning Model

## Extreme Gradient Boosting

XGBoost's hyperparameters

- **learning_rate**: step size shrinkage used to prevent overfitting. Range is [0,1]
- **max_depth**: determines how deeply each tree is allowed to grow during any boosting round.
- **subsample**: percentage of samples used per tree. Low value can lead to underfitting.
- **colsample_bytree**: percentage of features used per tree. High value can lead to overfitting.
- **n_estimators**: number of trees you want to build.
- **objective**: determines the loss function to be used, for example <br>
  &nbsp;&nbsp; *reg:squarederror* (formerly *reg:linear*) for regression problems<br>
  &nbsp;&nbsp; *reg:logistic* for logistic regression on targets in the range [0, 1]<br>
  &nbsp;&nbsp; *binary:logistic* for binary classification problems, returning a probability<br>
  
  
- **gamma**: controls whether a given node will split based on the expected reduction in loss after the split. A higher value leads to fewer splits. Supported only for tree-based learners.
- **alpha**: L1 regularization on leaf weights. A large value leads to more regularization.
- **lambda**: L2 regularization on leaf weights; it is smoother than L1 regularization.

### Single model

In [0]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, data["target"], test_size=0.2, random_state=123)

# Single source of truth for the hyperparameters, so the model, the log line
# and the saved filename cannot drift apart.
single_model_params = {
    "colsample_bytree": 0.3,
    "learning_rate": 0.3,
    "max_depth": 120,
    "alpha": 10,
    "n_estimators": 150,
}

start_time = datetime.datetime.now()
# 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', **single_model_params)

xg_reg.fit(X_train, y_train)
y_pred = xg_reg.predict(X_test)

print(
    "parameters: colsample_bytree = {colsample_bytree}, learning_rate = {learning_rate}, "
    "max_depth = {max_depth}, alpha = {alpha}, n_estimators = {n_estimators}".format(**single_model_params)
)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: {}".format(rmse))

# score() on a scikit-learn style regressor is the R^2 coefficient, not a
# percentage, so it is printed without a '%' sign.
score = xg_reg.score(X_test, y_test)
print("Score: {}".format(score))

end_time = datetime.datetime.now()
# total_seconds() covers the whole elapsed time; timedelta.seconds would
# silently drop whole days on a long training run.
print('Select Done..., Time Cost: {}'.format(int((end_time - start_time).total_seconds())))

# Persist the fitted model; the name encodes the depth and the number of trees.
filename = "model_depth_{max_depth}n_estimator_{n_estimators}.sav".format(**single_model_params)
joblib.dump(xg_reg, filename)

  if getattr(data, 'base', None) is not None and \


### Grid search

In [0]:
import joblib

from sklearn.model_selection import train_test_split
# Same 80/20 split as the single-model cell (identical seed), repeated here so
# this cell can be run on its own.
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, data["target"], test_size=0.2, random_state=123)

# Values explored for the two tuned hyperparameters.
max_depth = [30, 80, 120]
n_estimators = [50, 100, 150]

# Hyperparameters held constant across the whole grid; defined once so the
# model and the log line always agree.
fixed_params = {"colsample_bytree": 0.3, "learning_rate": 0.2, "alpha": 10}

for depth in max_depth:
    for n_estimator in n_estimators:

        # 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective.
        xg_reg = xgb.XGBRegressor(
            objective='reg:squarederror',
            max_depth=depth,
            n_estimators=n_estimator,
            **fixed_params
        )

        xg_reg.fit(X_train, y_train)
        y_pred = xg_reg.predict(X_test)

        print(
            "parameters: colsample_bytree = {colsample_bytree}, learning_rate = {learning_rate}, "
            "max_depth = {max_depth}, alpha = {alpha}, n_estimators = {n_estimators}".format(
                max_depth=depth, n_estimators=n_estimator, **fixed_params
            )
        )

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        print("RMSE: {}".format(rmse))

        # R^2 on the held-out rows.
        score = xg_reg.score(X_test, y_test)
        print("Score: {}".format(score))

        # Save every fitted model; the name encodes the grid point.
        filename = "model_depth_{}n_estimator_{}.sav".format(depth, n_estimator)
        joblib.dump(xg_reg, filename)

  if getattr(data, 'base', None) is not None and \


parameters: colsample_bytree = 0.3, learning_rate = 0.2, max_depth = 30, alpha = 10, n_estimators = 50
RMSE: 0.13721690296759934
Score: 0.517773667350127

parameters: colsample_bytree = 0.3, learning_rate = 0.2, max_depth = 30, alpha = 10, n_estimators = 100
RMSE: 0.13398713286524683
Score: 0.5402074988799477

parameters: colsample_bytree = 0.3, learning_rate = 0.2, max_depth = 30, alpha = 10, n_estimators = 150
RMSE: 0.1328326897959838
Score: 0.5480965779349566

parameters: colsample_bytree = 0.3, learning_rate = 0.2, max_depth = 80, alpha = 10, n_estimators = 50
RMSE: 0.13574270172780437
Score: 0.5280796842133185


In [0]:
import datetime
from sklearn.model_selection import GridSearchCV

# Base estimator: only n_estimators and max_depth are tuned below.
# 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1, alpha=10)
parameters = {'n_estimators': [50, 100], 'max_depth': [30, 90]}

start_time = datetime.datetime.now()

# 5-fold cross-validation over every combination in `parameters`, using all
# available cores. No `scoring` is given, so the estimator's own score() is used.
grid_search = GridSearchCV(estimator=xg_reg, param_grid=parameters, cv=5, n_jobs=-1)
grid_search.fit(X_tfidf, data["target"])

print("Best score: {}".format(grid_search.best_score_))
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
# Only report the parameters that were actually searched.
for param_name in sorted(parameters.keys()):
    print("\t{}: {}".format(param_name, best_parameters[param_name]))

end_time = datetime.datetime.now()
# total_seconds() covers the whole elapsed time; timedelta.seconds would
# silently drop whole days on a long search.
print('Select Done..., Time Cost: {}'.format(int((end_time - start_time).total_seconds())))