<h2>XGBOOST with Doc2Vec word embedding</h2>

<h3>Contents of this file</h3>

1. XGBoost basic model

2. Hyper-parameter tuning

3. Cross validation of final model

4. Micro-scopic analysis

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [4]:
norm_train_data = pd.read_pickle("norm_train_data.uu")
norm_test_data = pd.read_pickle("norm_test_data.uu")

In [5]:
train_labels = norm_train_data['label'].astype(int)
test_labels = norm_test_data['label'].astype(int)

In [6]:
print('shape of normalized train data: {}'.format(norm_train_data.shape))
print('current columns:\n{}'.format(norm_train_data.columns))

shape of normalized train data: (807647, 36)
current columns:
Index(['index', 'label', 'comment', 'author', 'subreddit', 'score', 'ups',
       'downs', 'date', 'created_utc', 'parent_comment', 'word_count',
       'punctuation_count', 'has_repeated', 'exclaim_count', 'qns_mark_count',
       'ellipses_mark_count', 'interjection_count', 'laughter_words_count',
       'capitalized_word_count', 'partial_capital_word_count',
       'emoticon_count', 'clean_comment', 'lemmatized_comment',
       'lemmatized_parent_comment', 'clean_parent_comment',
       'lemmatized_clean_comment', 'clean_lemmatized_parent_comment',
       'vectorized_comment', 'vectorized_clean_comment', 'vectorized_parent',
       'vectorized_clean_parent', 'cosine_similarity_dirty_comments',
       'cosine_similarity_clean_comments', 'comment_sentiment',
       'parent_comment_sentiment'],
      dtype='object')


In [7]:
norm_train_data.describe()

Unnamed: 0,index,label,score,ups,downs,word_count,punctuation_count,has_repeated,exclaim_count,qns_mark_count,ellipses_mark_count,interjection_count,laughter_words_count,capitalized_word_count,partial_capital_word_count,emoticon_count,cosine_similarity_dirty_comments,cosine_similarity_clean_comments
count,807647.0,807647.0,807647.0,807647.0,807647.0,807647.0,807647.0,807647.0,807647.0,807647.0,807647.0,807647.0,807647.0,807647.0,807647.0,807647.0,807647.0,807647.0
mean,505342.9,0.50035,6.912703,5.5296,-0.146174,0.004264,0.000212,0.012556,0.002202,0.013006,0.004861,0.008823,0.00039,0.000141,8.3e-05,0.001157,0.51844,0.510574
std,291840.8,0.5,47.521331,41.687109,0.353281,0.004743,0.001167,0.111349,0.006939,0.035844,0.019006,0.03818,0.00287,0.002421,0.001425,0.010118,0.107341,0.108868
min,0.0,0.0,-507.0,-507.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,252575.5,0.0,1.0,0.0,0.0,0.001801,0.000102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.446967,0.437965
50%,505325.0,1.0,2.0,1.0,0.0,0.003602,0.000204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.516835,0.50814
75%,758079.5,1.0,4.0,3.0,0.0,0.005853,0.000306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.59004,0.582497
max,1010824.0,1.0,5818.0,5163.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
def metrics(test_data, prediction):
    cm = pd.DataFrame(confusion_matrix(test_data, prediction))
    cm.columns = ['Predicted Y=0','Predicted Y=1']
    cm.index = ['True Y=0','True Y=1']
    display(cm)

    accuracy = (cm.iloc[0,0]+cm.iloc[1,1])/(cm.iloc[0,0]+cm.iloc[1,1]+cm.iloc[0,1]+cm.iloc[1,0])
    print('Accuracy: '+str(accuracy))

    # Possible ways to further improve accuracy is to try adjusting the level of significance.

    # Calculate Sensitivity (true positive rate)
    sensitivity = cm.iloc[1, 1]/(cm.iloc[1, 1] + cm.iloc[1, 0])
    print('Sensitivity: '+ str(sensitivity))

    # Calculate Specificity (true negative rate)
    specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[0, 1])
    print('Specificity: '+str(specificity))

    #Precision
    precision = cm.iloc[1,1]/(cm.iloc[1,1]+cm.iloc[0,1])
    print('Precision: '+str(precision))

    # AUC
    print('ROC-AUC:',roc_auc_score(test_data, prediction))

    #F1 Score
    print("F1 score:", round(f1_score(test_data, prediction), 4))

<h3>XGBoost with only the dirty vectorized comments</h3>


In [9]:
train_dirty_comments = norm_train_data.loc[:, ['label', 'vectorized_comment',]].copy()

In [10]:
train_dirty_comments.head()

Unnamed: 0,label,vectorized_comment
0,0,"[-0.23815727, 0.0650364, 0.274105, 0.47422692,..."
1,0,"[-0.06878692, 0.64647627, -0.0010952452, 0.650..."
2,1,"[-0.10693383, 0.044041224, 0.08560472, 0.07766..."
3,1,"[-0.24483381, -0.35850817, 0.5864438, -0.03253..."
4,1,"[-0.16459303, 0.44265932, 0.76053095, 0.579429..."


In [11]:
train_subset = train_dirty_comments.iloc[0:10, :].copy()

In [12]:
train_subset['vectorized_comment'][0][0]

-0.23815727

In [13]:
def convert_array_to_cols(df, vec_dim):
    for row_num in df['vectorized_comment'].index:
        for i in range(vec_dim):
            df.loc[row_num, 'vec_dim{}'.format(i)] = df.loc[row_num, 'vectorized_comment'][i]
        
convert_array_to_cols(train_subset, 100)

NameError: name 'vec_dim' is not defined

In [14]:
train_subset.head()

Unnamed: 0,label,vectorized_comment
0,0,"[-0.23815727, 0.0650364, 0.274105, 0.47422692,..."
1,0,"[-0.06878692, 0.64647627, -0.0010952452, 0.650..."
2,1,"[-0.10693383, 0.044041224, 0.08560472, 0.07766..."
3,1,"[-0.24483381, -0.35850817, 0.5864438, -0.03253..."
4,1,"[-0.16459303, 0.44265932, 0.76053095, 0.579429..."


In [15]:
train_dirty_comments = train_dirty_comments.loc[:, ['label', 'vectorized_comment']]

In [16]:
%%time
xgb_model = xgb.XGBClassifier()
xgb_model.fit(list(train_dirty_comments.loc[:, 'vectorized_comment']), train_dirty_comments['label'], verbose=True)



CPU times: user 1h 3min 24s, sys: 17.5 s, total: 1h 3min 42s
Wall time: 8min 25s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [187]:
y_pred_1 = xgb_model.predict(list(norm_test_data['vectorized_comment']))

metrics(y_pred_1, norm_test_data['label'].astype(int))

Unnamed: 0,Predicted Y=0,Predicted Y=1
True Y=0,70150,38562
True Y=1,30805,62649


Accuracy: 0.6568809789974576
Sensitivity: 0.6703725897232863
Specificity: 0.6452829494444036
Precision: 0.6189939828674749
ROC-AUC: 0.6578277695838448
F1 score: 0.6437


In [189]:
type(y_pred_1)

numpy.ndarray

In [194]:
np.save('xgb_baseline_dirty_comments', y_pred_1)

array([1, 0, 1, ..., 0, 0, 0])

<h3>XGBoost with only clean comments (To compare against the uncleaned comments)</h3>

In [26]:
train_clean_comments = norm_train_data.loc[:, ['label', 'vectorized_clean_comment']].copy()

In [27]:
xgb_baseline_model_clean = xgb.XGBClassifier()
xgb_model.fit(list(train_clean_comments.loc[:, 'vectorized_clean_comment']), train_clean_comments['label'], verbose=True)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [28]:
y_pred_clean_comments = xgb_model.predict(list(norm_test_data['vectorized_clean_comment']))

In [29]:
metrics(y_pred_clean_comments, test_labels)

Unnamed: 0,Predicted Y=0,Predicted Y=1
True Y=0,69644,38205
True Y=1,31311,63006


Accuracy: 0.656143960903416
Sensitivity: 0.6680237921053469
Specificity: 0.6457547126074419
Precision: 0.6225212674511664
ROC-AUC: 0.6568892523563944
F1 score: 0.6445


<h3> XG boost with all features with vectorized comments </h3>
Note: basic XGboost without any feature selection and hyperparameter tuning

In [30]:
norm_train_data.columns

Index(['index', 'label', 'comment', 'author', 'subreddit', 'score', 'ups',
       'downs', 'date', 'created_utc', 'parent_comment', 'word_count',
       'punctuation_count', 'has_repeated', 'exclaim_count', 'qns_mark_count',
       'ellipses_mark_count', 'interjection_count', 'laughter_words_count',
       'capitalized_word_count', 'partial_capital_word_count',
       'emoticon_count', 'clean_comment', 'lemmatized_comment',
       'lemmatized_parent_comment', 'clean_parent_comment',
       'lemmatized_clean_comment', 'clean_lemmatized_parent_comment',
       'vectorized_comment', 'vectorized_clean_comment', 'vectorized_parent',
       'vectorized_clean_parent', 'cosine_similarity_dirty_comments',
       'cosine_similarity_clean_comments', 'comment_sentiment',
       'parent_comment_sentiment'],
      dtype='object')

In [295]:
train_with_engineered_features = norm_train_data.copy().drop(['vectorized_clean_comment', 'vectorized_parent', 'vectorized_clean_parent',
                                                            'comment', 'parent_comment', 'clean_comment', 'clean_parent_comment',
                                                             'lemmatized_comment', 'lemmatized_clean_comment', 'lemmatized_parent_comment', 'clean_lemmatized_parent_comment', 
                                                             'author', 'subreddit', 'score', 'ups', 'downs', 'date', 'created_utc', 'cosine_similarity_clean_comments',
                                                             'parent_comment_sentiment'], axis= 1)


In [296]:
train_with_engineered_features = train_with_engineered_features.drop(['index'], axis=1)

In [297]:
train_with_engineered_features.columns

Index(['label', 'word_count', 'punctuation_count', 'has_repeated',
       'exclaim_count', 'qns_mark_count', 'ellipses_mark_count',
       'interjection_count', 'laughter_words_count', 'capitalized_word_count',
       'partial_capital_word_count', 'emoticon_count', 'vectorized_comment',
       'cosine_similarity_dirty_comments', 'comment_sentiment'],
      dtype='object')

In [298]:
train_data_all_engineered_features.head()

Unnamed: 0,label,word_count,punctuation_count,has_repeated,exclaim_count,qns_mark_count,ellipses_mark_count,interjection_count,laughter_words_count,capitalized_word_count,partial_capital_word_count,emoticon_count,cosine_similarity_clean_comments,comment_sentiment,parent_comment_sentiment
0,0,0.002251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.552133,,
1,0,0.004052,0.000102,0.0,0.0,0.0,0.0,0.0,0.0,0.001203,0.0,0.0,0.579743,,
2,1,0.00045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.510553,,
3,1,0.004052,0.000306,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.423121,,
4,1,0.011256,0.00051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.471443,,


In [299]:
def sentiment_to_numeric(x):
    if x == 'neu':
        return 0
    elif x == 'pos':
        return 1
    elif x == 'neg': 
        return -1

train_with_engineered_features['comment_sentiment'] = train_with_engineered_features['comment_sentiment'].apply(sentiment_to_numeric)

In [300]:
train_with_engineered_features.head()

Unnamed: 0,label,word_count,punctuation_count,has_repeated,exclaim_count,qns_mark_count,ellipses_mark_count,interjection_count,laughter_words_count,capitalized_word_count,partial_capital_word_count,emoticon_count,vectorized_comment,cosine_similarity_dirty_comments,comment_sentiment
0,0,0.002251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[-0.23815727, 0.0650364, 0.274105, 0.47422692,...",0.538655,-1
1,0,0.004052,0.000102,0.0,0.0,0.0,0.0,0.0,0.0,0.001203,0.0,0.0,"[-0.06878692, 0.64647627, -0.0010952452, 0.650...",0.582335,1
2,1,0.00045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[-0.10693383, 0.044041224, 0.08560472, 0.07766...",0.515965,0
3,1,0.004052,0.000306,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,"[-0.24483381, -0.35850817, 0.5864438, -0.03253...",0.533188,0
4,1,0.011256,0.00051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[-0.16459303, 0.44265932, 0.76053095, 0.579429...",0.496283,-1


In [301]:
%%time
cols_to_include = ['cosine_similarity_dirty_comments', 'word_count', 'punctuation_count', 'has_repeated', 
                  'exclaim_count', 'qns_mark_count', 'ellipses_mark_count', 'interjection_count', 'laughter_words_count', 
                   'capitalized_word_count', 'partial_capital_word_count', 'emoticon_count','comment_sentiment']

CPU times: user 50 µs, sys: 525 µs, total: 575 µs
Wall time: 1.69 ms


In [302]:
%%time
X_train_arr = pd.DataFrame(train_with_engineered_features['vectorized_comment'].tolist()).join(train_with_engineered_features[cols_to_include])

CPU times: user 39.6 s, sys: 33.2 s, total: 1min 12s
Wall time: 1min 39s


In [303]:
X_train_arr.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,has_repeated,exclaim_count,qns_mark_count,ellipses_mark_count,interjection_count,laughter_words_count,capitalized_word_count,partial_capital_word_count,emoticon_count,comment_sentiment
0,-0.238157,0.065036,0.274105,0.474227,-0.621571,-0.069404,0.174043,0.008883,-0.124769,0.213284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
1,-0.068787,0.646476,-0.001095,0.65045,-0.583917,-0.192484,0.059875,0.275911,0.214799,0.530081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001203,0.0,0.0,1
2,-0.106934,0.044041,0.085605,0.077661,-0.173773,0.229178,-0.039625,0.034472,0.239187,0.244954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,-0.244834,-0.358508,0.586444,-0.032538,-0.859644,0.177933,-0.296081,0.00441,0.19423,0.360871,...,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0
4,-0.164593,0.442659,0.760531,0.57943,-1.001434,-0.723228,0.18769,-1.085732,0.177928,-0.013857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1


In [304]:
%%time
xgb_model_with_engineered_features.fit((X_train_arr),
                                       train_with_engineered_features['label'], verbose=True)



CPU times: user 1h 27min 59s, sys: 1min 6s, total: 1h 29min 5s
Wall time: 13min 5s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [241]:
test_with_engineered_features = norm_test_data.copy().drop(['vectorized_clean_comment', 'vectorized_parent_comment',
                                                            'vectorized_clean_parent_comment',
                                                            'comment', 'parent_comment', 'clean_comment', 'clean_parent_comment',
                                                             'lemmatized_comment', 'lemmatized_clean_comment', 'lemmatized_parent_comment',
                                                            'lemmatized_clean_parent_comment', 'author', 'subreddit', 'score',
                                                            'ups', 'downs', 'date', 'created_utc', 'cosine_similarity_clean_comments',
                                                             'index'], axis= 1)

In [281]:
len(train_with_engineered_features.columns)

15

In [282]:
len(test_with_engineered_features.columns)

15

In [242]:
test_with_engineered_features['comment_sentiment'] = test_with_engineered_features['comment_sentiment'].apply(sentiment_to_numeric)

In [305]:
X_test_df = pd.DataFrame(test_with_engineered_features['vectorized_comment'].tolist()).join(test_with_engineered_features[cols_to_include])

In [260]:
X_test_df.iloc[0, 100]

array([-0.2260461 , -0.5411476 , -0.5956079 ,  1.6732603 , -0.35423955,
       -0.28117514,  1.0684463 ,  0.13423854, -0.8507427 ,  1.8262612 ,
       -0.7148748 ,  0.4975234 , -0.86549145, -0.6982111 ,  0.2134533 ,
        0.18298747,  0.5885876 , -0.76619554,  1.0453223 , -0.90714866,
       -0.69089735,  1.1539632 ,  0.81549114,  0.21821809,  0.13845769,
        0.7591418 ,  0.02087549, -0.16588932, -0.28434   ,  0.12744519,
        0.12692545,  0.5278704 ,  0.24552296,  0.34511566, -0.4861376 ,
       -0.6935075 , -1.005111  ,  0.3457867 ,  1.1153344 ,  0.74597263,
       -0.12379747,  0.04674913,  1.4676919 ,  0.14065263, -1.2901154 ,
        0.08728639,  0.04770548,  1.5965544 , -0.2411312 , -0.31156746,
       -0.02273787, -1.2519696 ,  0.13721712, -0.20597284,  1.1222237 ,
       -0.28557092, -0.3173024 , -0.8671589 , -1.1810979 , -1.302789  ,
       -1.290435  ,  0.01784645, -0.6765577 , -0.46088463,  0.18265541,
       -1.1688226 ,  0.5169917 , -0.3895155 ,  0.7157061 ,  0.18

In [257]:
X_test_df = X_test_df.drop(columns='vectorized_comment')

In [306]:
X_test_df.shape

(202166, 113)

In [307]:
y_pred_dirty_comments_w_engineered_features = xgb_model_with_engineered_features.predict(X_test_df)

In [309]:
metrics(y_pred_dirty_comments_w_engineered_features, test_with_engineered_features['label'].astype(int))

Unnamed: 0,Predicted Y=0,Predicted Y=1
True Y=0,76839,42214
True Y=1,24116,58997


Accuracy: 0.6719032873974853
Sensitivity: 0.7098408191257686
Specificity: 0.6454184270870956
Precision: 0.582910948414698
ROC-AUC: 0.6776296231064323
F1 score: 0.6401


<H2>XGboost model with only features and no vectors</H2>

In [198]:
train_only_engineered_features = norm_train_data.copy().drop(['vectorized_clean_comment', 'vectorized_parent', 'vectorized_clean_parent',
                                                            'comment', 'parent_comment', 'clean_comment', 'clean_parent_comment',
                                                             'lemmatized_comment', 'lemmatized_clean_comment', 'lemmatized_parent_comment', 'clean_lemmatized_parent_comment', 
                                                             'author', 'subreddit', 'score', 'ups', 'downs', 'date', 'created_utc', 'cosine_similarity_clean_comments',
                                                             'parent_comment_sentiment', 'vectorized_comment'], axis= 1)

In [200]:

print(train_only_engineered_features.shape)

(807647, 15)


In [201]:
train_only_engineered_features.head()

Unnamed: 0,index,label,word_count,punctuation_count,has_repeated,exclaim_count,qns_mark_count,ellipses_mark_count,interjection_count,laughter_words_count,capitalized_word_count,partial_capital_word_count,emoticon_count,cosine_similarity_dirty_comments,comment_sentiment
0,417033,0,0.002251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.538655,neg
1,59081,0,0.004052,0.000102,0.0,0.0,0.0,0.0,0.0,0.0,0.001203,0.0,0.0,0.582335,pos
2,5664,1,0.00045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.515965,neu
3,366838,1,0.004052,0.000306,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.533188,neu
4,907940,1,0.011256,0.00051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.496283,neg


In [202]:
train_only_engineered_features = train_only_engineered_features.drop(['index'], axis=1)

In [204]:
train_only_engineered_features['comment_sentiment'] = train_only_engineered_features['comment_sentiment'].apply(sentiment_to_numeric)

In [205]:
xgb_model_2 = xgb.XGBClassifier()
xgb_model_2.fit(train_only_engineered_features.iloc[:, 1:], train_only_engineered_features['label'], verbose=True)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [208]:
norm_test_data.columns

Index(['index', 'label', 'comment', 'author', 'subreddit', 'score', 'ups',
       'downs', 'date', 'created_utc', 'parent_comment', 'lemmatized_comment',
       'lemmatized_parent_comment', 'clean_comment', 'clean_parent_comment',
       'lemmatized_clean_comment', 'lemmatized_clean_parent_comment',
       'vectorized_comment', 'vectorized_clean_comment',
       'vectorized_parent_comment', 'vectorized_clean_parent_comment',
       'cosine_similarity_dirty_comments', 'cosine_similarity_clean_comments',
       'word_count', 'punctuation_count', 'has_repeated', 'exclaim_count',
       'qns_mark_count', 'ellipses_mark_count', 'interjection_count',
       'laughter_words_count', 'capitalized_word_count',
       'partial_capital_word_count', 'emoticon_count', 'comment_sentiment'],
      dtype='object')

In [211]:
# ['vectorized_parent' 'vectorized_clean_parent'\n 'clean_lemmatized_parent_comment' 'parent_comment_sentiment']
test_only_engineered_features = norm_test_data.copy().drop(['vectorized_clean_comment', 'vectorized_parent_comment',
                                                            'vectorized_clean_parent_comment',
                                                            'comment', 'parent_comment', 'clean_comment', 'clean_parent_comment',
                                                             'lemmatized_comment', 'lemmatized_clean_comment', 'lemmatized_parent_comment',
                                                            'lemmatized_clean_parent_comment', 'author', 'subreddit', 'score',
                                                            'ups', 'downs', 'date', 'created_utc', 'cosine_similarity_clean_comments',
                                                             'vectorized_comment', 'index'], axis= 1)

In [215]:
test_only_engineered_features['comment_sentiment'] = test_only_engineered_features['comment_sentiment'].apply(sentiment_to_numeric)

In [216]:
test_only_engineered_features.head()

Unnamed: 0,label,cosine_similarity_dirty_comments,word_count,punctuation_count,has_repeated,exclaim_count,qns_mark_count,ellipses_mark_count,interjection_count,laughter_words_count,capitalized_word_count,partial_capital_word_count,emoticon_count,comment_sentiment
0,1,0.452225,0.013007,0.009346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
1,0,0.451581,0.008504,0.014019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,1,0.8253,0.001001,0.014019,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0
3,1,0.431966,0.008504,0.014019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
4,0,0.410118,0.010005,0.042056,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.142857,0.0,0


In [217]:
y_pred_only_engineered_features = xgb_model_2.predict(test_only_engineered_features.iloc[:,1:])

In [219]:
metrics(y_pred_only_engineered_features, test_only_engineered_features['label'].astype(int))

Unnamed: 0,Predicted Y=0,Predicted Y=1
True Y=0,100813,100433
True Y=1,142,778


Accuracy: 0.5025127865219671
Sensitivity: 0.8456521739130435
Specificity: 0.5009441181439631
Precision: 0.007686911501714241
ROC-AUC: 0.6732981460285032
F1 score: 0.0152


In [223]:
train_engineered_w_orig_features = norm_train_data.copy().drop(['vectorized_clean_comment', 'vectorized_parent', 'vectorized_clean_parent',
                                                            'comment', 'parent_comment', 'clean_comment', 'clean_parent_comment',
                                                             'lemmatized_comment', 'lemmatized_clean_comment', 'lemmatized_parent_comment', 'clean_lemmatized_parent_comment', 
                                                            'date', 'created_utc', 'cosine_similarity_clean_comments',
                                                             'parent_comment_sentiment', 'vectorized_comment', 'index'], axis= 1)

<H2>HyperParameter Tuning for baseline model with dirty comments</H2>

In [None]:
%%time
xgb_model_with_engineered_features.fit((X_train_arr),
                                       train_with_engineered_features['label'], verbose=True)
param_test1 = {
    'max_depth':range(6, 10, 3),
    'min_child_weight':range(6,10, 3),
#     'learning_rate':[i/100.0 for i in range(20,51, 10)],
    'n_estimators': range(100,301,100),
#     "early_stopping_rounds" : 10,
#     "eval_metric" = 'accuracy',
#     "eval_set" : [[X_test_df, test_with_engineered_features['label']]]
#      'reg_lambda':[i/10.0 for i in range(1,10)],
#      'reg_alpha':[i/10.0 for i in range(1,10)]
}

gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier(objective= 'binary:logistic', label_encoder=False, nthread=4), 
 param_grid = param_test1, scoring='accuracy',n_jobs=4,iid=False, cv=5)
gsearch1.fit(X_train_arr, train_with_engineered_features['label'].astype(int), verbose=True)
gsearch.cv_results_, gsearch.best_params_, gsearch.best_score_





In [None]:
test_data_all_engineered_features = norm_test_data.copy().drop(['index','author','subreddit','score','ups','downs','parent_comment',
                      'clean_comment','lemmatized_comment','lemmatized_clean_comment','lemmatized_parent_comment','clean_parent_comment',
                      'clean_lemmatized_parent_comment','vectorized_comment',
                      'vectorized_clean_comment',	'vectorized_parent',	'vectorized_clean_parent','date',
                      'created_utc','comment','cosine_similarity_dirty_comments'], axis = 1)

test_data_all_engineered_features['comment_sentiment'] = test_data_all_engineered_features['comment_sentiment'].apply(sentiment_to_numeric)
test_data_all_engineered_features['parent_comment_sentiment'] = test_data_all_engineered_features['parent_comment_sentiment'].apply(sentiment_to_numeric)

In [None]:
y_pred_all_engineered_features = xgb_model.predict(test_data_all_engineered_features.iloc[: , 1: ])
metrics(y_pred_all_engineered_features, test_data_all_engineered_features['label'])