In [22]:
import pandas as pd
import numpy as np
import os, sys 
sys.path.append(os.environ['HOME'] + '/src/models/')
from deeplearning_models import DLTextClassifier
from feature_based_models import FBConstructivenessClassifier
from sklearn.model_selection import train_test_split

In [23]:
import matplotlib.pyplot as plt
%matplotlib inline

# classifiers / models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVC, SVR
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# other
from sklearn.preprocessing import normalize
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
import nltk
import time

import xgboost as xgb
from sklearn.metrics import f1_score, classification_report

### Train on C3 train and test on C3 test

In [24]:
# Read the C3 train/test CSVs (paths come from the C3_TRAIN / C3_TEST
# environment variables) and force the comment-text column to str.
C3_train_df = pd.read_csv(os.environ['C3_TRAIN']).astype({'pp_comment_text': str})

C3_test_df = pd.read_csv(os.environ['C3_TEST']).astype({'pp_comment_text': str})

In [25]:
# Training split: every column except the label becomes the feature frame,
# and the 'constructive_binary' column is the target.
X_C3_train = C3_train_df.drop(columns = ['constructive_binary'])
y_C3_train = C3_train_df['constructive_binary']

# Test split: same separation.
X_C3_test = C3_test_df.drop(columns = ['constructive_binary'])
y_C3_test = C3_test_df['constructive_binary']

In [26]:
# Build the feature-based classifier wrapper from the C3 training split, with
# the C3 test split passed as the second (X, y) pair.
# NOTE(review): presumably the second pair is the held-out evaluation data
# (scores on it are printed as "Validation") -- confirm against the class.
classifier = FBConstructivenessClassifier(X_C3_train, y_C3_train, X_C3_test, y_C3_test)

In [6]:
# Candidate estimators keyed by the label printed before each run.
# The values are classes, not instances, so every run starts from a fresh
# estimator with default hyperparameters.
models = {'logistic regression': LogisticRegression, 
          'SVM' : SGDClassifier, 
          'random forest' : RandomForestClassifier, 
          'xgboost' : xgb.XGBClassifier
         }

for model_name, model_class in models.items():
    t = time.time()
    print(model_name, ":")
    # Instantiate the estimator exactly once per run; the extra
    # `m = model_class()` that used to sit here was never used.
    pipeline = classifier.train_pipeline(classifier = model_class())    
    classifier.show_scores(pipeline)
    # Wall-clock time for training plus scoring of this model.
    elapsed_time = time.time() - t
    print("Elapsed time: %.1f s" % elapsed_time)
    print()

SVM :
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
Feature set:  ['ngram_feats', 'tfidf_feats', 'length_feats', 'argumentation_feats', 'text_quality_feats', 'named_entity_feats', 'perspective_content_value_feats', 'perspective_aggressiveness_feats', 'perspecitive_toxicity_feats']
COMMENTS COL:  pp_comment_text




Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   1.00
Validation accuracy: 0.92
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.99      1.00      1.00      4391
        1.0       1.00      1.00      1.00      5209

avg / total       1.00      1.00      1.00      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.89      0.92      0.91      1093
        1.0       0.94      0.91      0.92      1307

avg / total       0.92      0.92      0.92      2400

sklearn micro-F1-Score: 0.91625
Elapsed time: 42.2 s

xgboost :
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_s

  if diff:


Training accuracy:   0.95


  if diff:


Validation accuracy: 0.94


  if diff:


TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.94      0.95      0.95      4391
        1.0       0.96      0.95      0.96      5209

avg / total       0.95      0.95      0.95      9600



  if diff:


VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.92      0.94      0.93      1093
        1.0       0.95      0.93      0.94      1307

avg / total       0.94      0.94      0.94      2400

sklearn micro-F1-Score: 0.9375
Elapsed time: 102.8 s

logistic regression :
Classifier:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Feature set:  ['ngram_feats', 'tfidf_feats', 'length_feats', 'argumentation_feats', 'text_quality_feats', 'named_entity_feats', 'perspective_content_value_feats', 'perspective_aggressiveness_feats', 'perspecitive_toxicity_feats']
COMMENTS COL:  pp_comment_text
Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   1.00
Validation accu

In [7]:
# Retrain with a default XGBoost classifier, print its scores, and write the
# model scores to a CSV under <HOME>/models/.
pipeline = classifier.train_pipeline(classifier = xgb.XGBClassifier())    
classifier.show_scores(pipeline)
# os.path.join inserts the separator only when needed, so the path is correct
# whether or not the HOME environment variable ends with a slash.
classifier.write_model_scores_df(pipeline, results_csv_path = os.path.join(os.environ['HOME'], 'models/feature_based_model_predictions.csv'))

Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
Feature set:  ['ngram_feats', 'tfidf_feats', 'length_feats', 'argumentation_feats', 'text_quality_feats', 'named_entity_feats', 'perspective_content_value_feats', 'perspective_aggressiveness_feats', 'perspecitive_toxicity_feats']
COMMENTS COL:  pp_comment_text
Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5


  if diff:


Training accuracy:   0.95


  if diff:


Validation accuracy: 0.94


  if diff:


TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.94      0.95      0.95      4391
        1.0       0.96      0.95      0.96      5209

avg / total       0.95      0.95      0.95      9600



  if diff:


VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.92      0.94      0.93      1093
        1.0       0.95      0.93      0.94      1307

avg / total       0.94      0.94      0.94      2400

sklearn micro-F1-Score: 0.9375


  if diff:


Predictions file written:  /home/vkolhatk/dev/constructiveness/models/feature_based_model_predictions.csv


In [13]:
def run_experiments_on_feature_sets(fb_classifier,
                                    classifier = None,
                                    feature_sets = None):
    """Train and score one pipeline per feature set, then one on their union.

    For every entry of ``feature_sets`` a pipeline is trained through
    ``fb_classifier.train_pipeline`` and reported through
    ``fb_classifier.show_scores``. A final pipeline is then trained on the
    concatenation of all the feature sets, in the order given.

    Parameters
    ----------
    fb_classifier : object
        Must provide ``train_pipeline(classifier=..., feature_set=...)`` and
        ``show_scores(pipeline)``.
    classifier : estimator, optional
        Estimator handed to every ``train_pipeline`` call. When omitted, a
        new ``xgb.XGBClassifier()`` is created for this call.
    feature_sets : list of list of str, optional
        Feature-group names to evaluate one at a time. When omitted, each of
        the nine feature groups is evaluated on its own.

    Returns
    -------
    None
        Results are only printed.
    """
    # Defaults are resolved at call time. As signature defaults, the estimator
    # instance and the list would be created once and shared by every call.
    if classifier is None:
        classifier = xgb.XGBClassifier()
    if feature_sets is None:
        feature_sets = [['ngram_feats'], 
                        ['tfidf_feats'], 
                        ['length_feats'], 
                        ['argumentation_feats'], 
                        ['text_quality_feats'], 
                        ['named_entity_feats'], 
                        ['perspective_content_value_feats'], 
                        ['perspective_aggressiveness_feats'], 
                        ['perspecitive_toxicity_feats']]

    all_feats = []
    for feature_set in feature_sets: 
        print('\n\nFeature set:', feature_set)
        # Accumulate the union of all feature names for the final combined run.
        all_feats.extend(feature_set)
        pipeline = fb_classifier.train_pipeline(classifier = classifier, 
                                                feature_set = feature_set)    
        fb_classifier.show_scores(pipeline)

    print('\n\nResults with all features: ', all_feats)    
    pipeline = fb_classifier.train_pipeline(classifier = classifier, 
                                            feature_set = all_feats)    
    fb_classifier.show_scores(pipeline)

        

### Train on C3 train and test on NYT

In [27]:
# Reload the C3 training split so this section can run on its own.
C3_train_df = pd.read_csv(os.environ['C3_TRAIN'])
C3_train_df['pp_comment_text'] = C3_train_df['pp_comment_text'].astype(str)

# NYT data is read from the preprocessed feature file under <HOME>/data/processed/.
# os.path.join inserts the separator only when needed, so the path is correct
# whether or not the HOME environment variable ends with a slash.
#NYT_df = pd.read_csv(os.environ['NYT_YNACC'])
NYT_df = pd.read_csv(os.path.join(os.environ['HOME'], 'data/processed/NYT_YNACC_feats_preprocessed.csv'))
NYT_df['pp_comment_text'] = NYT_df['pp_comment_text'].astype(str)

# Reload the C3 test split as well.
C3_test_df = pd.read_csv(os.environ['C3_TEST'])
C3_test_df['pp_comment_text'] = C3_test_df['pp_comment_text'].astype(str)

In [28]:
# C3 training split: feature frame (all columns but the label) and target.
X_C3_train = C3_train_df.drop(columns = ['constructive_binary'])
y_C3_train = C3_train_df['constructive_binary']

# NYT data: same separation.
X_NYT = NYT_df.drop(columns = ['constructive_binary'])
y_NYT = NYT_df['constructive_binary']

# C3 test split: same separation.
X_C3_test = C3_test_df.drop(columns = ['constructive_binary'])
y_C3_test = C3_test_df['constructive_binary']

In [29]:
# Cross-domain setup: C3 training split as the first (X, y) pair, NYT data as
# the second pair.
# NOTE(review): presumably the second pair is what gets reported as
# "Validation" -- confirm against FBConstructivenessClassifier.
fb_classifier = FBConstructivenessClassifier(X_C3_train, y_C3_train, X_NYT, y_NYT)

In [17]:
# Per-feature-set experiments with the function's default classifier (XGBoost)
# and its default list of feature sets.
run_experiments_on_feature_sets(fb_classifier)



Feature set: ['ngram_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
Feature set:  ['ngram_feats']
COMMENTS COL:  pp_comment_text
Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5


  if diff:


Training accuracy:   0.83


  if diff:


Validation accuracy: 0.75


  if diff:


TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.79      0.87      0.83      4391
        1.0       0.88      0.81      0.84      5209

avg / total       0.84      0.83      0.84      9600



  if diff:


VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.76      0.74      0.75     15178
        1.0       0.75      0.77      0.76     15147

avg / total       0.76      0.75      0.75     30325

sklearn micro-F1-Score: 0.7547568013190435


Feature set: ['tfidf_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
Feature set:  ['tfidf_feats']
COMMENTS COL:  pp_comment_text
Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5


  if diff:


Training accuracy:   0.90


  if diff:


Validation accuracy: 0.67


  if diff:


TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.83      0.97      0.89      4391
        1.0       0.97      0.83      0.90      5209

avg / total       0.91      0.90      0.90      9600



  if diff:


VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.61      0.96      0.75     15178
        1.0       0.91      0.38      0.54     15147

avg / total       0.76      0.67      0.64     30325

sklearn micro-F1-Score: 0.6713932399010717


Feature set: ['length_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
Feature set:  ['length_feats']
COMMENTS COL:  pp_comment_text
Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5


  if diff:
  if diff:
  if diff:
  if diff:


Training accuracy:   0.94
Validation accuracy: 0.83
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.93      0.94      0.94      4391
        1.0       0.95      0.94      0.95      5209

avg / total       0.94      0.94      0.94      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.88      0.77      0.82     15178
        1.0       0.79      0.89      0.84     15147

avg / total       0.84      0.83      0.83     30325

sklearn micro-F1-Score: 0.8307996702390767


Feature set: ['argumentation_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
   

  if diff:
  if diff:
  if diff:
  if diff:


Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   0.92
Validation accuracy: 0.82
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.91      0.92      0.91      4391
        1.0       0.93      0.92      0.93      5209

avg / total       0.92      0.92      0.92      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.87      0.76      0.81     15178
        1.0       0.79      0.89      0.84     15147

avg / total       0.83      0.82      0.82     30325

sklearn micro-F1-Score: 0.824732069249794


Feature set: ['named_entity_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:log

  if diff:
  if diff:
  if diff:
  if diff:


Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   0.73
Validation accuracy: 0.72
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.66      0.86      0.75      4391
        1.0       0.84      0.63      0.72      5209

avg / total       0.76      0.73      0.73      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.68      0.84      0.75     15178
        1.0       0.79      0.60      0.68     15147

avg / total       0.74      0.72      0.72     30325

sklearn micro-F1-Score: 0.7226710634789777


Feature set: ['perspective_content_value_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objecti

  if diff:
  if diff:
  if diff:
  if diff:


Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   0.89
Validation accuracy: 0.83
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.90      0.86      0.88      4391
        1.0       0.89      0.92      0.90      5209

avg / total       0.89      0.89      0.89      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.88      0.75      0.81     15178
        1.0       0.78      0.90      0.84     15147

avg / total       0.83      0.83      0.82     30325

sklearn micro-F1-Score: 0.8256553998351195


Feature set: ['perspective_aggressiveness_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, object

  if diff:
  if diff:
  if diff:
  if diff:


Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   0.71
Validation accuracy: 0.75
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.67      0.71      0.69      4391
        1.0       0.74      0.71      0.73      5209

avg / total       0.71      0.71      0.71      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.79      0.68      0.73     15178
        1.0       0.72      0.82      0.77     15147

avg / total       0.76      0.75      0.75     30325



  if diff:
  if diff:
  if diff:
  if diff:


sklearn micro-F1-Score: 0.7513602638087387


Feature set: ['perspecitive_toxicity_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
Feature set:  ['perspecitive_toxicity_feats']
COMMENTS COL:  pp_comment_text
Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   0.75
Validation accuracy: 0.77
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.74      0.70      0.72      4391
        1.0       0.76      0.80      0.78      5209

avg / total       0.75      0.75      0.75      9600



  if diff:
  if diff:
  if diff:
  if diff:


VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.80      0.72      0.76     15178
        1.0       0.75      0.82      0.78     15147

avg / total       0.77      0.77      0.77     30325

sklearn micro-F1-Score: 0.7689035449299259


Results with all features:  ['ngram_feats', 'tfidf_feats', 'length_feats', 'argumentation_feats', 'text_quality_feats', 'named_entity_feats', 'perspective_content_value_feats', 'perspective_aggressiveness_feats', 'perspecitive_toxicity_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
Feature set:  ['ngram_feats', 'tfidf_feats', 'length_feats',

  if diff:


Training accuracy:   0.95


  if diff:


Validation accuracy: 0.84


  if diff:


TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.94      0.95      0.95      4391
        1.0       0.96      0.95      0.96      5209

avg / total       0.95      0.95      0.95      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.88      0.77      0.82     15178
        1.0       0.80      0.90      0.84     15147

avg / total       0.84      0.84      0.83     30325

sklearn micro-F1-Score: 0.8350865622423743


  if diff:


In [18]:
# Same per-feature-set experiments, this time with an SGDClassifier built with
# its default hyperparameters.
run_experiments_on_feature_sets(fb_classifier, classifier=SGDClassifier())



Feature set: ['ngram_feats']
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
Feature set:  ['ngram_feats']
COMMENTS COL:  pp_comment_text




Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   1.00
Validation accuracy: 0.81
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.99      1.00      1.00      4391
        1.0       1.00      0.99      1.00      5209

avg / total       1.00      1.00      1.00      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.82      0.80      0.81     15178
        1.0       0.81      0.83      0.82     15147

avg / total       0.81      0.81      0.81     30325

sklearn micro-F1-Score: 0.8143775762572134


Feature set: ['tfidf_feats']
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       s



Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   1.00
Validation accuracy: 0.68
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       1.00      1.00      1.00      4391
        1.0       1.00      1.00      1.00      5209

avg / total       1.00      1.00      1.00      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.93      0.38      0.54     15178
        1.0       0.61      0.97      0.75     15147

avg / total       0.77      0.68      0.65     30325

sklearn micro-F1-Score: 0.6763066776586975


Feature set: ['length_feats']
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       



Validation accuracy: 0.69
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.92      0.47      0.62      4391
        1.0       0.68      0.96      0.80      5209

avg / total       0.79      0.74      0.72      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.91      0.43      0.59     15178
        1.0       0.63      0.96      0.76     15147

avg / total       0.77      0.69      0.67     30325

sklearn micro-F1-Score: 0.6945754328112119


Feature set: ['text_quality_feats']
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
Feature set:  ['text_quality_feats']
COMMENTS COL:  pp_commen



TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.74      0.62      0.68      4391
        1.0       0.72      0.82      0.77      5209

avg / total       0.73      0.73      0.73      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.75      0.67      0.70     15178
        1.0       0.70      0.78      0.74     15147

avg / total       0.72      0.72      0.72     30325

sklearn micro-F1-Score: 0.7206265457543282


Feature set: ['perspective_content_value_feats']
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
Feature set:  ['perspective_content_value_feats']
COMMENTS COL:  pp_commen



TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.62      0.35      0.44      4391
        1.0       0.60      0.82      0.69      5209

avg / total       0.61      0.60      0.58      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.89      0.56      0.69     15178
        1.0       0.68      0.93      0.78     15147

avg / total       0.78      0.75      0.74     30325

sklearn micro-F1-Score: 0.7458532563891178


Feature set: ['perspecitive_toxicity_feats']
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
Feature set:  ['perspecitive_toxicity_feats']
COMMENTS COL:  pp_comment_text
M



Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   1.00
Validation accuracy: 0.82
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       1.00      0.99      1.00      4391
        1.0       1.00      1.00      1.00      5209

avg / total       1.00      1.00      1.00      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.88      0.74      0.80     15178
        1.0       0.78      0.90      0.83     15147

avg / total       0.83      0.82      0.82     30325

sklearn micro-F1-Score: 0.8199175597691674


### Train on C3 train and test on C3 test

In [30]:
# Rebind fb_classifier to the in-domain setup: C3 training split as the first
# (X, y) pair, C3 test split as the second pair. Every later cell that uses
# fb_classifier now refers to this object.
fb_classifier = FBConstructivenessClassifier(X_C3_train, y_C3_train, X_C3_test, y_C3_test)

In [20]:
# Per-feature-set experiments with an SGDClassifier built with its default
# hyperparameters, using whatever fb_classifier is currently bound to.
run_experiments_on_feature_sets(fb_classifier, classifier=SGDClassifier())



Feature set: ['ngram_feats']
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
Feature set:  ['ngram_feats']
COMMENTS COL:  pp_comment_text




Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   1.00
Validation accuracy: 0.88
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.99      1.00      1.00      4391
        1.0       1.00      0.99      1.00      5209

avg / total       1.00      1.00      1.00      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.83      0.93      0.88      1093
        1.0       0.94      0.84      0.88      1307

avg / total       0.89      0.88      0.88      2400

sklearn micro-F1-Score: 0.8804166666666666


Feature set: ['tfidf_feats']
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       s



Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   1.00
Validation accuracy: 0.72
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       1.00      1.00      1.00      4391
        1.0       1.00      1.00      1.00      5209

avg / total       1.00      1.00      1.00      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.95      0.40      0.56      1093
        1.0       0.66      0.98      0.79      1307

avg / total       0.79      0.72      0.69      2400

sklearn micro-F1-Score: 0.7179166666666665


Feature set: ['length_feats']
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       



Validation accuracy: 0.90
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.86      0.95      0.90      4391
        1.0       0.95      0.87      0.91      5209

avg / total       0.91      0.90      0.90      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.85      0.95      0.90      1093
        1.0       0.96      0.86      0.91      1307

avg / total       0.91      0.90      0.90      2400

sklearn micro-F1-Score: 0.9045833333333332


Feature set: ['named_entity_feats']
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
Feature set:  ['named_entity_feats']
COMMENTS COL:  pp_commen



Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   0.67
Validation accuracy: 0.67
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.63      0.67      0.65      4391
        1.0       0.71      0.67      0.69      5209

avg / total       0.67      0.67      0.67      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.63      0.67      0.65      1093
        1.0       0.71      0.66      0.68      1307

avg / total       0.67      0.67      0.67      2400

sklearn micro-F1-Score: 0.6670833333333334


Results with all features:  ['ngram_feats', 'tfidf_feats', 'length_feats', 'argumentation_feats', 'text_quality_feats', 'named_entity_feats', 'perspective_content_value_feats', 'perspective_aggressiveness_feats', 'perspecitive_toxicity_feats']
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=N



Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   1.00
Validation accuracy: 0.91
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       1.00      1.00      1.00      4391
        1.0       1.00      1.00      1.00      5209

avg / total       1.00      1.00      1.00      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.89      0.92      0.90      1093
        1.0       0.93      0.90      0.92      1307

avg / total       0.91      0.91      0.91      2400

sklearn micro-F1-Score: 0.9091666666666667


In [31]:
# Per-feature-set experiments with the function's default classifier (XGBoost),
# using whatever fb_classifier is currently bound to.
run_experiments_on_feature_sets(fb_classifier)



Feature set: ['ngram_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
Feature set:  ['ngram_feats']
COMMENTS COL:  pp_comment_text
Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5


  if diff:


Training accuracy:   0.83


  if diff:


Validation accuracy: 0.81


  if diff:


TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.79      0.87      0.83      4391
        1.0       0.88      0.81      0.84      5209

avg / total       0.84      0.83      0.84      9600



  if diff:


VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.77      0.84      0.80      1093
        1.0       0.86      0.79      0.82      1307

avg / total       0.82      0.81      0.81      2400

sklearn micro-F1-Score: 0.81375


Feature set: ['tfidf_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
Feature set:  ['tfidf_feats']
COMMENTS COL:  pp_comment_text
Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5


  if diff:


Training accuracy:   0.90


  if diff:


Validation accuracy: 0.60


  if diff:


TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.83      0.97      0.89      4391
        1.0       0.97      0.83      0.90      5209

avg / total       0.91      0.90      0.90      9600



  if diff:


VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.53      1.00      0.70      1093
        1.0       0.99      0.27      0.43      1307

avg / total       0.78      0.60      0.55      2400

sklearn micro-F1-Score: 0.6025


Feature set: ['length_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
Feature set:  ['length_feats']
COMMENTS COL:  pp_comment_text
Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5


  if diff:
  if diff:
  if diff:
  if diff:


Training accuracy:   0.94
Validation accuracy: 0.93
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.93      0.94      0.94      4391
        1.0       0.95      0.94      0.95      5209

avg / total       0.94      0.94      0.94      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.92      0.93      0.93      1093
        1.0       0.94      0.94      0.94      1307

avg / total       0.93      0.93      0.93      2400

sklearn micro-F1-Score: 0.9345833333333333


Feature set: ['argumentation_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
   

  if diff:
  if diff:
  if diff:
  if diff:


Training accuracy:   0.77
Validation accuracy: 0.76
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.76      0.73      0.75      4391
        1.0       0.78      0.81      0.79      5209

avg / total       0.77      0.77      0.77      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.74      0.73      0.74      1093
        1.0       0.78      0.79      0.78      1307

avg / total       0.76      0.76      0.76      2400

sklearn micro-F1-Score: 0.7633333333333333


Feature set: ['text_quality_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
    

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


Training accuracy:   0.92
Validation accuracy: 0.92
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.91      0.92      0.91      4391
        1.0       0.93      0.92      0.93      5209

avg / total       0.92      0.92      0.92      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.90      0.92      0.91      1093
        1.0       0.93      0.91      0.92      1307

avg / total       0.92      0.92      0.92      2400

sklearn micro-F1-Score: 0.9154166666666665


Feature set: ['named_entity_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
    

  if diff:
  if diff:
  if diff:
  if diff:


Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   0.71
Validation accuracy: 0.69
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.67      0.71      0.69      4391
        1.0       0.74      0.71      0.73      5209

avg / total       0.71      0.71      0.71      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.64      0.70      0.67      1093
        1.0       0.73      0.67      0.70      1307

avg / total       0.69      0.69      0.69      2400

sklearn micro-F1-Score: 0.6870833333333334


Feature set: ['perspecitive_toxicity_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='

  if diff:
  if diff:
  if diff:
  if diff:


Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   0.75
Validation accuracy: 0.72
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.74      0.70      0.72      4391
        1.0       0.76      0.80      0.78      5209

avg / total       0.75      0.75      0.75      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.71      0.67      0.69      1093
        1.0       0.74      0.77      0.75      1307

avg / total       0.72      0.72      0.72      2400

sklearn micro-F1-Score: 0.7237499999999999


Results with all features:  ['ngram_feats', 'tfidf_feats', 'length_feats', 'argumentation_feats', 'text_quality_feats', 'named_entity_feats', 'perspective_content_value_feats', 'perspective_aggressiveness_feats', 'perspecitive_toxicity_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample

  if diff:
  if diff:
  if diff:
  if diff:


Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5


  if diff:


Training accuracy:   0.95


  if diff:


Validation accuracy: 0.94


  if diff:


TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.94      0.95      0.95      4391
        1.0       0.96      0.95      0.96      5209

avg / total       0.95      0.95      0.95      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.92      0.94      0.93      1093
        1.0       0.95      0.93      0.94      1307

avg / total       0.94      0.94      0.94      2400

sklearn micro-F1-Score: 0.9375


  if diff:


### Train on NYT and test on C3 test 

In [None]:
# Cross-corpus experiment: train the feature-based classifier on the NYT data
# (X_NYT, y_NYT) and evaluate it on the C3 test split (X_C3_test, y_C3_test).
fb_classifier = FBConstructivenessClassifier(X_NYT, y_NYT, X_C3_test, y_C3_test)

# SGDClassifier shuffles the training data between epochs (shuffle=True by
# default), so an unseeded estimator yields different scores on every run.
# Pin the seed so the per-feature-set numbers are reproducible; 0 matches the
# random_state reported by the XGBClassifier runs in this notebook.
# NOTE: the outputs recorded below this cell were produced by an unseeded run
# (they print random_state=None); re-execute the cell to refresh them.
run_experiments_on_feature_sets(fb_classifier, classifier=SGDClassifier(random_state=0))



Feature set: ['ngram_feats']
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
Feature set:  ['ngram_feats']
COMMENTS COL:  pp_comment_text




Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   0.99
Validation accuracy: 0.83
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.98      1.00      0.99     15178
        1.0       1.00      0.98      0.99     15147

avg / total       0.99      0.99      0.99     30325

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.77      0.90      0.83      1093
        1.0       0.91      0.78      0.84      1307

avg / total       0.85      0.83      0.84      2400

sklearn micro-F1-Score: 0.835


Feature set: ['tfidf_feats']
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, 



Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   0.97
Validation accuracy: 0.72
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.97      0.98      0.97     15178
        1.0       0.98      0.97      0.97     15147

avg / total       0.97      0.97      0.97     30325

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.88      0.45      0.59      1093
        1.0       0.67      0.95      0.79      1307

avg / total       0.77      0.72      0.70      2400

sklearn micro-F1-Score: 0.7208333333333333


Feature set: ['length_feats']
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       



Training accuracy:   0.84
Validation accuracy: 0.86
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.85      0.81      0.83     15178
        1.0       0.82      0.86      0.84     15147

avg / total       0.84      0.84      0.84     30325

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.82      0.89      0.85      1093
        1.0       0.90      0.83      0.86      1307

avg / total       0.86      0.86      0.86      2400

sklearn micro-F1-Score: 0.8583333333333333


Feature set: ['argumentation_feats']
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
Feature set:  ['argumentation_feat



TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.84      0.82      0.83     15178
        1.0       0.83      0.84      0.84     15147

avg / total       0.83      0.83      0.83     30325

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.80      0.95      0.87      1093
        1.0       0.95      0.80      0.87      1307

avg / total       0.88      0.87      0.87      2400

sklearn micro-F1-Score: 0.87


Feature set: ['named_entity_feats']
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
Feature set:  ['named_entity_feats']
COMMENTS COL:  pp_comment_text
Model trained and pickled in file



Training accuracy:   0.78
Validation accuracy: 0.61
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.84      0.70      0.77     15178
        1.0       0.74      0.87      0.80     15147

avg / total       0.79      0.78      0.78     30325

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.58      0.52      0.55      1093
        1.0       0.63      0.69      0.66      1307

avg / total       0.61      0.61      0.61      2400

sklearn micro-F1-Score: 0.61125


Feature set: ['perspecitive_toxicity_feats']
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
Feature set:  ['perspecitive_toxicity



Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   1.00
Validation accuracy: 0.78
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       1.00      1.00      1.00     15178
        1.0       1.00      1.00      1.00     15147

avg / total       1.00      1.00      1.00     30325

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.79      0.70      0.74      1093
        1.0       0.77      0.84      0.80      1307

avg / total       0.78      0.78      0.78      2400

sklearn micro-F1-Score: 0.7770833333333333


In [None]:
# Repeat the per-feature-set experiments without a `classifier=` argument, so
# run_experiments_on_feature_sets falls back to its default estimator (the
# output recorded below reports an XGBClassifier).
# Relies on `fb_classifier` still being the instance built in the previous
# cell (NYT train / C3 test); the recorded train support of 30325 matches it.
# Run the cells in order, otherwise a stale `fb_classifier` is evaluated.
run_experiments_on_feature_sets(fb_classifier)



Feature set: ['ngram_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
Feature set:  ['ngram_feats']
COMMENTS COL:  pp_comment_text
Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5


  if diff:


Training accuracy:   0.83


  if diff:


Validation accuracy: 0.75


  if diff:


TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.78      0.91      0.84     15178
        1.0       0.90      0.75      0.81     15147

avg / total       0.84      0.83      0.83     30325



  if diff:


VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.67      0.89      0.76      1093
        1.0       0.87      0.63      0.73      1307

avg / total       0.78      0.75      0.74      2400

sklearn micro-F1-Score: 0.7466666666666667


Feature set: ['tfidf_feats']
Classifier:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
Feature set:  ['tfidf_feats']
COMMENTS COL:  pp_comment_text
Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5


  if diff:


Training accuracy:   0.84


  if diff:


Validation accuracy: 0.64


### Train on C3 train and test on NYT