In [1]:
from pandas import DataFrame, read_csv, concat, read_table, Series
from operator import itemgetter
import importlib
import re 
import string
import nltk
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
import model_helper_functions
importlib.reload(model_helper_functions)
from model_helper_functions import *
import preprocess_helper_functions
importlib.reload(preprocess_helper_functions)
from preprocess_helper_functions import *
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.stem.porter import *
stemmer = PorterStemmer()
from urlextract import URLExtract
extractor = URLExtract()
rare_word_thresh = 1

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
abbrev_words_dict = pickle.load(open('../data/abbreviated_words_map.csv', 'rb'))

In [3]:
def list_to_dict(mylist):
    mylist = [str(x).lower() for x in mylist]
    ret_dict = dict(zip(mylist, [1]*len(mylist))) #make dictionary for efficient search
    return ret_dict

In [4]:
all_badlist = read_table('../data/profane_list.txt', header=None, comment='#')
all_badlist = all_badlist.values.flatten().tolist()
all_badlist = sorted(set([x.lower() for x in all_badlist]))
all_badlist = [strip_non_printable_chars(str(x)).strip() for x in all_badlist]
#all_badlist_dict = dict(zip(all_badlist, [1] * len(all_badlist)))

#Read compiled acronyms and see if any acronyms are there
acronyms = read_csv('../data/compiled_acronyms_final.csv', encoding = 'latin-1')
acronyms = acronyms.apply(lambda x: x.astype(str).str.lower())
acronyms = acronyms.dropna()
acronyms = acronyms.drop_duplicates().reset_index(drop=True)
global acronyms_dict 
acronyms_dict = dict(acronyms.values)

from nltk.corpus import words
from nltk.corpus import wordnet
proper_words = words.words()
proper_words = sorted(set([x.lower() for x in proper_words]))
global proper_words_dict
proper_words_dict = list_to_dict(proper_words)

proper_words.extend(all_badlist)
global proper_words_with_profane_dict
proper_words_with_profane_dict = list_to_dict(proper_words)

In [5]:
extreme_profane = read_csv('../data/profane_x_common.csv', header=None)
extreme_profane = extreme_profane.values.flatten().tolist()
extreme_profane = [x.lower() for x in extreme_profane]

badlist3 = read_csv('../data/profane_list_common.csv', encoding='latin-1')
badlist_common = badlist3.values.flatten().tolist()
badlist_common = [x.lower() for x in badlist_common]

all_badlist_combined = sorted(set(all_badlist + badlist_common + extreme_profane))
all_badlist_combined_dict = dict(zip(all_badlist_combined, [1] * len(all_badlist_combined)))

profane_list_map = read_csv('../data/profane_list_common_mapping.csv', header=None)
profane_list_map = profane_list_map.apply(lambda x: x.astype(str).str.lower())
profane_list_map = dict(profane_list_map.values)

In [6]:
citynames = read_table('../data/citynames.txt', sep='\t', header=None, encoding='utf-8')
citynames = citynames.iloc[:, 1].str.lower().values
citynames = list_to_dict(citynames)

#Look for countries
countries = read_table('../data/countries.txt', header=None).values.flatten()
countries = list_to_dict(countries)

#Look for nationalities
nationalities = read_table('../data/nationalities.txt', header=None).values.flatten()
nationalities = list_to_dict(nationalities)

ethnicities = read_table('../data/ethnicities.txt', header=None).values.flatten()
ethnicities = list_to_dict(ethnicities)

#Look for persons name
person_names = []
for fn in ['../data/names.first.female.txt', 
           '../data/names.first.male.txt',
           '../data/names.last.txt',
           '../data/muslim_names.txt',
           '../data/englishnames.txt',
          ]:
    with open(fn, 'r') as ofd:
        for line in ofd.readlines():
            person_names.append(line.rstrip('\n').lower())

spanishnames = read_csv('../data/spanishnames.csv', encoding='utf-8')['nombre'].str.lower().str.split(' ')
spanishnames = list(set(spanishnames.apply(Series).unstack().values))
person_names.extend(spanishnames)

person_names_dict = list_to_dict(person_names)

### Read the data 

In [7]:
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [8]:
train_data = read_csv("../data/train.csv")
train_data.head()
train_data.fillna('NULL', inplace= True)
train_data['profane'] = train_data[classes].any(axis = 1).astype(int)

In [9]:
y = train_data['profane'].values

### Get word frequency table

In [10]:
# Get the Word Frequency table
tmp = train_data['comment_text'].str.cat(sep=' ')
words = tmp.split()
word_dist_dict = nltk.FreqDist(words)
word_dist_dict_most_common = word_dist_dict.most_common()

In [11]:
word_dist_dict_df = DataFrame(word_dist_dict_most_common, columns=['raw_word', 'freq'])
word_dist_dict_df.to_csv('../data/word_dist_train.csv', index=False)
pickle.dump(word_dist_dict, open('../data/word_dist_train.pkl', 'wb'))

In [12]:
pp_steps = [
            ['raw'],  
            ['convert_to_lower'],
            ['remove_whitespaces'], 
            ['remove_leaky'], 
            ['trim_words_len'],
            ['strip_non_printable_chars'],
            ['replace_abbreviation_words'],
            ['replace_acronyms'],
            ['remove_stopwords'],
            ['remove_rare_words'],
            ['remove_non_alphanumeric'],
            ['remove_non_alphabet_words'],
            ['remove_words_containing_non_alphabets'],
            ['black_listed_words_regex_mapping'],
            ['check_if_proper_name_place_or_ethnicity'],
            ['replace_profane_words_using_fuzzy'],
            ['replace_common_words_using_fuzzy'],
            ['lemmatize_english_words'],
            ['stemming_english_words'],
            ['extract_info_from_url'],
       
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars', 'remove_leaky'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars',  'replace_abbreviation_words', 'replace_acronyms'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars',  'remove_stopwords'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars',  'remove_rare_words'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars', 'replace_abbreviation_words', 'replace_acronyms', 
#      'remove_stopwords', 'remove_rare_words'], 
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars', 'remove_non_alphanumeric'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars', 'remove_non_alphabet_words'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars', 'remove_non_alphanumeric', 
#      'remove_non_alphabet_words'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars','replace_abbreviation_words', 'replace_acronyms',
#      'black_listed_words_regex_mapping'], 
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars','replace_abbreviation_words', 'replace_acronyms',
#      'black_listed_words_regex_mapping', 'replace_profane_words_using_fuzzy', 
#      'replace_common_words_using_fuzzy'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars','replace_abbreviation_words',  'replace_acronyms', 
#      'remove_leaky',  'extract_info_from_url',  'black_listed_words_regex_mapping', 
#      'replace_profane_words_using_fuzzy', 'replace_common_words_using_fuzzy', 
#      'check_if_proper_name_place_or_ethnicity'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars', 'replace_abbreviation_words',  'replace_acronyms', 
#      'remove_leaky',  'remove_non_alphabet_words', 'remove_stopwords', 
#      'stemming_english_words'],
#     ['convert_to_lower', 'lemmatize_english_words'],
#     ['convert_to_lower','remove_non_alphabet_words', 
#      'black_listed_words_regex_mapping',  'replace_common_words_using_fuzzy', 
#      'stemming_english_words']
    ]

In [13]:
maxlen = 30

def get_corresponding_mapping(word_dist_dict_most_common, op_type):
    print(op_type)
    if op_type == 'raw':
        new_dict = dict([(x[0], x[0]) for x in word_dist_dict_most_common])
    elif op_type ==  'convert_to_lower': # remove white spaces 
        new_dict = convert_to_lower_from_dict(word_dist_dict_most_common)
    elif op_type ==  'remove_whitespaces': # remove white spaces 
        new_dict = remove_white_spaces_from_dict(word_dist_dict_most_common)
    elif op_type == 'remove_leaky':
        new_dict = remove_leaky_information_from_dict(word_dist_dict_most_common)
    elif op_type == 'extract_info_from_url':
        new_dict = extract_info_from_url(word_dist_dict_most_common, extractor)
    elif op_type == 'trim_words_len':
        new_dict = trim_words_len(word_dist_dict_most_common, maxlen)
    elif op_type == 'replace_abbreviation_words':
        new_dict = replace_abbreviation_words_from_dict(word_dist_dict_most_common, abbrev_words_dict)
    elif op_type == 'strip_non_printable_chars':
        new_dict = strip_non_printable_chars_from_dict(word_dist_dict_most_common)
    elif op_type == 'replace_acronyms':
        new_dict = replace_acronyms_from_dict(word_dist_dict_most_common, acronyms_dict, proper_words_with_profane_dict)
    elif op_type == 'remove_stopwords':
        new_dict = remove_stopwords_from_dict(word_dist_dict_most_common, stop_words_dict)
    elif op_type == 'remove_rare_words':
        new_dict = remove_rare_words_from_dict(word_dist_dict_most_common, word_dist_dict, rare_word_thresh)
    elif op_type == 'remove_non_alphanumeric':
        new_dict = remove_non_alphanumeric_from_dict(word_dist_dict_most_common)    
    elif op_type == 'remove_non_alphabet_words':
        new_dict = remove_non_alphabet_words(word_dist_dict_most_common)    
    elif op_type == 'remove_words_containing_non_alphabets':
        new_dict = remove_words_containing_non_alphabets_from_dict(word_dist_dict_most_common)    
    elif op_type == 'black_listed_words_regex_mapping':
        new_dict = black_listed_words_regex_mapping_from_dict(word_dist_dict_most_common, all_badlist, profane_list_map, extreme_profane)
    elif op_type == 'replace_profane_words_using_fuzzy':
        new_dict = replace_profane_words_using_fuzzy(word_dist_dict_most_common, proper_words_dict, extreme_profane, profane_list_map, badlist_common)
    elif op_type == 'check_if_proper_name_place_or_ethnicity':
        new_dict = check_if_proper_name_place_or_ethnicity_from_dict(word_dist_dict_most_common, proper_words_dict, citynames, countries, nationalities, \
                                              ethnicities, person_names_dict)
    elif op_type == 'replace_common_words_using_fuzzy':
        new_dict = replace_common_words_using_fuzzy(word_dist_dict_most_common, word_dist_dict_most_common, wordnet_lemmatizer, proper_words_dict)
    elif op_type == 'lemmatize_english_words':
        new_dict = lemmatize_english_words(word_dist_dict_most_common, wordnet_lemmatizer)
    elif op_type == 'stemming_english_words':
        new_dict = stemming_english_words(word_dist_dict_most_common, stemmer)
    else:
        print ("Error......")
        new_dict = 'Error .......'
    return new_dict

def get_new_distribution(X):
    tmp = X.str.cat(sep=' ')
    words = tmp.split()
    w_d_dict = nltk.FreqDist(words) #Word distribution dict
    w_d_most_common = w_d_dict.most_common()
    return w_d_most_common

# def get_corresponding_mapping_multiple(word_dist_dict_most_common, op_types):
#     curr_list = word_dist_dict_most_common.copy()
#     new_dict = {}
#     for op_type in op_types:
#         print (op_type)
#         tmp_map = get_corresponding_mapping(curr_list, op_type)
#         if not new_dict:
#             new_dict = tmp_map
#         else: 
#             new_dict = update_dict_with_next_level_val(new_dict, tmp_map)
#         curr_list = [(new_dict[x], 1) for x in new_dict]
#     return new_dict

def get_corresponding_mapping_multiple(word_dist_dict_most_common, op_types, train_data):
    curr_word_dist_dict_most_common = word_dist_dict_most_common.copy()
    comment_data = train_data['comment_text']
    for op_type in op_types:
        print (op_type)
        new_mapped_dict = get_corresponding_mapping(curr_word_dist_dict_most_common, op_type)
        comment_data = comment_data.apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
        curr_word_dist_dict_most_common = get_new_distribution(comment_data)
    return comment_data

In [14]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
stop_words_dict = dict(zip(stop_words, [1] * len(stop_words)))

### Calling NB SVM algorithm

In [None]:
combined_results = pickle.load(open("../data/results_nbsvm.pkl", "rb"))

In [None]:
#Add the preprocessing step here 
import pickle
# combined_results = {} #Uncomment 
for pp_step in pp_steps:
    if len(pp_step) == 1:
        new_mapped_dict = get_corresponding_mapping(word_dist_dict_most_common, pp_step[0])
        X = train_data['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
    else:
        X = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_step, train_data)
    vocab = set(' '.join(X.values).split())
    vectorizer = TfidfVectorizer(ngram_range=(1,4), min_df=1, vocabulary= vocab)
    results = []
    k_fold_num = 0
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
    for train_index, test_index in cv.split(X, y):
        print ("\n\n *****Processing fold ", k_fold_num, " of ", cv.n_splits, " ......")
        X_train_data, X_test_data = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        X_train = vectorizer.fit_transform(X_train_data)
        X_test = vectorizer.transform(X_test_data)
        res = call_NB_SVM_algorithm(X_train, y_train, X_test, y_test)
        results.append(res)
        k_fold_num += 1
#         if k_fold_num ==2:
#             break
    scores = extract_combined_results(results)
    combined_results[' '.join(pp_step)] = scores
    pickle.dump(combined_results, open("../data/results_nbsvm.pkl", "wb"))

### Calling Logistic regression

In [30]:
combined_results = pickle.load(open("../data/results_logit.pkl", "rb"))

In [31]:
#Add the preprocessing step here 
# combined_results = {} #Uncomment 
for pp_step in pp_steps:
    if len(pp_step) == 1:
        new_mapped_dict = get_corresponding_mapping(word_dist_dict_most_common, pp_step[0])
        X = train_data['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
    else:
        X = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_step, train_data)
    vocab = set(' '.join(X.values).split())
    vectorizer = TfidfVectorizer(ngram_range=(1,4), min_df=1, vocabulary= vocab)
    results = []
    k_fold_num = 0
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
    for train_index, test_index in cv.split(X, y):
        print ("\n\n *****Processing fold ", k_fold_num, " of ", cv.n_splits, " ......")
        X_train_data, X_test_data = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        X_train = vectorizer.fit_transform(X_train_data)
        X_test = vectorizer.transform(X_test_data)
        res = call_logreg_algorithm(X_train, y_train, X_test, y_test)
        results.append(res)
        k_fold_num += 1

#         if k_fold_num ==1:
#             break
    scores = extract_combined_results(results)
    combined_results[' '.join(pp_step)] = scores
    pickle.dump(combined_results, open("../data/results_logit.pkl", "wb"))

convert_to_lower
convert_to_lower
remove_whitespaces
remove_whitespaces
trim_words_len
trim_words_len


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7351133951513797
Log Loss:  1.5395094535271634
Accuracy:  0.9554267452061662
AUC score:  0.9715448633514437
Num of comments missclassified:  7113
[[142597    753]
 [  6360   9870]]
             precision    recall  f1-score   support

          0    0.95730   0.99475   0.97567    143350
          1    0.92912   0.60813   0.73511     16230

avg / total    0.95444   0.95543   0.95120    159580

convert_to_lower
convert_to_lower
remove_whitespaces
remove

remove_whitespaces
remove_whitespaces
trim_words_len
trim_words_len
strip_non_printable_chars
strip_non_printable_chars
remove_non_alphanumeric
remove_non_alphanumeric
remove_non_alphabet_words
remove_non_alphabet_words


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7278044391121777
Log Loss:  1.571325386206043
Accuracy:  0.9545055771399925
AUC score:  0.9702170318930805
Num of comments missclassified:  7260
[[142614    736]
 [  6524   9706]]
             precision    recall  f1-score   support

          0    0.95626   0.99487   0.97518    143350
          1    0.92952   0.59803   0.72780     162

Done  50000
Done  100000
Done  150000
Done  200000
stemming_english_words
stemming_english_words
Done  50000
Done  100000
Done  150000


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7518747724790681
Log Loss:  1.4752289130227163
Accuracy:  0.9572878806868028
AUC score:  0.9738810422894987
Num of comments missclassified:  6816
[[142437    913]
 [  5903  10327]]
             precision    recall  f1-score   support

          0    0.96021   0.99363   0.97663    143350
          1    0.91877   0.63629   0.75187     16230

avg / total    0.95599   0.95729   0.95377    159580



### Call XGBoost Algorithm 

In [19]:
combined_results = pickle.load(open("../data/results_xgboost.pkl", "rb"))
len(combined_results.keys())

35

In [22]:
#Add the preprocessing step here 
# combined_results = {} #Uncomment 
for pp_step in pp_steps[20:]:
    if len(pp_step) == 1:
        new_mapped_dict = get_corresponding_mapping(word_dist_dict_most_common, pp_step[0])
        X = train_data['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
    else:
        X = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_step, train_data)
    vocab = set(' '.join(X.values).split())
    vectorizer = TfidfVectorizer(ngram_range=(1,4), min_df=1, vocabulary= vocab)
    results = []
    k_fold_num = 0
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
    for train_index, test_index in cv.split(X, y):
        print ("\n\n *****Processing fold ", k_fold_num, " of ", cv.n_splits, " ......")
        X_train_data, X_test_data = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        X_train = vectorizer.fit_transform(X_train_data)
        X_test = vectorizer.transform(X_test_data)
        res = call_xgboost_algorithm(xgb, vectorizer, X_train, y_train, X_test, y_test)
        results.append(res)
        k_fold_num += 1

#         if k_fold_num ==1:
#             break
    scores = extract_combined_results(results)
    combined_results[' '.join(pp_step)] = scores
    pickle.dump(combined_results, open("../data/results_xgboost.pkl", "wb"))

convert_to_lower
convert_to_lower
remove_whitespaces
remove_whitespaces
trim_words_len
trim_words_len


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.5734956461465147
Log Loss:  2.1308086271067452
Accuracy:  0.9383068053640807
AUC score:  0.919502285875283
Num of comments missclassified:  9845
[[143116    234]
 [  9611   6619]]
             precision    recall  f1-score   support

          0    0.93707   0.99837   0.96675    143350
          1    0.96585   0.40783   0.57350     16230

avg / total    0.94000   0.93831   0.92675    159580

convert_to_lower
convert_to_lower
remove_whitespaces
remove_

remove_whitespaces
remove_whitespaces
trim_words_len
trim_words_len
strip_non_printable_chars
strip_non_printable_chars
remove_non_alphanumeric
remove_non_alphanumeric
remove_non_alphabet_words
remove_non_alphabet_words


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.5723701326858035
Log Loss:  2.134488020511424
Accuracy:  0.938200275723775
AUC score:  0.9161101140068613
Num of comments missclassified:  9862
[[143118    232]
 [  9630   6600]]
             precision    recall  f1-score   support

          0    0.93695   0.99838   0.96669    143350
          1    0.96604   0.40665   0.57237     1623

Done  50000
Done  100000
Done  150000
Done  200000
stemming_english_words
stemming_english_words
Done  50000
Done  100000
Done  150000


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.6348413510747185
Log Loss:  1.9303904003526295
Accuracy:  0.9441095375360321
AUC score:  0.9327217975986543
Num of comments missclassified:  8919
[[142908    442]
 [  8477   7753]]
             precision    recall  f1-score   support

          0    0.94400   0.99692   0.96974    143350
          1    0.94606   0.47770   0.63484     16230

avg / total    0.94421   0.94411   0.93568    159580



# Fasttext Algorithm

In [20]:
combined_results = pickle.load(open("../data/individual_fasttext.pkl", "rb"))
len(combined_results.keys())

35

In [68]:
#combined_results = {}
for pp_step in pp_steps:
    if len(pp_step) == 1:
        new_mapped_dict = get_corresponding_mapping(word_dist_dict_most_common, pp_step[0])
        X = train_data['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
    else:
        X = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_step, train_data)
    results = []
    k_fold_num = 0
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
    for train_index, test_index in cv.split(X, y):
        print ("\n\n *****Processing fold ", k_fold_num, " of ", cv.n_splits, " ......")
        X_train_data, X_test_data = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        texts_train = X_train_data.values
        texts_test  = X_test_data.values
        train = X.copy()
        X_train, X_test, embedding_matrix = preprocess_data_for_fasttext(texts_train, texts_test, train)
        res = call_fasttext_algorithm(X_train, y_train, X_test, y_test, embedding_matrix)
        results.append(res)
        k_fold_num += 1

#         if k_fold_num ==1:
#             break
    scores = extract_combined_results(results)
    combined_results[' '.join(pp_step)] = scores
    pickle.dump(combined_results, open("../data/individual_fasttext_part.pkl", "wb"))

convert_to_lower
convert_to_lower
remove_whitespaces
remove_whitespaces
trim_words_len
trim_words_len
strip_non_printable_chars
strip_non_printable_chars
replace_abbreviation_words
replace_abbreviation_words
replace_acronyms
replace_acronyms
remove_leaky
remove_leaky
remove_non_alphabet_words
remove_non_alphabet_words
remove_stopwords
remove_stopwords
stemming_english_words
stemming_english_words
Done  50000
Done  100000
Done  150000
Done  200000


 *****Processing fold  0  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  1  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  2  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  3  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  4  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  5  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  6  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  7  of  10  ..

# <font color = 'red' > Greedy to select the best transformation sequence  </color>

In [29]:
#Add the preprocessing step here
def call_logit_to_recursive_transformation_Addition(pp_steps, word_dist_dict_most_common,train_data, algo):
    combined_results = {} #Uncomment 
    for pp_step in pp_steps:
        if len(pp_step) == 1:
            new_mapped_dict = get_corresponding_mapping(word_dist_dict_most_common, pp_step[0])
        else:
            new_mapped_dict = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_step, train_data)
        X = train_data['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
        vocab = set(' '.join(X.values).split())
        vectorizer = TfidfVectorizer(ngram_range=(1,4), min_df=1, vocabulary= vocab)
        results = []
        k_fold_num = 0
        cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
        for train_index, test_index in cv.split(X, y):
            print ("\n\n *****Processing fold ", k_fold_num, " of ", cv.n_splits, " ......")
            X_train_data, X_test_data = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            X_train = vectorizer.fit_transform(X_train_data)
            X_test = vectorizer.transform(X_test_data)
            if algo == 'logit':
                res = call_logreg_algorithm(X_train, y_train, X_test, y_test)
            elif algo == 'nbsvm':
                res = call_NB_SVM_algorithm(X_train, y_train, X_test, y_test)
            else:
                return 'ERROR'
            results.append(res)
            k_fold_num += 1

#             if k_fold_num ==1:
#                 break
        scores = extract_combined_results(results)
        combined_results[' '.join(pp_step)] = scores
        import pickle
        pickle.dump(combined_results, open("../data/results_logit_1.pkl", "wb"))
    return combined_results, X

In [17]:
#Add the preprocessing step here 
#Add the preprocessing step here
def call_xgboost_to_recursive_transformation_Addition(pp_steps, word_dist_dict_most_common,train_data):
    combined_results = {} #Uncomment 
    for pp_step in pp_steps:
        if len(pp_step) == 1:
            new_mapped_dict = get_corresponding_mapping(word_dist_dict_most_common, pp_step[0])
        else:
            new_mapped_dict = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_step, train_data)
        X = train_data['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
        vocab = set(' '.join(X.values).split())
        vectorizer = TfidfVectorizer(ngram_range=(1,4), min_df=1, vocabulary= vocab)
        results = []
        k_fold_num = 0
        cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
        for train_index, test_index in cv.split(X, y):
            print ("\n\n *****Processing fold ", k_fold_num, " of ", cv.n_splits, " ......")
            #print (len(train_index), len(test_index))
            #print("TRAIN:", train_index, "TEST:", test_index)
            X_train_data, X_test_data = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            X_train = vectorizer.fit_transform(X_train_data)
            X_test = vectorizer.transform(X_test_data)
            res = call_xgboost_algorithm(xgb, vectorizer, X_train, y_train, X_test, y_test)
            results.append(res)
            k_fold_num += 1

    #         if k_fold_num ==1:
    #             break
        scores = extract_combined_results(results)
        combined_results[' '.join(pp_step)] = scores
        import pickle
        pickle.dump(combined_results, open("../data/results_xgboost_1.pkl", "wb"))
    return combined_results, X

In [18]:
def get_words_dict_most_common(df):
    # Get the Word Frequency table
    tmp = df['comment_text'].str.cat(sep=' ')
    words = tmp.split()
    word_dist_dict = nltk.FreqDist(words)
    word_dist_dict_most_common = word_dist_dict.most_common()
    return word_dist_dict_most_common

In [17]:
transformations_orig = [             
            ['convert_to_lower'],
            ['remove_whitespaces'], 
            ['remove_leaky'], 
            ['trim_words_len'],
            ['strip_non_printable_chars'],
            ['replace_abbreviation_words'],
            ['replace_acronyms'],
            ['remove_stopwords'],
            ['remove_rare_words'],
            ['remove_non_alphanumeric'],
            ['remove_non_alphabet_words'],
            ['remove_words_containing_non_alphabets'],
            ['black_listed_words_regex_mapping'],
            ['check_if_proper_name_place_or_ethnicity'],
            ['replace_profane_words_using_fuzzy'],
            ['replace_common_words_using_fuzzy'],
            ['lemmatize_english_words'],
            ['stemming_english_words'],
            ['extract_info_from_url']
                  ]

# Logit

In [25]:
train_data_copy = train_data.copy()
word_dist_dict_most_common_copy = word_dist_dict_most_common

tr_scores = []
transformations = transformations_orig.copy()
pp_steps = ['black_listed_words_regex_mapping']
max_f1 = 0.7488
tr_scores.append((pp_steps[0], max_f1))
transformations.remove(pp_steps)

new_mapped_dict = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_steps, train_data_copy)
X = train_data_copy['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
train_data_copy['comment_text'] = X
word_dist_dict_most_common_copy = get_words_dict_most_common(train_data_copy)

black_listed_words_regex_mapping
black_listed_words_regex_mapping
Done  50000
Done  100000
Done  150000
Done  200000
Done  250000
Done  300000
Done  350000
Done  400000
Done  450000
Done  500000


In [26]:
#Loop through all the transformations to see which one gives the maximum boost to f1_score
while(True):
    tr_iter_score = []
    X_best = None
    tr_best = None
    max_f1_local = max_f1
    for tr in transformations:
        pp_steps = [tr]
        base_res, X = call_logit_to_recursive_transformation_Addition(pp_steps, word_dist_dict_most_common_copy,train_data_copy, 'logit')
        tr_iter_score.append((pp_steps[0][0], base_res))
        if max_f1_local < base_res[tr[0]][0]:
            max_f1_local = base_res[tr[0]][0]
            X_best = X
            tr_best = tr[0]  
            print ('\n\n\n ========= ', tr_best)
    key_neg_f1 = []
    for k in tr_iter_score:
        key_neg_f1.append((k[0], k[1][k[0]][0]))
    ordered = sorted(key_neg_f1,key=itemgetter(1), reverse=True)
    print(ordered)
    if max_f1 >= ordered[0][1]:
        print ('Any other transformation not needed')
        break
    # Add transformation into the    
    tr_scores.append(ordered[0])
    max_f1 = ordered[0][1]
    transformations.remove([ordered[0][0]])
    train_data_copy['comment_text'] = X_best
    word_dist_dict_most_common_copy = get_words_dict_most_common(train_data_copy)
    print(transformations)
    print ('*******************')
    print (tr_scores)

convert_to_lower


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7351802204349122
Log Loss:  1.5392930230421562
Accuracy:  0.9554330116555959
AUC score:  0.9714555142429597
Num of comments missclassified:  7112
[[142596    754]
 [  6358   9872]]
             precision    recall  f1-score   support

          0    0.95732   0.99474   0.97567    143350
          1    0.92904   0.60826   0.73518     16230

avg / total    0.95444   0.95543   0.95121    159580

remove_whitespaces


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ...



 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7337432457611328
Log Loss:  1.5464354244622378
Accuracy:  0.955226218824414
AUC score:  0.9708618178559386
Num of comments missclassified:  7145
[[142590    760]
 [  6385   9845]]
             precision    recall  f1-score   support

          0    0.95714   0.99470   0.97556    143350
          1    0.92834   0.60659   0.73374     16230

avg / total    0.95421   0.95523   0.95096    159580

remove_words_containing_non_alphabets


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ..

In [27]:
#Logit [('black_listed_words_regex_mapping',  0.7488 )]

# NBSVM

In [30]:
train_data_copy = train_data.copy()
word_dist_dict_most_common_copy = word_dist_dict_most_common

tr_scores = []
transformations = transformations_orig.copy()
pp_steps = ['replace_common_words_using_fuzzy']
max_f1 = 0.7968
tr_scores.append((pp_steps[0], max_f1))
transformations.remove(pp_steps)

new_mapped_dict = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_steps, train_data_copy)
X = train_data_copy['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
train_data_copy['comment_text'] = X
word_dist_dict_most_common_copy = get_words_dict_most_common(train_data_copy)

replace_common_words_using_fuzzy
replace_common_words_using_fuzzy
Done  50000
Done  100000
Done  150000
Done  200000
Done  250000
Done  300000
Done  350000
Done  400000
Done  450000
Done  500000


In [31]:
#Loop through all the transformations to see which one gives the maximum boost to f1_score
while(True):
    tr_iter_score = []
    X_best = None
    tr_best = None
    max_f1_local = max_f1
    for tr in transformations:
        pp_steps = [tr]
        base_res, X = call_logit_to_recursive_transformation_Addition(pp_steps, word_dist_dict_most_common_copy,train_data_copy, 'nbsvm')
        tr_iter_score.append((pp_steps[0][0], base_res))
        if max_f1_local < base_res[tr[0]][0]:
            max_f1_local = base_res[tr[0]][0]
            X_best = X
            tr_best = tr[0]  
            print ('\n\n\n ========= ', tr_best)
    key_neg_f1 = []
    for k in tr_iter_score:
        key_neg_f1.append((k[0], k[1][k[0]][0]))
    ordered = sorted(key_neg_f1,key=itemgetter(1), reverse=True)
    print(ordered)
    if max_f1 >= ordered[0][1]:
        print ('Any other transformation not needed')
        break
    # Add transformation into the    
    tr_scores.append(ordered[0])
    max_f1 = ordered[0][1]
    transformations.remove([ordered[0][0]])
    train_data_copy['comment_text'] = X_best
    word_dist_dict_most_common_copy = get_words_dict_most_common(train_data_copy)
    print(transformations)
    print ('*******************')
    print (tr_scores)

convert_to_lower


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7936014372581537
Log Loss:  1.292991809051499
Accuracy:  0.962564231106655
AUC score:  0.9772505215294356
Num of comments missclassified:  5974
[[142121   1229]
 [  4745  11485]]
             precision    recall  f1-score   support

          0    0.96769   0.99143   0.97942    143350
          1    0.90333   0.70764   0.79360     16230

avg / total    0.96115   0.96256   0.96052    159580

remove_whitespaces


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  .....



 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7904985499240436
Log Loss:  1.3133370312483605
Accuracy:  0.9619751848602581
AUC score:  0.9762176142953759
Num of comments missclassified:  6068
[[142064   1286]
 [  4782  11448]]
             precision    recall  f1-score   support

          0    0.96744   0.99103   0.97909    143350
          1    0.89901   0.70536   0.79050     16230

avg / total    0.96048   0.96198   0.95991    159580

remove_words_containing_non_alphabets


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  .

In [None]:
#NBSVM [('replace_common_words_using_fuzzy',  0.7968 )]

## XGBoost

In [32]:
train_data_copy = train_data.copy()
word_dist_dict_most_common_copy = word_dist_dict_most_common

tr_scores = []
transformations = transformations_orig.copy()
pp_steps = ['black_listed_words_regex_mapping']
max_f1 = 0.6252
tr_scores.append((pp_steps[0], max_f1))
transformations.remove(pp_steps)

new_mapped_dict = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_steps, train_data_copy)
X = train_data_copy['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
train_data_copy['comment_text'] = X
word_dist_dict_most_common_copy = get_words_dict_most_common(train_data_copy)


black_listed_words_regex_mapping
black_listed_words_regex_mapping
Done  50000
Done  100000
Done  150000
Done  200000
Done  250000
Done  300000
Done  350000
Done  400000
Done  450000
Done  500000


In [33]:
#Loop through all the transformations to see which one gives the maximum boost to f1_score
while(True):
    tr_iter_score = []
    for tr in transformations:
        pp_steps = [tr]
        base_res, X = call_xgboost_to_recursive_transformation_Addition(pp_steps, word_dist_dict_most_common_copy,train_data_copy)
        tr_iter_score.append((pp_steps[0][0], base_res))
    key_neg_f1 = []
    for k in tr_iter_score:
        key_neg_f1.append((k[0], k[1][k[0]][0]))
    ordered = sorted(key_neg_f1,key=itemgetter(1), reverse=True)
    print(ordered)
    if max_f1 >= ordered[0][1]:
        print ('Any other transformation not needed')
        break
    # Add transformation into the    
    tr_scores.append(ordered[0])
    max_f1 = ordered[0][1]
    transformations.remove([ordered[0][0]])
    train_data_copy['comment_text'] = X
    word_dist_dict_most_common_copy = get_words_dict_most_common(train_data_copy)
    print(transformations)
    print ('*******************')
    print (tr_scores)

convert_to_lower


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.5756802353246528
Log Loss:  2.1230169442529148
Accuracy:  0.9385323975435518
AUC score:  0.9195745832761139
Num of comments missclassified:  9809
[[143117    233]
 [  9576   6654]]
             precision    recall  f1-score   support

          0    0.93729   0.99837   0.96687    143350
          1    0.96617   0.40998   0.57568     16230

avg / total    0.94022   0.93853   0.92708    159580

remove_whitespaces


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ...



 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.5697406715607489
Log Loss:  2.1437947518347635
Accuracy:  0.9379308183982955
AUC score:  0.9164988095138317
Num of comments missclassified:  9905
[[143117    233]
 [  9672   6558]]
             precision    recall  f1-score   support

          0    0.93670   0.99837   0.96655    143350
          1    0.96569   0.40407   0.56974     16230

avg / total    0.93965   0.93793   0.92620    159580

remove_words_containing_non_alphabets


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  .

In [34]:
#NBSVM [('black_listed_words_regex_mapping',  0.6252)]

# FastText

In [19]:
def call_fasttext_to_recursive_transformation_Addition(pp_steps, word_dist_dict_most_common,train_data):
    combined_results = {}
    for pp_step in pp_steps:
        if len(pp_step) == 1:
            new_mapped_dict = get_corresponding_mapping(word_dist_dict_most_common, pp_step[0])
        else:
            new_mapped_dict = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_step, train_data)
        X = train_data['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
        results = []
        k_fold_num = 0
        cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
        for train_index, test_index in cv.split(X, y):
            print ("\n\n *****Processing fold ", k_fold_num, " of ", cv.n_splits, " ......")
            X_train_data, X_test_data = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            texts_train = X_train_data.values
            texts_test  = X_test_data.values
            train = X.copy()
            X_train, X_test, embedding_matrix = preprocess_data_for_fasttext(texts_train, texts_test, train)
            res = call_fasttext_algorithm(X_train, y_train, X_test, y_test, embedding_matrix)
            results.append(res)
            k_fold_num += 1

    #         if k_fold_num ==1:
    #             break
        scores = extract_combined_results(results)
        combined_results[' '.join(pp_step)] = scores
        pickle.dump(combined_results, open("../data/individual_fasttext.pkl", "wb"))
    return combined_results, X

In [20]:
train_data_copy = train_data.copy()
word_dist_dict_most_common_copy = word_dist_dict_most_common

tr_scores = []
transformations = transformations_orig.copy()
pp_steps = ['convert_to_lower']
max_f1 = 0.8055
tr_scores.append((pp_steps[0], max_f1))
transformations.remove(pp_steps)

new_mapped_dict = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_steps, train_data_copy)
X = train_data_copy['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
train_data_copy['comment_text'] = X
word_dist_dict_most_common_copy = get_words_dict_most_common(train_data_copy)

convert_to_lower
convert_to_lower


In [22]:
#Loop through all the transformations to see which one gives the maximum boost to f1_score
while(True):
    tr_iter_score = []
    for tr in transformations:
        pp_steps = [tr]
        base_res, X = call_fasttext_to_recursive_transformation_Addition(pp_steps, word_dist_dict_most_common_copy,train_data_copy)
        tr_iter_score.append((pp_steps[0][0], base_res))
    key_neg_f1 = []
    for k in tr_iter_score:
        key_neg_f1.append((k[0], k[1][k[0]][0]))
    ordered = sorted(key_neg_f1,key=itemgetter(1), reverse=True)
    print(ordered)
    if max_f1 >= ordered[0][1]:
        print ('Any other transformation not needed')
        break
    # Add transformation into the    
    tr_scores.append(ordered[0])
    max_f1 = ordered[0][1]
    transformations.remove([ordered[0][0]])
    train_data_copy['comment_text'] = X
    word_dist_dict_most_common_copy = get_words_dict_most_common(train_data_copy)
    print(transformations)
    print ('*******************')
    print (tr_scores)

remove_non_alphabet_words


 *****Processing fold  0  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  1  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  2  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  3  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  4  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  5  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  6  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  7  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  8  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  9  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3
F1-score:  0.801844241756039
Log Loss:  1.2836874477593945
Accuracy:  0.9628336884321343
AUC score:  0.979430545087716
Num of comments missclassified:  5931
[[141649   1701]
 [  4230  12000]]
             precision    recall  f1-score   support

Epoch 3/3


 *****Processing fold  1  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  2  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  3  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  4  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  5  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  6  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  7  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  8  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  9  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3
F1-score:  0.8046815423891782
Log Loss:  1.2750305039441256
Accuracy:  0.9630843464093245
AUC score:  0.9792061934078508
Num of comments missclassified:  5891
[[141554   1796]
 [  4095  12135]]
             precision    recall  f1-score   support

          0    0.97188   0.98747   0.97962    143350
          1    0.87108   0.74769



 *****Processing fold  1  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  2  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  3  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  4  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  5  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  6  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  7  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  8  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  9  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3
F1-score:  0.7925780861021675
Log Loss:  1.3525152231079416
Accuracy:  0.9608409575134729
AUC score:  0.977410318105555
Num of comments missclassified:  6249
[[141392   1958]
 [  4291  11939]]
             precision    recall  f1-score   support

          0    0.97055   0.98634   0.97838    143350
          1    0.85911   0.73561   0.79258 

Epoch 2/3
Epoch 3/3


 *****Processing fold  2  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  3  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  4  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  5  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  6  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  7  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  8  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  9  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3
F1-score:  0.7913798864016037
Log Loss:  1.3514325395553968
Accuracy:  0.9608722897606217
AUC score:  0.977445750730528
Num of comments missclassified:  6244
[[141493   1857]
 [  4387  11843]]
             precision    recall  f1-score   support

          0    0.96993   0.98705   0.97841    143350
          1    0.86445   0.72970   0.79138     16230

avg / total    0.95920   0.96087   0.95939

Epoch 3/3


 *****Processing fold  2  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  3  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  4  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  5  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  6  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  7  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  8  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  9  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3
F1-score:  0.7901652668653482
Log Loss:  1.3410425083711792
Accuracy:  0.9611730793332498
AUC score:  0.9774382598765008
Num of comments missclassified:  6196
[[141718   1632]
 [  4564  11666]]
             precision    recall  f1-score   support

          0    0.96880   0.98862   0.97861    143350
          1    0.87727   0.71879   0.79017     16230

avg / total    0.95949   0.96117   0.95944    15958

In [38]:
#FastText
[('convert_to_lower', 0.8055), 
('remove_whitespaces', 0.7924943340887881), 
('remove_leaky', 0.78793867120954), 
('trim_words_len', 0.7891367342812858), 
('strip_non_printable_chars', 0.7888652984194385), 
('replace_abbreviation_words', 0.7891574399135836), 
('replace_acronyms', 0.7905618424982578), 
('remove_stopwords', 0.7878088514797718), 
('remove_rare_words', 0.7775245739940819), 
('remove_non_alphanumeric', 0.7985856382802947), 
('remove_non_alphabet_words', 0.801844241756039), 
('remove_words_containing_non_alphabets', 0.7207078124440305), 
('black_listed_words_regex_mapping', 0.8046815423891782), 
('check_if_proper_name_place_or_ethnicity', 0.7935718260072661), 
('replace_profane_words_using_fuzzy', 0.7925780861021675), 
('replace_common_words_using_fuzzy', 0.7925047852513516), 
('lemmatize_english_words', 0.7913798864016037), 
('stemming_english_words', 0.7970322187214148), 
('extract_info_from_url', 0.7901652668653482)
]