In [1]:
from pandas import DataFrame, read_csv, concat, read_table, Series
from operator import itemgetter
import importlib
import re 
import string
import nltk
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
import model_helper_functions
importlib.reload(model_helper_functions)
from model_helper_functions import *
import preprocess_helper_functions
importlib.reload(preprocess_helper_functions)
from preprocess_helper_functions import *
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.stem.porter import *
stemmer = PorterStemmer()
from urlextract import URLExtract
extractor = URLExtract()
rare_word_thresh = 1

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
abbrev_words_dict = pickle.load(open('../data/abbreviated_words_map.csv', 'rb'))

In [3]:
def list_to_dict(mylist):
    mylist = [str(x).lower() for x in mylist]
    ret_dict = dict(zip(mylist, [1]*len(mylist))) #make dictionary for efficient search
    return ret_dict

In [4]:
all_badlist = read_table('../data/profane_list.txt', header=None, comment='#')
all_badlist = all_badlist.values.flatten().tolist()
all_badlist = sorted(set([x.lower() for x in all_badlist]))
all_badlist = [strip_non_printable_chars(str(x)).strip() for x in all_badlist]
#all_badlist_dict = dict(zip(all_badlist, [1] * len(all_badlist)))

#Read compiled acronyms and see if any acronyms are there
acronyms = read_csv('../data/compiled_acronyms_final.csv', encoding = 'latin-1')
acronyms = acronyms.apply(lambda x: x.astype(str).str.lower())
acronyms = acronyms.dropna()
acronyms = acronyms.drop_duplicates().reset_index(drop=True)
global acronyms_dict 
acronyms_dict = dict(acronyms.values)

from nltk.corpus import words
from nltk.corpus import wordnet
proper_words = words.words()
proper_words = sorted(set([x.lower() for x in proper_words]))
global proper_words_dict
proper_words_dict = list_to_dict(proper_words)

proper_words.extend(all_badlist)
global proper_words_with_profane_dict
proper_words_with_profane_dict = list_to_dict(proper_words)

In [5]:
extreme_profane = read_csv('../data/profane_x_common.csv', header=None)
extreme_profane = extreme_profane.values.flatten().tolist()
extreme_profane = [x.lower() for x in extreme_profane]

badlist3 = read_csv('../data/profane_list_common.csv', encoding='latin-1')
badlist_common = badlist3.values.flatten().tolist()
badlist_common = [x.lower() for x in badlist_common]

all_badlist_combined = sorted(set(all_badlist + badlist_common + extreme_profane))
all_badlist_combined_dict = dict(zip(all_badlist_combined, [1] * len(all_badlist_combined)))

profane_list_map = read_csv('../data/profane_list_common_mapping.csv', header=None)
profane_list_map = profane_list_map.apply(lambda x: x.astype(str).str.lower())
profane_list_map = dict(profane_list_map.values)

In [6]:
citynames = read_table('../data/citynames.txt', sep='\t', header=None, encoding='utf-8')
citynames = citynames.iloc[:, 1].str.lower().values
citynames = list_to_dict(citynames)

#Look for countries
countries = read_table('../data/countries.txt', header=None).values.flatten()
countries = list_to_dict(countries)

#Look for nationalities
nationalities = read_table('../data/nationalities.txt', header=None).values.flatten()
nationalities = list_to_dict(nationalities)

ethnicities = read_table('../data/ethnicities.txt', header=None).values.flatten()
ethnicities = list_to_dict(ethnicities)

#Look for persons name
person_names = []
for fn in ['../data/names.first.female.txt', 
           '../data/names.first.male.txt',
           '../data/names.last.txt',
           '../data/muslim_names.txt',
           '../data/englishnames.txt',
          ]:
    with open(fn, 'r') as ofd:
        for line in ofd.readlines():
            person_names.append(line.rstrip('\n').lower())

spanishnames = read_csv('../data/spanishnames.csv', encoding='utf-8')['nombre'].str.lower().str.split(' ')
spanishnames = list(set(spanishnames.apply(Series).unstack().values))
person_names.extend(spanishnames)

person_names_dict = list_to_dict(person_names)

### Read the data 

In [7]:
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [8]:
train_data = read_csv("../data/train.csv")
train_data.head()
train_data.fillna('NULL', inplace= True)
train_data['profane'] = train_data[classes].any(axis = 1).astype(int)

In [9]:
y = train_data['profane'].values

### Get word frequency table

In [10]:
# Get the Word Frequency table
tmp = train_data['comment_text'].str.cat(sep=' ')
words = tmp.split()
word_dist_dict = nltk.FreqDist(words)
word_dist_dict_most_common = word_dist_dict.most_common()

In [11]:
word_dist_dict_df = DataFrame(word_dist_dict_most_common, columns=['raw_word', 'freq'])
word_dist_dict_df.to_csv('../data/word_dist_train.csv', index=False)
pickle.dump(word_dist_dict, open('../data/word_dist_train.pkl', 'wb'))

In [12]:
pp_steps = [
#             ['raw'],  
#             ['convert_to_lower'],
#             ['remove_whitespaces'], 
#             ['remove_leaky'], 
#             ['trim_words_len'],
#             ['strip_non_printable_chars'],
#             ['replace_abbreviation_words'],
#             ['replace_acronyms'],
#             ['remove_stopwords'],
#             ['remove_rare_words'],
#             ['remove_non_alphanumeric'],
#             ['remove_non_alphabet_words'],
#             ['remove_words_containing_non_alphabets'],
#             ['black_listed_words_regex_mapping'],
#             ['check_if_proper_name_place_or_ethnicity'],
#             ['replace_profane_words_using_fuzzy'],
#             ['replace_common_words_using_fuzzy'],
#             ['lemmatize_english_words'],
#             ['stemming_english_words'],
#             ['extract_info_from_url'],
       
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars', 'remove_leaky'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars',  'replace_abbreviation_words', 'replace_acronyms'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars',  'remove_stopwords'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars',  'remove_rare_words'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars', 'replace_abbreviation_words', 'replace_acronyms', 
#      'remove_stopwords', 'remove_rare_words'], 
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars', 'remove_non_alphanumeric'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars', 'remove_non_alphabet_words'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars', 'remove_non_alphanumeric', 
#      'remove_non_alphabet_words'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars','replace_abbreviation_words', 'replace_acronyms',
#      'black_listed_words_regex_mapping'], 
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars','replace_abbreviation_words', 'replace_acronyms',
#      'black_listed_words_regex_mapping', 'replace_profane_words_using_fuzzy', 
#      'replace_common_words_using_fuzzy'],
#     ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
#      'strip_non_printable_chars','replace_abbreviation_words',  'replace_acronyms', 
#      'remove_leaky',  'extract_info_from_url',  'black_listed_words_regex_mapping', 
#      'replace_profane_words_using_fuzzy', 'replace_common_words_using_fuzzy', 
#      'check_if_proper_name_place_or_ethnicity'],
    ['convert_to_lower', 'remove_whitespaces', 'trim_words_len', 
     'strip_non_printable_chars', 'replace_abbreviation_words',  'replace_acronyms', 
     'remove_leaky',  'remove_non_alphabet_words', 'remove_stopwords', 
     'stemming_english_words'],
    ['convert_to_lower', 'lemmatize_english_words'],
#     ['convert_to_lower','remove_non_alphabet_words', 
#      'black_listed_words_regex_mapping',  'replace_common_words_using_fuzzy', 
#      'stemming_english_words']
    ]

In [13]:
maxlen = 30

def get_corresponding_mapping(word_dist_dict_most_common, op_type):
    print(op_type)
    if op_type == 'raw':
        new_dict = dict([(x[0], x[0]) for x in word_dist_dict_most_common])
    elif op_type ==  'convert_to_lower': # remove white spaces 
        new_dict = convert_to_lower_from_dict(word_dist_dict_most_common)
    elif op_type ==  'remove_whitespaces': # remove white spaces 
        new_dict = remove_white_spaces_from_dict(word_dist_dict_most_common)
    elif op_type == 'remove_leaky':
        new_dict = remove_leaky_information_from_dict(word_dist_dict_most_common)
    elif op_type == 'extract_info_from_url':
        new_dict = extract_info_from_url(word_dist_dict_most_common, extractor)
    elif op_type == 'trim_words_len':
        new_dict = trim_words_len(word_dist_dict_most_common, maxlen)
    elif op_type == 'replace_abbreviation_words':
        new_dict = replace_abbreviation_words_from_dict(word_dist_dict_most_common, abbrev_words_dict)
    elif op_type == 'strip_non_printable_chars':
        new_dict = strip_non_printable_chars_from_dict(word_dist_dict_most_common)
    elif op_type == 'replace_acronyms':
        new_dict = replace_acronyms_from_dict(word_dist_dict_most_common, acronyms_dict, proper_words_with_profane_dict)
    elif op_type == 'remove_stopwords':
        new_dict = remove_stopwords_from_dict(word_dist_dict_most_common, stop_words_dict)
    elif op_type == 'remove_rare_words':
        new_dict = remove_rare_words_from_dict(word_dist_dict_most_common, word_dist_dict, rare_word_thresh)
    elif op_type == 'remove_non_alphanumeric':
        new_dict = remove_non_alphanumeric_from_dict(word_dist_dict_most_common)    
    elif op_type == 'remove_non_alphabet_words':
        new_dict = remove_non_alphabet_words(word_dist_dict_most_common)    
    elif op_type == 'remove_words_containing_non_alphabets':
        new_dict = remove_words_containing_non_alphabets_from_dict(word_dist_dict_most_common)    
    elif op_type == 'black_listed_words_regex_mapping':
        new_dict = black_listed_words_regex_mapping_from_dict(word_dist_dict_most_common, all_badlist, profane_list_map, extreme_profane)
    elif op_type == 'replace_profane_words_using_fuzzy':
        new_dict = replace_profane_words_using_fuzzy(word_dist_dict_most_common, proper_words_dict, extreme_profane, profane_list_map, badlist_common)
    elif op_type == 'check_if_proper_name_place_or_ethnicity':
        new_dict = check_if_proper_name_place_or_ethnicity_from_dict(word_dist_dict_most_common, proper_words_dict, citynames, countries, nationalities, \
                                              ethnicities, person_names_dict)
    elif op_type == 'replace_common_words_using_fuzzy':
        new_dict = replace_common_words_using_fuzzy(word_dist_dict_most_common, word_dist_dict_most_common, wordnet_lemmatizer, proper_words_dict)
    elif op_type == 'lemmatize_english_words':
        new_dict = lemmatize_english_words(word_dist_dict_most_common, wordnet_lemmatizer)
    elif op_type == 'stemming_english_words':
        new_dict = stemming_english_words(word_dist_dict_most_common, stemmer)
    else:
        print ("Error......")
        new_dict = 'Error .......'
    return new_dict

def get_new_distribution(X):
    tmp = X.str.cat(sep=' ')
    words = tmp.split()
    w_d_dict = nltk.FreqDist(words) #Word distribution dict
    w_d_most_common = w_d_dict.most_common()
    return w_d_most_common

# def get_corresponding_mapping_multiple(word_dist_dict_most_common, op_types):
#     curr_list = word_dist_dict_most_common.copy()
#     new_dict = {}
#     for op_type in op_types:
#         print (op_type)
#         tmp_map = get_corresponding_mapping(curr_list, op_type)
#         if not new_dict:
#             new_dict = tmp_map
#         else: 
#             new_dict = update_dict_with_next_level_val(new_dict, tmp_map)
#         curr_list = [(new_dict[x], 1) for x in new_dict]
#     return new_dict

def get_corresponding_mapping_multiple(word_dist_dict_most_common, op_types, train_data):
    curr_word_dist_dict_most_common = word_dist_dict_most_common.copy()
    comment_data = train_data['comment_text']
    for op_type in op_types:
        print (op_type)
        new_mapped_dict = get_corresponding_mapping(curr_word_dist_dict_most_common, op_type)
        comment_data = comment_data.apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
        curr_word_dist_dict_most_common = get_new_distribution(comment_data)
    return comment_data

In [14]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
stop_words_dict = dict(zip(stop_words, [1] * len(stop_words)))

### Calling NB SVM algorithm

In [28]:
combined_results = pickle.load(open("../data/results_nbsvm.pkl", "rb"))

In [29]:
#Add the preprocessing step here 
import pickle
# combined_results = {} #Uncomment 
for pp_step in pp_steps:
    if len(pp_step) == 1:
        new_mapped_dict = get_corresponding_mapping(word_dist_dict_most_common, pp_step[0])
        X = train_data['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
    else:
        X = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_step, train_data)
    vocab = set(' '.join(X.values).split())
    vectorizer = TfidfVectorizer(ngram_range=(1,4), min_df=1, vocabulary= vocab)
    results = []
    k_fold_num = 0
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
    for train_index, test_index in cv.split(X, y):
        print ("\n\n *****Processing fold ", k_fold_num, " of ", cv.n_splits, " ......")
        X_train_data, X_test_data = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        X_train = vectorizer.fit_transform(X_train_data)
        X_test = vectorizer.transform(X_test_data)
        res = call_NB_SVM_algorithm(X_train, y_train, X_test, y_test)
        results.append(res)
        k_fold_num += 1
#         if k_fold_num ==2:
#             break
    scores = extract_combined_results(results)
    combined_results[' '.join(pp_step)] = scores
    pickle.dump(combined_results, open("../data/results_nbsvm.pkl", "wb"))

convert_to_lower
convert_to_lower
remove_whitespaces
remove_whitespaces
trim_words_len
trim_words_len


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7934215527070448
Log Loss:  1.294073996550994
Accuracy:  0.9625328988595062
AUC score:  0.9773238556493344
Num of comments missclassified:  5979
[[142119   1231]
 [  4748  11482]]
             precision    recall  f1-score   support

          0    0.96767   0.99141   0.97940    143350
          1    0.90317   0.70746   0.79342     16230

avg / total    0.96111   0.96253   0.96048    159580

convert_to_lower
convert_to_lower
remove_whitespaces
remove_

remove_whitespaces
remove_whitespaces
trim_words_len
trim_words_len
strip_non_printable_chars
strip_non_printable_chars
remove_non_alphanumeric
remove_non_alphanumeric
remove_non_alphabet_words
remove_non_alphabet_words


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7859110723626853
Log Loss:  1.3287034052796851
Accuracy:  0.9615302669507457
AUC score:  0.9762106108540447
Num of comments missclassified:  6139
[[142173   1177]
 [  4962  11268]]
             precision    recall  f1-score   support

          0    0.96628   0.99179   0.97887    143350
          1    0.90542   0.69427   0.78591     16

Done  50000
Done  100000
Done  150000
Done  200000
stemming_english_words
stemming_english_words
Done  50000
Done  100000
Done  150000


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7883614299996553
Log Loss:  1.3287042370454036
Accuracy:  0.9615302669507457
AUC score:  0.9767681574661072
Num of comments missclassified:  6139
[[142007   1343]
 [  4796  11434]]
             precision    recall  f1-score   support

          0    0.96733   0.99063   0.97884    143350
          1    0.89489   0.70450   0.78836     16230

avg / total    0.95996   0.96153   0.95947    159580



### Calling Logistic regression

In [30]:
combined_results = pickle.load(open("../data/results_logit.pkl", "rb"))

In [31]:
#Add the preprocessing step here 
# combined_results = {} #Uncomment 
for pp_step in pp_steps:
    if len(pp_step) == 1:
        new_mapped_dict = get_corresponding_mapping(word_dist_dict_most_common, pp_step[0])
        X = train_data['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
    else:
        X = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_step, train_data)
    vocab = set(' '.join(X.values).split())
    vectorizer = TfidfVectorizer(ngram_range=(1,4), min_df=1, vocabulary= vocab)
    results = []
    k_fold_num = 0
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
    for train_index, test_index in cv.split(X, y):
        print ("\n\n *****Processing fold ", k_fold_num, " of ", cv.n_splits, " ......")
        X_train_data, X_test_data = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        X_train = vectorizer.fit_transform(X_train_data)
        X_test = vectorizer.transform(X_test_data)
        res = call_logreg_algorithm(X_train, y_train, X_test, y_test)
        results.append(res)
        k_fold_num += 1

#         if k_fold_num ==1:
#             break
    scores = extract_combined_results(results)
    combined_results[' '.join(pp_step)] = scores
    pickle.dump(combined_results, open("../data/results_logit.pkl", "wb"))

convert_to_lower
convert_to_lower
remove_whitespaces
remove_whitespaces
trim_words_len
trim_words_len


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7351133951513797
Log Loss:  1.5395094535271634
Accuracy:  0.9554267452061662
AUC score:  0.9715448633514437
Num of comments missclassified:  7113
[[142597    753]
 [  6360   9870]]
             precision    recall  f1-score   support

          0    0.95730   0.99475   0.97567    143350
          1    0.92912   0.60813   0.73511     16230

avg / total    0.95444   0.95543   0.95120    159580

convert_to_lower
convert_to_lower
remove_whitespaces
remove

remove_whitespaces
remove_whitespaces
trim_words_len
trim_words_len
strip_non_printable_chars
strip_non_printable_chars
remove_non_alphanumeric
remove_non_alphanumeric
remove_non_alphabet_words
remove_non_alphabet_words


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7278044391121777
Log Loss:  1.571325386206043
Accuracy:  0.9545055771399925
AUC score:  0.9702170318930805
Num of comments missclassified:  7260
[[142614    736]
 [  6524   9706]]
             precision    recall  f1-score   support

          0    0.95626   0.99487   0.97518    143350
          1    0.92952   0.59803   0.72780     162

Done  50000
Done  100000
Done  150000
Done  200000
stemming_english_words
stemming_english_words
Done  50000
Done  100000
Done  150000


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7518747724790681
Log Loss:  1.4752289130227163
Accuracy:  0.9572878806868028
AUC score:  0.9738810422894987
Num of comments missclassified:  6816
[[142437    913]
 [  5903  10327]]
             precision    recall  f1-score   support

          0    0.96021   0.99363   0.97663    143350
          1    0.91877   0.63629   0.75187     16230

avg / total    0.95599   0.95729   0.95377    159580



### Call XGBoost Algorithm 

In [20]:
combined_results = pickle.load(open("../data/results_xgboost.pkl", "rb"))
combined_results.keys()

dict_keys(['raw', 'convert_to_lower', 'remove_whitespaces', 'remove_leaky', 'trim_words_len', 'strip_non_printable_chars', 'replace_abbreviation_words', 'replace_acronyms', 'remove_stopwords', 'remove_rare_words', 'remove_non_alphanumeric', 'remove_non_alphabet_words', 'remove_words_containing_non_alphabets', 'black_listed_words_regex_mapping', 'check_if_proper_name_place_or_ethnicity', 'replace_profane_words_using_fuzzy', 'replace_common_words_using_fuzzy', 'lemmatize_english_words', 'stemming_english_words', 'extract_info_from_url', 'convert_to_lower remove_whitespaces trim_words_len', 'convert_to_lower remove_whitespaces trim_words_len strip_non_printable_chars remove_leaky', 'convert_to_lower remove_whitespaces trim_words_len strip_non_printable_chars replace_abbreviation_words replace_acronyms', 'convert_to_lower remove_whitespaces trim_words_len strip_non_printable_chars remove_stopwords', 'convert_to_lower remove_whitespaces trim_words_len strip_non_printable_chars remove_rare_w

In [22]:
#Add the preprocessing step here 
# combined_results = {} #Uncomment 
for pp_step in pp_steps[20:]:
    if len(pp_step) == 1:
        new_mapped_dict = get_corresponding_mapping(word_dist_dict_most_common, pp_step[0])
        X = train_data['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
    else:
        X = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_step, train_data)
    vocab = set(' '.join(X.values).split())
    vectorizer = TfidfVectorizer(ngram_range=(1,4), min_df=1, vocabulary= vocab)
    results = []
    k_fold_num = 0
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
    for train_index, test_index in cv.split(X, y):
        print ("\n\n *****Processing fold ", k_fold_num, " of ", cv.n_splits, " ......")
        X_train_data, X_test_data = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        X_train = vectorizer.fit_transform(X_train_data)
        X_test = vectorizer.transform(X_test_data)
        res = call_xgboost_algorithm(xgb, vectorizer, X_train, y_train, X_test, y_test)
        results.append(res)
        k_fold_num += 1

#         if k_fold_num ==1:
#             break
    scores = extract_combined_results(results)
    combined_results[' '.join(pp_step)] = scores
    pickle.dump(combined_results, open("../data/results_xgboost.pkl", "wb"))

convert_to_lower
convert_to_lower
remove_whitespaces
remove_whitespaces
trim_words_len
trim_words_len


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.5734956461465147
Log Loss:  2.1308086271067452
Accuracy:  0.9383068053640807
AUC score:  0.919502285875283
Num of comments missclassified:  9845
[[143116    234]
 [  9611   6619]]
             precision    recall  f1-score   support

          0    0.93707   0.99837   0.96675    143350
          1    0.96585   0.40783   0.57350     16230

avg / total    0.94000   0.93831   0.92675    159580

convert_to_lower
convert_to_lower
remove_whitespaces
remove_

remove_whitespaces
remove_whitespaces
trim_words_len
trim_words_len
strip_non_printable_chars
strip_non_printable_chars
remove_non_alphanumeric
remove_non_alphanumeric
remove_non_alphabet_words
remove_non_alphabet_words


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.5723701326858035
Log Loss:  2.134488020511424
Accuracy:  0.938200275723775
AUC score:  0.9161101140068613
Num of comments missclassified:  9862
[[143118    232]
 [  9630   6600]]
             precision    recall  f1-score   support

          0    0.93695   0.99838   0.96669    143350
          1    0.96604   0.40665   0.57237     1623

Done  50000
Done  100000
Done  150000
Done  200000
stemming_english_words
stemming_english_words
Done  50000
Done  100000
Done  150000


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.6348413510747185
Log Loss:  1.9303904003526295
Accuracy:  0.9441095375360321
AUC score:  0.9327217975986543
Num of comments missclassified:  8919
[[142908    442]
 [  8477   7753]]
             precision    recall  f1-score   support

          0    0.94400   0.99692   0.96974    143350
          1    0.94606   0.47770   0.63484     16230

avg / total    0.94421   0.94411   0.93568    159580



# Fasttext Algorithm

In [32]:
combined_results = pickle.load(open("../data/individual_fasttext.pkl", "rb"))
combined_results.keys()

dict_keys(['raw', 'convert_to_lower', 'remove_whitespaces', 'remove_leaky', 'trim_words_len', 'strip_non_printable_chars', 'replace_abbreviation_words', 'replace_acronyms', 'remove_stopwords', 'remove_rare_words', 'remove_non_alphanumeric', 'replace_common_words_using_fuzzy', 'remove_non_alphabet_words', 'remove_words_containing_non_alphabets', 'black_listed_words_regex_mapping', 'check_if_proper_name_place_or_ethnicity', 'replace_profane_words_using_fuzzy', 'lemmatize_english_words', 'stemming_english_words', 'extract_info_from_url', 'convert_to_lower remove_whitespaces trim_words_len', 'convert_to_lower remove_whitespaces trim_words_len strip_non_printable_chars remove_leaky', 'convert_to_lower remove_whitespaces trim_words_len strip_non_printable_chars replace_abbreviation_words replace_acronyms', 'convert_to_lower remove_whitespaces trim_words_len strip_non_printable_chars remove_stopwords', 'convert_to_lower remove_whitespaces trim_words_len strip_non_printable_chars remove_rare_w

In [68]:
#combined_results = {}
for pp_step in pp_steps:
    if len(pp_step) == 1:
        new_mapped_dict = get_corresponding_mapping(word_dist_dict_most_common, pp_step[0])
        X = train_data['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
    else:
        X = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_step, train_data)
    results = []
    k_fold_num = 0
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
    for train_index, test_index in cv.split(X, y):
        print ("\n\n *****Processing fold ", k_fold_num, " of ", cv.n_splits, " ......")
        X_train_data, X_test_data = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        texts_train = X_train_data.values
        texts_test  = X_test_data.values
        train = X.copy()
        X_train, X_test, embedding_matrix = preprocess_data_for_fasttext(texts_train, texts_test, train)
        res = call_fasttext_algorithm(X_train, y_train, X_test, y_test, embedding_matrix)
        results.append(res)
        k_fold_num += 1

#         if k_fold_num ==1:
#             break
    scores = extract_combined_results(results)
    combined_results[' '.join(pp_step)] = scores
    pickle.dump(combined_results, open("../data/individual_fasttext_part.pkl", "wb"))

convert_to_lower
convert_to_lower
remove_whitespaces
remove_whitespaces
trim_words_len
trim_words_len
strip_non_printable_chars
strip_non_printable_chars
replace_abbreviation_words
replace_abbreviation_words
replace_acronyms
replace_acronyms
remove_leaky
remove_leaky
remove_non_alphabet_words
remove_non_alphabet_words
remove_stopwords
remove_stopwords
stemming_english_words
stemming_english_words
Done  50000
Done  100000
Done  150000
Done  200000


 *****Processing fold  0  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  1  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  2  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  3  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  4  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  5  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  6  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  7  of  10  ..

# <font color = 'red' > Greedy to select the best transformation sequence  </color>

In [26]:
#Add the preprocessing step here
def call_logit_to_recursive_transformation_Addition(pp_steps, word_dist_dict_most_common,train_data):
    combined_results = {} #Uncomment 
    for pp_step in pp_steps:
        if len(pp_step) == 1:
            new_mapped_dict = get_corresponding_mapping(word_dist_dict_most_common, pp_step[0])
        else:
            new_mapped_dict = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_step)
        X = train_data['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
        vocab = set(' '.join(X.values).split())
        vectorizer = TfidfVectorizer(ngram_range=(1,4), min_df=1, vocabulary= vocab)
        results = []
        k_fold_num = 0
        cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
        for train_index, test_index in cv.split(X, y):
            print ("\n\n *****Processing fold ", k_fold_num, " of ", cv.n_splits, " ......")
            X_train_data, X_test_data = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            X_train = vectorizer.fit_transform(X_train_data)
            X_test = vectorizer.transform(X_test_data)
#             res = call_NB_SVM_algorithm(X_train, y_train, X_test, y_test)
            res = call_logreg_algorithm(X_train, y_train, X_test, y_test)
            results.append(res)
            k_fold_num += 1

#             if k_fold_num ==1:
#                 break
        scores = extract_combined_results(results)
        combined_results[' '.join(pp_step)] = scores
        import pickle
        pickle.dump(combined_results, open("../data/results_logit.pkl", "wb"))
    return combined_results, X

In [27]:
#Add the preprocessing step here 
#Add the preprocessing step here
def call_xgboost_to_recursive_transformation_Addition(pp_steps, word_dist_dict_most_common,train_data):
    combined_results = {} #Uncomment 
    for pp_step in pp_steps:
        if len(pp_step) == 1:
            new_mapped_dict = get_corresponding_mapping(word_dist_dict_most_common, pp_step[0])
        else:
            new_mapped_dict = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_step)
        X = train_data['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
        vocab = set(' '.join(X.values).split())
        vectorizer = TfidfVectorizer(ngram_range=(1,4), min_df=1, vocabulary= vocab)
        results = []
        k_fold_num = 0
        cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
        for train_index, test_index in cv.split(X, y):
            print ("\n\n *****Processing fold ", k_fold_num, " of ", cv.n_splits, " ......")
            #print (len(train_index), len(test_index))
            #print("TRAIN:", train_index, "TEST:", test_index)
            X_train_data, X_test_data = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            X_train = vectorizer.fit_transform(X_train_data)
            X_test = vectorizer.transform(X_test_data)
            res = call_xgboost_algorithm(xgb, vectorizer, X_train, y_train, X_test, y_test)
            results.append(res)
            k_fold_num += 1

    #         if k_fold_num ==1:
    #             break
        scores = extract_combined_results(results)
        combined_results[' '.join(pp_step)] = scores
        import pickle
        pickle.dump(combined_results, open("../data/results_xgboost_1.pkl", "wb"))
    return combined_results, X

In [36]:
transformations = [ ['raw'],  
                    ['convert_to_lower'],
                    ['remove_whitespaces'], 
                    ['remove_leaky'], 
                    ['replace_abbreviation_words'],
                    ['strip_non_printable_chars'],
                    ['replace_acronyms'],
                    ['remove_stopwords'],
                    ['remove_rare_words'],
                    ['remove_non_alphanumeric'],
                    ['remove_non_alphabet_words'],
                    ['remove_words_containing_non_alphabets'],
                    ['black_listed_words_regex_mapping'],
                    ['check_if_proper_name_place_or_ethnicity'],
                    ['replace_profane_words_using_fuzzy'],
                    ['replace_common_words_using_fuzzy'],
                    ['lemmatize_english_words'],
                    ['stemming_english_words'],
                    ['extract_info_from_url']
                  ]

In [31]:
pp_steps = [transformations[0]]
base_res, X = call_logit_to_recursive_transformation_Addition(pp_steps, word_dist_dict_most_common,train_data)
#Get the f1_score
tr_scores = []
tr_scores.append((transformations[0][0], base_res[transformations[0][0]][0]))
max_f1 = base_res[transformations[0][0]][0]
#remove the raw transformation
transformations.remove(pp_steps[0])

raw


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7406912920931613
Log Loss:  1.513320733501024
Accuracy:  0.9561849855871664
AUC score:  0.9719737794320009
Num of comments missclassified:  6992
[[142602    748]
 [  6244   9986]]
             precision    recall  f1-score   support

          0    0.95805   0.99478   0.97607    143350
          1    0.93031   0.61528   0.74069     16230

avg / total    0.95523   0.95618   0.95213    159580



In [32]:
train_data_copy = train_data.copy()
word_dist_dict_most_common_copy = word_dist_dict_most_common

In [36]:
def get_words_dict_most_common(df):
    # Get the Word Frequency table
    tmp = df['comment_text'].str.cat(sep=' ')
    words = tmp.split()
    word_dist_dict = nltk.FreqDist(words)
    word_dist_dict_most_common = word_dist_dict.most_common()
    return word_dist_dict_most_common

In [34]:
#Loop through all the transformations to see which one gives the maximum boost to f1_score
while(True):
    tr_iter_score = []
    X_best = None
    tr_best = None
    max_f1_local = max_f1
    for tr in transformations:
        pp_steps = [tr]
        base_res, X = call_logit_to_recursive_transformation_Addition(pp_steps, word_dist_dict_most_common_copy,train_data_copy)
        tr_iter_score.append((pp_steps[0][0], base_res))
        if max_f1_local < base_res[tr[0]][0]:
            max_f1_local = base_res[tr[0]][0]
            X_best = X
            tr_best = tr[0]  
            print ('\n\n\n ========= ', tr_best)
    key_neg_f1 = []
    for k in tr_iter_score:
        key_neg_f1.append((k[0], k[1][k[0]][0]))
    ordered = sorted(key_neg_f1,key=itemgetter(1), reverse=True)
    print(ordered)
    if max_f1 >= ordered[0][1]:
        print ('Any other transformation not needed')
        break
    # Add transformation into the    
    tr_scores.append(ordered[0])
    max_f1 = ordered[0][1]
    transformations.remove([ordered[0][0]])
    train_data_copy['comment_text'] = X_best
    word_dist_dict_most_common_copy = get_words_dict_most_common(train_data_copy)
    print(transformations)
    print ('*******************')
    print (tr_scores)

convert_to_lower


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7351802204349122
Log Loss:  1.5392930230421562
Accuracy:  0.9554330116555959
AUC score:  0.9714555142429597
Num of comments missclassified:  7112
[[142596    754]
 [  6358   9872]]
             precision    recall  f1-score   support

          0    0.95732   0.99474   0.97567    143350
          1    0.92904   0.60826   0.73518     16230

avg / total    0.95444   0.95543   0.95121    159580

remove_whitespaces


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ...



 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.6577154936193533
Log Loss:  1.8925165586461195
Accuracy:  0.9452061661862389
AUC score:  0.9462181083702385
Num of comments missclassified:  8744
[[142435    915]
 [  7829   8401]]
             precision    recall  f1-score   support

          0    0.94790   0.99362   0.97022    143350
          1    0.90178   0.51762   0.65772     16230

avg / total    0.94321   0.94521   0.93844    159580

black_listed_words_regex_mapping
Done  100000
Done  200000
Done  300000
Done  400000
Done  500000


 *****Processing fold  0  of  10  ......


 *****Processing f

[['convert_to_lower'], ['remove_whitespaces'], ['remove_leaky'], ['replace_abbreviation_words'], ['strip_non_printable_chars'], ['replace_acronyms'], ['remove_stopwords'], ['remove_rare_words'], ['remove_non_alphanumeric'], ['remove_non_alphabet_words'], ['remove_words_containing_non_alphabets'], ['check_if_proper_name_place_or_ethnicity'], ['replace_profane_words_using_fuzzy'], ['replace_common_words_using_fuzzy'], ['lemmatize_english_words'], ['stemming_english_words'], ['extract_info_from_url']]
*******************
[('raw', 0.7406912920931613), ('black_listed_words_regex_mapping', 0.7499725325032045)]
convert_to_lower


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8 



 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.745866098331741
Log Loss:  1.4968720717574673
Accuracy:  0.9566612357438276
AUC score:  0.9728205487433111
Num of comments missclassified:  6916
[[142515    835]
 [  6081  10149]]
             precision    recall  f1-score   support

          0    0.95908   0.99418   0.97631    143350
          1    0.92398   0.62532   0.74587     16230

avg / total    0.95551   0.95666   0.95287    159580

remove_words_containing_non_alphabets


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ..

[['convert_to_lower'], ['remove_whitespaces'], ['remove_leaky'], ['replace_abbreviation_words'], ['strip_non_printable_chars'], ['replace_acronyms'], ['remove_stopwords'], ['remove_rare_words'], ['remove_non_alphanumeric'], ['remove_non_alphabet_words'], ['remove_words_containing_non_alphabets'], ['check_if_proper_name_place_or_ethnicity'], ['replace_profane_words_using_fuzzy'], ['replace_common_words_using_fuzzy'], ['lemmatize_english_words'], ['extract_info_from_url']]
*******************
[('raw', 0.7406912920931613), ('black_listed_words_regex_mapping', 0.7499725325032045), ('stemming_english_words', 0.7514657150140199)]
convert_to_lower


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 ****



 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.75047369188165
Log Loss:  1.4821548639152438
Accuracy:  0.9570873543050508
AUC score:  0.9737794257685293
Num of comments missclassified:  6848
[[142434    916]
 [  5932  10298]]
             precision    recall  f1-score   support

          0    0.96002   0.99361   0.97653    143350
          1    0.91832   0.63450   0.75047     16230

avg / total    0.95578   0.95709   0.95353    159580

remove_words_containing_non_alphabets


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ...

F1-score:  0.7523716061498201
Log Loss:  1.4745797067485211
Accuracy:  0.9573066800350921
AUC score:  0.9740172055822078
Num of comments missclassified:  6813
[[142417    933]
 [  5880  10350]]
             precision    recall  f1-score   support

          0    0.96035   0.99349   0.97664    143350
          1    0.91731   0.63771   0.75237     16230

avg / total    0.95597   0.95731   0.95383    159580

remove_whitespaces


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7523716061498201
Log Loss:  1.4745797067485211
Accuracy:  0.9573066800350921
AUC score:  0.9740172055822078
Num of comments missc



 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.688047310360514
Log Loss:  1.7810530750805875
Accuracy:  0.9484333876425617
AUC score:  0.9541818040759995
Num of comments missclassified:  8229
[[142276   1074]
 [  7155   9075]]
             precision    recall  f1-score   support

          0    0.95212   0.99251   0.97189    143350
          1    0.89418   0.55915   0.68805     16230

avg / total    0.94623   0.94843   0.94303    159580

check_if_proper_name_place_or_ethnicity
Done  100000
Done  200000


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Pr



 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7533824222859009
Log Loss:  1.4715497150328753
Accuracy:  0.9573944103271087
AUC score:  0.9742113123157025
Num of comments missclassified:  6799
[[142396    954]
 [  5845  10385]]
             precision    recall  f1-score   support

          0    0.96057   0.99334   0.97668    143350
          1    0.91587   0.63986   0.75338     16230

avg / total    0.95602   0.95739   0.95397    159580

strip_non_printable_chars


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 ***



 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7538500561655253
Log Loss:  1.470251157176015
Accuracy:  0.9574320090236872
AUC score:  0.9743350238043507
Num of comments missclassified:  6793
[[142385    965]
 [  5828  10402]]
             precision    recall  f1-score   support

          0    0.96068   0.99327   0.97670    143350
          1    0.91511   0.64091   0.75385     16230

avg / total    0.95604   0.95743   0.95404    159580

extract_info_from_url
Done  50000
Done  100000
Done  150000
Done  200000


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-



 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7450894487039066
Log Loss:  1.5111574207570486
Accuracy:  0.9562476500814638
AUC score:  0.9717271627917573
Num of comments missclassified:  6982
[[142394    956]
 [  6026  10204]]
             precision    recall  f1-score   support

          0    0.95940   0.99333   0.97607    143350
          1    0.91434   0.62871   0.74509     16230

avg / total    0.95482   0.95625   0.95258    159580

remove_non_alphanumeric


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****



 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7553657388975352
Log Loss:  1.4628923453134737
Accuracy:  0.9576450683042987
AUC score:  0.9742261822712873
Num of comments missclassified:  6759
[[142386    964]
 [  5795  10435]]
             precision    recall  f1-score   support

          0    0.96089   0.99328   0.97682    143350
          1    0.91543   0.64295   0.75537     16230

avg / total    0.95627   0.95765   0.95429    159580

remove_leaky


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing 

Done  150000


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.7553657388975352
Log Loss:  1.4628923453134737
Accuracy:  0.9576450683042987
AUC score:  0.9742261822712873
Num of comments missclassified:  6759
[[142386    964]
 [  5795  10435]]
             precision    recall  f1-score   support

          0    0.96089   0.99328   0.97682    143350
          1    0.91543   0.64295   0.75537     16230

avg / total    0.95627   0.95765   0.95429    159580

[('convert_to_lower', 0.7553657388975352), ('remove_whitespaces', 0.7553657388975352), ('remove_leaky', 0.7553657388975352), ('strip_non_printable_c

In [None]:
# NBSVM
[('raw', 0.7957122242836528), ('replace_common_words_using_fuzzy', 0.7963421342134214)]

#Regex

[('raw', 0.7406912920931613), ('black_listed_words_regex_mapping', 0.7499725325032045),
 ('stemming_english_words', 0.7514657150140199), ('replace_common_words_using_fuzzy', 0.7523716061498201),
 ('replace_profane_words_using_fuzzy', 0.7534003119219469), ('remove_rare_words', 0.7552609656271506), 
 ('lemmatize_english_words', 0.7553657388975352)]


## XGBoost

In [28]:
transformations = [ ['raw'],  
                    ['convert_to_lower'],
                    ['remove_whitespaces'], 
                    ['remove_leaky'], 
                    ['replace_abbreviation_words'],
                    ['strip_non_printable_chars'],
                    ['replace_acronyms'],
                    ['remove_stopwords'],
                    ['remove_rare_words'],
                    ['remove_non_alphanumeric'],
                    ['remove_non_alphabet_words'],
                    ['remove_words_containing_non_alphabets'],
                    ['black_listed_words_regex_mapping'],
                    ['check_if_proper_name_place_or_ethnicity'],
                    ['replace_profane_words_using_fuzzy'],
                    ['replace_common_words_using_fuzzy'],
                    ['lemmatize_english_words'],
                    ['stemming_english_words'],
                    ['extract_info_from_url']
                  ]

In [38]:
pp_steps = [transformations[0]]
base_res, X = call_xgboost_to_recursive_transformation_Addition(pp_steps, word_dist_dict_most_common,train_data)
#Get the f1_score
tr_scores = []
tr_scores.append((transformations[0][0], base_res[transformations[0][0]][0]))
max_f1 = base_res[transformations[0][0]][0]
#remove the raw transformation
transformations.remove(pp_steps[0])

raw


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.5726540161164544
Log Loss:  2.134920946619718
Accuracy:  0.9381877428249153
AUC score:  0.9196245355556603
Num of comments missclassified:  9864
[[143107    243]
 [  9621   6609]]
             precision    recall  f1-score   support

          0    0.93701   0.99830   0.96668    143350
          1    0.96454   0.40721   0.57265     16230

avg / total    0.93981   0.93819   0.92661    159580



In [39]:
train_data_copy = train_data.copy()
word_dist_dict_most_common_copy = word_dist_dict_most_common

In [40]:
#Loop through all the transformations to see which one gives the maximum boost to f1_score
while(True):
    tr_iter_score = []
    for tr in transformations:
        pp_steps = [tr]
        base_res, X = call_xgboost_to_recursive_transformation_Addition(pp_steps, word_dist_dict_most_common_copy,train_data_copy)
        tr_iter_score.append((pp_steps[0][0], base_res))
    key_neg_f1 = []
    for k in tr_iter_score:
        key_neg_f1.append((k[0], k[1][k[0]][0]))
    ordered = sorted(key_neg_f1,key=itemgetter(1), reverse=True)
    print(ordered)
    if max_f1 >= ordered[0][1]:
        print ('Any other transformation not needed')
        break
    # Add transformation into the    
    tr_scores.append(ordered[0])
    max_f1 = ordered[0][1]
    transformations.remove([ordered[0][0]])
    train_data_copy['comment_text'] = X
    word_dist_dict_most_common_copy = get_words_dict_most_common(train_data_copy)
    print(transformations)
    print ('*******************')
    print (tr_scores)

convert_to_lower


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.5756802353246528
Log Loss:  2.1230169442529148
Accuracy:  0.9385323975435518
AUC score:  0.9195745832761139
Num of comments missclassified:  9809
[[143117    233]
 [  9576   6654]]
             precision    recall  f1-score   support

          0    0.93729   0.99837   0.96687    143350
          1    0.96617   0.40998   0.57568     16230

avg / total    0.94022   0.93853   0.92708    159580

remove_whitespaces


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ...



 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.48238557558945916
Log Loss:  2.423212726179658
Accuracy:  0.9298408321844843
AUC score:  0.8866361176246325
Num of comments missclassified:  11196
[[143167    183]
 [ 11013   5217]]
             precision    recall  f1-score   support

          0    0.92857   0.99872   0.96237    143350
          1    0.96611   0.32144   0.48239     16230

avg / total    0.93239   0.92984   0.91355    159580

black_listed_words_regex_mapping
Done  100000
Done  200000
Done  300000
Done  400000
Done  500000


 *****Processing fold  0  of  10  ......


 *****Processing 

[['convert_to_lower'], ['remove_whitespaces'], ['remove_leaky'], ['replace_abbreviation_words'], ['strip_non_printable_chars'], ['replace_acronyms'], ['remove_stopwords'], ['remove_rare_words'], ['remove_non_alphanumeric'], ['remove_non_alphabet_words'], ['remove_words_containing_non_alphabets'], ['check_if_proper_name_place_or_ethnicity'], ['replace_profane_words_using_fuzzy'], ['replace_common_words_using_fuzzy'], ['lemmatize_english_words'], ['stemming_english_words'], ['extract_info_from_url']]
*******************
[('raw', 0.5726540161164544), ('black_listed_words_regex_mapping', 0.6275640497569817)]
convert_to_lower


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8 



 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ......


 *****Processing fold  3  of  10  ......


 *****Processing fold  4  of  10  ......


 *****Processing fold  5  of  10  ......


 *****Processing fold  6  of  10  ......


 *****Processing fold  7  of  10  ......


 *****Processing fold  8  of  10  ......


 *****Processing fold  9  of  10  ......
F1-score:  0.5701133452034568
Log Loss:  2.142496138860898
Accuracy:  0.937968417094874
AUC score:  0.9167192788699073
Num of comments missclassified:  9899
[[143117    233]
 [  9666   6564]]
             precision    recall  f1-score   support

          0    0.93673   0.99837   0.96657    143350
          1    0.96572   0.40444   0.57011     16230

avg / total    0.93968   0.93797   0.92625    159580

remove_words_containing_non_alphabets


 *****Processing fold  0  of  10  ......


 *****Processing fold  1  of  10  ......


 *****Processing fold  2  of  10  ...

In [None]:
#XGBoost
#[('raw', 0.5726540161164544), ('black_listed_words_regex_mapping', 0.6275640497569817)]

# FastText

In [None]:

def call_fasttext_to_recursive_transformation_Addition(pp_steps, word_dist_dict_most_common,train_data):
    combined_results = {}
    for pp_step in pp_steps:
        if len(pp_step) == 1:
            new_mapped_dict = get_corresponding_mapping(word_dist_dict_most_common, pp_step[0])
        else:
            new_mapped_dict = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_step)
        X = train_data['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
        results = []
        k_fold_num = 0
        cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
        for train_index, test_index in cv.split(X, y):
            print ("\n\n *****Processing fold ", k_fold_num, " of ", cv.n_splits, " ......")
            X_train_data, X_test_data = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            texts_train = X_train_data.values
            texts_test  = X_test_data.values
            train = X.copy()
            X_train, X_test, embedding_matrix = preprocess_data_for_fasttext(texts_train, texts_test, train)
            res = call_fasttext_algorithm(X_train, y_train, X_test, y_test, embedding_matrix)
            results.append(res)
            k_fold_num += 1

    #         if k_fold_num ==1:
    #             break
        scores = extract_combined_results(results)
        combined_results[' '.join(pp_step)] = scores
        pickle.dump(combined_results, open("../data/individual_fasttext.pkl", "wb"))
    return combined_results, X

In [25]:
transformations = [ 
                    ['raw'],  
#                     ['convert_to_lower'],
#                     ['remove_whitespaces'], 
#                     ['remove_leaky'], 
#                     ['replace_abbreviation_words'],
#                     ['strip_non_printable_chars'],
#                     ['replace_acronyms'],
#                     ['remove_stopwords'],
#                     ['remove_rare_words'],
#                     ['remove_non_alphanumeric'],
#                     ['remove_non_alphabet_words'],
#                     ['remove_words_containing_non_alphabets'],
#                     ['black_listed_words_regex_mapping'],
#                     ['check_if_proper_name_place_or_ethnicity'],
#                     ['replace_profane_words_using_fuzzy'],
#                     ['replace_common_words_using_fuzzy'],
#                     ['lemmatize_english_words'],
#                     ['stemming_english_words'],
#                     ['extract_info_from_url']
                  ]

In [26]:
# tr_scores = [
# ('raw', 0.7889988850221306), 
# ('convert_to_lower', 0.796382849014428), 
# ('remove_whitespaces',  0.788676182101062), 
# ('remove_leaky', 0.7953629698130656), 
# ('replace_abbreviation_words', 0.7906082380325877), 
# ('strip_non_printable_chars',  0.7873755274261603), 
# ('replace_acronyms', 0.7863236197365726), 
# ('remove_stopwords', 0.780157298015049), 
# ('remove_rare_words', 0.7765953852910374), 
# ('remove_non_alphanumeric',  0.7963456157502613), 
# ('remove_non_alphabet_words', 0.8008337816030123), 
# ('remove_words_containing_non_alphabets', 0.7182936479389972),
# ('black_listed_words_regex_mapping', 0.8001469753148278),
# ('check_if_proper_name_place_or_ethnicity', 0.8040536079751767), 
# ('replace_profane_words_using_fuzzy', 0.8038622842094997), 
# ('replace_common_words_using_fuzzy', 0.8066558387749014), 
# ('lemmatize_english_words',0.7952955287532485),
# ('stemming_english_words',0.806138554014574),
# ('extract_info_from_url',  0.7914496380978151)
# ]


tr_scores = [
 ('remove_non_alphabet_words', 0.8134880205081013),
 ('remove_non_alphanumeric', 0.8120045300113249),
 ('convert_to_lower', 0.8080052927555409),
 ('stemming_english_words', 0.806609947643979),
 ('remove_whitespaces', 0.8063916018869177),
 ('remove_leaky', 0.8051792030968432),
 ('replace_profane_words_using_fuzzy', 0.805108455942859),
 ('lemmatize_english_words', 0.8043908825405351),
 ('remove_stopwords', 0.8040699191234021),
 ('extract_info_from_url', 0.8032797858099063),
 ('black_listed_words_regex_mapping', 0.8031000563044414),
 ('strip_non_printable_chars', 0.8030237724213388),
 ('replace_abbreviation_words', 0.8023512123438649),
 ('replace_acronyms', 0.8018030139935414),
 ('check_if_proper_name_place_or_ethnicity', 0.801067907995619),
 ('remove_rare_words', 0.7990749557883281),
 ('remove_words_containing_non_alphabets', 0.7437213819897321)]

max_f1 = max( [x[1] for x in tr_scores])
max_f1

0.8134880205081013

In [30]:
pp_steps = [transformations[0]]
base_res, X = call_fasttext_to_recursive_transformation_Addition(pp_steps, word_dist_dict_most_common,train_data)
#Get the f1_score
tr_scores = []
tr_scores.append((transformations[0][0], base_res[transformations[0][0]][0]))
max_f1 = base_res[transformations[0][0]][0]
#remove the raw transformation
transformations.remove(pp_steps[0])   

raw


 *****Processing fold  0  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  1  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  2  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  3  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  4  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  5  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  6  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  7  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  8  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  9  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3
F1-score:  0.7825343051705849
Log Loss:  1.3754553864021253
Accuracy:  0.960176713873919
AUC score:  0.9776388035522672
Num of comments missclassified:  6355
[[141791   1559]
 [  4796  11434]]
             precision    recall  f1-score   support

          0    0.96

In [31]:
train_data_copy = train_data.copy()
word_dist_dict_most_common_copy = word_dist_dict_most_common

In [37]:
# pp_steps = ['replace_common_words_using_fuzzy']
# new_mapped_dict = get_corresponding_mapping_multiple(word_dist_dict_most_common, pp_steps)
# X = train_data_copy['comment_text'].apply(replace_words_from_a_mapping_no_check, args = [new_mapped_dict, 0])
# train_data_copy['comment_text'] = X
# word_dist_dict_most_common_copy = get_words_dict_most_common(train_data_copy)

In [39]:
#Loop through all the transformations to see which one gives the maximum boost to f1_score
while(True):
    tr_iter_score = []
    for tr in transformations:
        pp_steps = [tr]
        base_res, X = call_fasttext_to_recursive_transformation_Addition(pp_steps, word_dist_dict_most_common_copy,train_data_copy)
        tr_iter_score.append((pp_steps[0][0], base_res))
    key_neg_f1 = []
    for k in tr_iter_score:
        key_neg_f1.append((k[0], k[1][k[0]][0]))
    ordered = sorted(key_neg_f1,key=itemgetter(1), reverse=True)
    print(ordered)
    if max_f1 >= ordered[0][1]:
        print ('Any other transformation not needed')
        break
    # Add transformation into the    
    tr_scores.append(ordered[0])
    max_f1 = ordered[0][1]
    transformations.remove([ordered[0][0]])
    train_data_copy['comment_text'] = X
    word_dist_dict_most_common_copy = get_words_dict_most_common(train_data_copy)
    print(transformations)
    print ('*******************')
    print (tr_scores)

black_listed_words_regex_mapping
Done  100000
Done  200000
Done  300000


 *****Processing fold  0  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  1  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  2  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  3  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  4  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  5  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  6  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  7  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  8  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  9  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3
F1-score:  0.8031000563044414
Log Loss:  1.2867182361663008
Accuracy:  0.9627459581401178
AUC score:  0.9783840910902978
Num of comments missclassified:  5945
[[141511   1839]
 [  4106  12124]]
    

Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  2  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  3  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  4  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  5  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  6  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  7  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  8  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3


 *****Processing fold  9  of  10  ......
Epoch 1/3
Epoch 2/3
Epoch 3/3
F1-score:  0.806609947643979
Log Loss:  1.2791438255844705
Accuracy:  0.9629652838701591
AUC score:  0.9785092794308189
Num of comments missclassified:  5910
[[141345   2005]
 [  3905  12325]]
             precision    recall  f1-score   support

          0    0.97312   0.98601   0.97952    143350
          1    0.86008   0.75940   0.80661     16230

avg / total    0.96162   0.96297

In [41]:
z = tr_scores + [('stemming_english_words', 0.806609947643979), ('replace_profane_words_using_fuzzy', 0.805108455942859), ('lemmatize_english_words', 0.8043908825405351), ('extract_info_from_url', 0.8032797858099063), ('black_listed_words_regex_mapping', 0.8031000563044414), ('check_if_proper_name_place_or_ethnicity', 0.801067907995619)]

In [43]:
sorted(z,key=itemgetter(1), reverse=True)

[('remove_non_alphabet_words', 0.8134880205081013),
 ('remove_non_alphanumeric', 0.8120045300113249),
 ('convert_to_lower', 0.8080052927555409),
 ('stemming_english_words', 0.806609947643979),
 ('remove_whitespaces', 0.8063916018869177),
 ('remove_leaky', 0.8051792030968432),
 ('replace_profane_words_using_fuzzy', 0.805108455942859),
 ('lemmatize_english_words', 0.8043908825405351),
 ('remove_stopwords', 0.8040699191234021),
 ('extract_info_from_url', 0.8032797858099063),
 ('black_listed_words_regex_mapping', 0.8031000563044414),
 ('strip_non_printable_chars', 0.8030237724213388),
 ('replace_abbreviation_words', 0.8023512123438649),
 ('replace_acronyms', 0.8018030139935414),
 ('check_if_proper_name_place_or_ethnicity', 0.801067907995619),
 ('remove_rare_words', 0.7990749557883281),
 ('remove_words_containing_non_alphabets', 0.7437213819897321)]

In [None]:
#FastText
[('raw', 0.7825343051705849), ('replace_common_words_using_fuzzy', 0.8066558387749014), ('remove_non_alphabet_words', 0.8134880205081013)]
