# BBC Text MultiClass Classification

In [None]:
# load packages
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer,HashingVectorizer
from sklearn import decomposition, ensemble
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import pandas, xgboost, numpy, string
import pandas as pd

In [2]:
# load the dataset
# trainDF = pd.read_csv('../input/bbc-text.csv') # encoding = "latin"
other_stop_w = pd.read_csv('words_shared_by_all.csv')
import re
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)     
    string = re.sub(r",", " ", string) 
    string = re.sub(r"!", " ", string) 
    string = re.sub(r"\(", " ", string) 
    string = re.sub(r"\)", " ", string) 
    string = re.sub(r"\?", " ", string) 
    string = re.sub(r"\s{2,}", " ", string)   
    string = re.sub(" \d+", " ", string)
    return  string.strip().lower()

TRAIN_FILEPATH = "../Translated/cleaned/train.csv"
TEST_FILEPATH = "../Translated/cleaned/test.csv"
SS_FILEPATH = "../data/SampleSubmission.csv"
VECTORS_FILEPATH = ""
trainDF = pd.read_csv(TRAIN_FILEPATH)
test = pd.read_csv(TEST_FILEPATH)
ss = pd.read_csv(SS_FILEPATH)
trainDF["Text"] =trainDF.Text.apply(lambda x: clean_str(x))
test["Text"] =test.Text.apply(lambda x: clean_str(x))
import tqdm
stopw = [item for sublist in other_stop_w.values.tolist() for item in sublist]
trainDF['Text'].apply(lambda x: [item for item in x.split() if item not in stopw]) 
test['Text'].apply(lambda x: [item for item in x.split() if item not in stopw]) 

0      [abambo, odzikhweza, akuchuluka, kafukufuku, a...
1      [ambuye, ziyaye, ayamikira, aphunzitsi, tilito...
2      [anatcheleza, akundiopseza, gogo, wanga, akund...
3      [ulova, wafika, posauzana, adatenga, digiri, u...
4      [dzombe, kukoma, kuyambira, makedzana, panthaw...
                             ...                        
615    [kanyongolo, wapempha, oyimira, milandu, atsat...
616    [amandimenya, zikomo, gogo, ndine, mtsikana, z...
617    [apolisi, athotha, gulu, myp, asilikali, gulu,...
618    [mwambo, ukwati, chitonga, mtundu, wina, uliwo...
619    [mwapasa, autsa, mapiri, kusamvana, pakati, ap...
Name: Text, Length: 620, dtype: object

In [3]:
# trainDF.head(10)
# [other_stop_w.values.tolist()]
# pd.read_csv(TEST_FILEPATH).Text[0]
# trainDF

In [4]:
# trainDF.shape

In [5]:
# trainDF['Label'].unique()

In [6]:
# trainDF['Label'].value_counts()

In [7]:
# sns.set(rc={'figure.figsize':(10,10)})
# sns.countplot(trainDF['Label'])

## Data preparation

In [10]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['Text'], trainDF['Label'],random_state = 0)

train_labels = train_y
valid_labels = valid_y
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
encoder.fit(trainDF['Label'])
train_y = encoder.transform(train_y)
valid_y = encoder.transform(valid_y)

In [11]:
# train_x

## Feature Extraction

### Count Vectors

In [12]:
# Count Vectors as features
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(pd.concat([trainDF['Text'], test['Text']],axis = 0, ignore_index = True))

# transform the training and validation data using count vectorizer object
xtest_count = count_vect.transform(test['Text'])
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [13]:
# # plot the train features
# pca = PCA(n_components=2).fit(xtrain_count.toarray())
# data2D = pca.transform(xtrain_count.toarray())
# cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
# ax = sns.scatterplot(data2D[:,0], data2D[:,1],
# hue=train_labels.tolist(),size=train_labels.tolist(),palette="husl")

In [14]:
# # plot the validation features
# pca = PCA(n_components=2).fit(xvalid_count.toarray())
# data2D = pca.transform(xvalid_count.toarray())
# cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
# ax = sns.scatterplot(data2D[:,0], data2D[:,1],
# hue=valid_labels.tolist(),size=valid_labels.tolist(),palette="husl")

###  TF-IDF Vectors

 TF-IDF Vectors as features
 
 a. Word Level TF-IDF : Matrix representing tf-idf scores of every term in different documents
 
 b. N-gram Level TF-IDF : N-grams are the combination of N terms together. This Matrix representing tf-idf scores  of N-grams
 
 c. Character Level TF-IDF : Matrix representing tf-idf scores of character level n-grams in the corpus

### word level tf-idf

In [15]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=1800)
tfidf_vect.fit(pd.concat([trainDF['Text'], test['Text']],axis = 0, ignore_index = True))

xtest_tfidf =  tfidf_vect.transform(test['Text'])

xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [16]:
# plot the train features
# pca = PCA(n_components=2).fit(xtrain_tfidf.toarray())
# data2D = pca.transform(xtrain_tfidf.toarray())
# cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
# ax = sns.scatterplot(data2D[:,0], data2D[:,1],
# hue=train_labels.tolist(),size=train_labels.tolist(),palette="husl")

In [17]:
# plot the validation features
# pca = PCA(n_components=2).fit(xvalid_tfidf.toarray())
# data2D = pca.transform(xvalid_tfidf.toarray())
# cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
# ax = sns.scatterplot(data2D[:,0], data2D[:,1],
# hue=valid_labels.tolist(),size=valid_labels.tolist(),palette="husl")
# test_x

### ngram level tf-idf 

In [18]:
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,4), max_features=1800)
tfidf_vect_ngram.fit(pd.concat([trainDF['Text'], test['Text']],axis = 0, ignore_index = True))
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test['Text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

In [19]:
# # plot the train features
# pca = PCA(n_components=2).fit(xtrain_tfidf_ngram.toarray())
# data2D = pca.transform(xtrain_tfidf_ngram.toarray())
# cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
# ax = sns.scatterplot(data2D[:,0], data2D[:,1],
# hue=train_labels.tolist(),size=train_labels.tolist(),palette="husl")

In [20]:
# plot the validation features
# pca = PCA(n_components=2).fit(xvalid_tfidf_ngram.toarray())
# data2D = pca.transform(xvalid_tfidf_ngram.toarray())
# cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
# ax = sns.scatterplot(data2D[:,0], data2D[:,1],
# hue=valid_labels.tolist(),size=valid_labels.tolist(),palette="husl")

### characters level tf-idf

In [21]:
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=1800)
tfidf_vect_ngram_chars.fit(pd.concat([trainDF['Text'], test['Text']],axis = 0, ignore_index = True))

xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test['Text']) 

xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 



In [22]:
# plot the train features
# pca = PCA(n_components=2).fit(xtrain_tfidf_ngram_chars.toarray())
# data2D = pca.transform(xtrain_tfidf_ngram_chars.toarray())
# cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
# ax = sns.scatterplot(data2D[:,0], data2D[:,1],
# hue=train_labels.tolist(),size=train_labels.tolist(),palette="husl")

In [23]:
# plot the validation features
# pca = PCA(n_components=2).fit(xvalid_tfidf_ngram_chars.toarray())
# data2D = pca.transform(xvalid_tfidf_ngram_chars.toarray())
# cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
# ax = sns.scatterplot(data2D[:,0], data2D[:,1],
# hue=valid_labels.tolist(),size=valid_labels.tolist(),palette="husl")

### HashingVectorizer

In [24]:
# getting train features
hash_vectorizer = HashingVectorizer(n_features=1800)
hash_vectorizer.fit(pd.concat([trainDF['Text'], test['Text']],axis = 0, ignore_index = True))

xtest_hash_vectorizer =  hash_vectorizer.transform(test['Text'])

xtrain_hash_vectorizer =  hash_vectorizer.transform(train_x) 
xvalid_hash_vectorizer =  hash_vectorizer.transform(valid_x)

In [25]:
# plot the train features
# pca = PCA(n_components=2).fit(xtrain_hash_vectorizer.toarray())
# data2D = pca.transform(xtrain_hash_vectorizer.toarray())
# cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
# ax = sns.scatterplot(data2D[:,0], data2D[:,1],
# hue=train_labels.tolist(),size=train_labels.tolist(),palette="husl")

In [26]:
# # plot the validation features
# pca = PCA(n_components=2).fit(xvalid_hash_vectorizer.toarray())
# data2D = pca.transform(xvalid_hash_vectorizer.toarray())
# cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
# ax = sns.scatterplot(data2D[:,0], data2D[:,1],
# hue=valid_labels.tolist(),size=valid_labels.tolist(),palette="husl")

## Model Building

In [27]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return classifier, metrics.accuracy_score(predictions, valid_y)

### Naive Bayes

In [28]:
# Naive Bayes on Count Vectors

classifierCV, accuracy = train_model(naive_bayes.ComplementNB(), xtrain_count, train_y, xvalid_count)
print("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
classifierW, accuracy = train_model(naive_bayes.ComplementNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
classifierNgram, accuracy = train_model(naive_bayes.ComplementNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
classifierChar, accuracy = train_model(naive_bayes.ComplementNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("NB, CharLevel Vectors: ", accuracy)

NB, Count Vectors:  0.5933147632311978
NB, WordLevel TF-IDF:  0.6016713091922006
NB, N-Gram Vectors:  0.520891364902507
NB, CharLevel Vectors:  0.5738161559888579


In [31]:
from scipy import stats
def Voting(model1,model2, model3, model4,model5,model6,model7,model8,
           model11,model12, model13, model14,model15,model16,model17,model18,
           x_model1, x_model2, x_model3, x_model4, y=None ,hard = True):
    if hard :
        pred1 = model1.predict(x_model1)
        pred2 = model2.predict(x_model2)
        pred3 = model3.predict(x_model3)
        pred4 = model4.predict(x_model4)
        pred5 = model5.predict(x_model1)
        pred6 = model6.predict(x_model2)
        pred7 = model7.predict(x_model3)
        pred8 = model8.predict(x_model4)
        pred11 = model11.predict(x_model1)
        pred12 = model12.predict(x_model2)
        pred13 = model13.predict(x_model3)
        pred14 = model14.predict(x_model4)
        pred15 = model15.predict(x_model1)
        pred16 = model16.predict(x_model2)
        pred17 = model17.predict(x_model3)
        pred18 = model18.predict(x_model4)
        return stats.mode(([pred1,pred2, pred3 ,pred4,pred5,pred6, pred7 ,pred8,pred11,pred12, pred13 ,pred14,pred15,pred16, pred17 ,pred18])).mode.reshape(-1,1)
    else:
        pred1 = model1.predict_proba(x_model1)
        pred2 = model2.predict_proba(x_model2)
        pred3 = model3.predict_proba(x_model3)
        pred4 = model4.predict_proba(x_model4)
        pred5 = model5.predict_proba(x_model1)
        pred6 = model6.predict_proba(x_model2)
        pred7 = model7.predict_proba(x_model3)
        pred8 = model8.predict_proba(x_model4)
        pred11 = model11.predict_proba(x_model1)
        pred12 = model12.predict_proba(x_model2)
        pred13 = model13.predict_proba(x_model3)
        pred14 = model14.predict_proba(x_model4)
        pred15 = model15.predict_proba(x_model1)
        pred16 = model16.predict_proba(x_model2)
        pred17 = model17.predict_proba(x_model3)
        pred18 = model18.predict_proba(x_model4)
        res = np.concatenate((pred1,pred2,pred3,pred4,pred5,pred6, pred7 ,pred8, pred11,pred12, pred13 ,pred14,pred15,pred16, pred17 ,pred18),axis = 1)
        return res

In [41]:
# encoder.inverse_transform(classifierCV.predict_proba(xtrain_count).argmax(axis = 1))
import numpy as np
train_preds = Voting(classifierCV,classifierW, classifierNgram, classifierChar,
               SGDClassifierCV,SGDClassifierW,SGDClassifierNgram,SGDClassifierChar,
               modelCV,modelW,modelNgram,modelChar,
               RFCV,RFW,RFNgram,RFChar,
               
               xtrain_count, xtrain_tfidf,xtrain_tfidf_ngram, xtrain_tfidf_ngram_chars,
               y=None ,hard = False)
val_preds = Voting(classifierCV,classifierW, classifierNgram, classifierChar,
               SGDClassifierCV,SGDClassifierW,SGDClassifierNgram,SGDClassifierChar,
               modelCV,modelW,modelNgram,modelChar,
               RFCV,RFW,RFNgram,RFChar,
               
               xvalid_count, xvalid_tfidf,xvalid_tfidf_ngram, xvalid_tfidf_ngram_chars,
               y=None ,hard = False)

In [89]:
test_preds = Voting(classifierCV,classifierW, classifierNgram, classifierChar,
               SGDClassifierCV,SGDClassifierW,SGDClassifierNgram,SGDClassifierChar,
               modelCV,modelW,modelNgram,modelChar,
               RFCV,RFW,RFNgram,RFChar,
               
               xtest_count, xtest_tfidf,xtest_tfidf_ngram, xtest_tfidf_ngram_chars,
               y=None ,hard = False)

In [115]:
sub_preds = model.predict(new_test)

In [120]:
# lb.inverse_transform(sub_preds)
test['Label'] = lb.inverse_transform(sub_preds)
sub = test[['ID','Label']]
sub.to_csv('stack_sim.csv', index = False)

In [105]:
test_preds[0,15::20]

array([0.02665007, 0.06146957, 0.07237813, 0.05469247, 0.05      ,
       0.        , 0.        , 0.23054317, 0.68944676, 0.20993159,
       0.18384075, 0.15935537, 0.185     , 0.209     , 0.135     ,
       0.147     ])

In [108]:
new_train = train_preds[:,140:]
new_valid = val_preds[:,140:]
new_test = test_preds[:,140:]

In [107]:
new_train.shape

(1077, 180)

In [43]:
train_preds.shape[1]

320

In [109]:
#Super model

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
layers = keras.layers
models = keras.models


# This code was tested with TensorFlow v1.8
# print("You have TensorFlow version", tf.__version__)
# Build the model
from keras import backend as K 

# Do some code, e.g. train and save model

K.clear_session()
seed_value= 0

# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)

# 3. Set the `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(seed_value)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
import tensorflow as tf
tf.random.set_seed(seed_value)

model = models.Sequential()
model.add(layers.Dense(128, input_shape=(180,)))
# model.add(layers.BatchNormalization())
model.add(layers.Activation('relu'))
model.add(layers.Dropout(0.5))
# model.add(layers.Dense(128, input_shape=(320,)))
model.add(layers.BatchNormalization())
# model.add(layers.Activation('relu'))
# model.add(layers.Dropout(0.5))
model.add(layers.Dense(64))
# model.add(layers.BatchNormalization())
model.add(layers.Activation('relu'))
# model.add(layers.Dropout(0.5))

# model.add(layers.Dense(512))
# # model.add(layers.BatchNormalization())
# model.add(layers.Activation('relu'))
# model.add(layers.Dense(128))
# # model.add(layers.BatchNormalization())
# model.add(layers.Activation('relu'))

# model.add(layers.Dropout(drop_ratio))
model.add(layers.Dense(20))
model.add(layers.Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
lb.fit(trainDF['Label'])
train_bin = lb.transform(train_labels)
valid_bin = lb.transform(valid_labels)


In [110]:
# train_preds

In [111]:
history = model.fit(new_train, train_bin,
                    batch_size=16,
                    epochs=10,
                    verbose=1,
                    validation_split=0.5)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [113]:
model.evaluate(new_valid,valid_bin)



[1.6310241222381592, 0.5961002707481384]

In [None]:
# (encoder.inverse_transform(classifierNgram.predict_proba(xtrain_tfidf_ngram).argmax(axis = 1)) == encoder.inverse_transform(classifierCV.predict_proba(xtrain_count).argmax(axis = 1))).sum()/xtrain_count.shape[0]
# (preds>19).sum()

# metrics.accuracy_score((np.argmax(,axis=1)), valid_y)


In [None]:
super_model = ensemble.RandomForestClassifier(n_estimators=1000)
# super_model = xgboost.XGBClassifier()
super_model.fit(preds,train_y)

preds_s = super_model.predict(Voting(classifierCV,classifierW, classifierNgram, classifierChar,
                                     SGDClassifierCV,SGDClassifierW,SGDClassifierNgram,SGDClassifierChar,xvalid_count, xvalid_tfidf,
                       xvalid_tfidf_ngram, xvalid_tfidf_ngram_chars, y=None ,hard = True))

metrics.accuracy_score(preds_s, valid_y)

In [None]:
encoder.inverse_transform(classifierW.predict_proba(xtrain_tfidf).argmax(axis = 1))

In [None]:
encoder.inverse_transform(train_y)

In [33]:
# Naive Bayes on Count Vectors

SGDClassifierCV, accuracy = train_model(linear_model.SGDClassifier(loss = 'modified_huber', penalty = 'l1',random_state = 0), xtrain_count, train_y, xvalid_count)
print("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
SGDClassifierW,accuracy = train_model(linear_model.SGDClassifier(loss = 'modified_huber', penalty = 'l1',random_state = 0), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
SGDClassifierNgram,accuracy = train_model(linear_model.SGDClassifier(loss = 'modified_huber', penalty = 'l1',random_state = 0), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
SGDClassifierChar, accuracy = train_model(linear_model.SGDClassifier(loss = 'modified_huber', penalty = 'l1',random_state = 0), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("NB, CharLevel Vectors: ", accuracy)

NB, Count Vectors:  0.5710306406685237
NB, WordLevel TF-IDF:  0.5236768802228412
NB, N-Gram Vectors:  0.45125348189415043
NB, CharLevel Vectors:  0.5682451253481894


### Linear Classifier

In [34]:
# Linear Classifier on Count Vectors
modelCV, accuracy = train_model(linear_model.LogisticRegression(solver="lbfgs",multi_class="auto",max_iter=4000), xtrain_count, train_y, xvalid_count)
print("LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
modelW, accuracy = train_model(linear_model.LogisticRegression(solver="lbfgs",multi_class="auto",max_iter=4000), xtrain_tfidf, train_y, xvalid_tfidf)
print("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
modelNgram, accuracy = train_model(linear_model.LogisticRegression(solver="lbfgs",multi_class="auto",max_iter=4000), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LR, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
modelChar, accuracy = train_model(linear_model.LogisticRegression(solver="lbfgs",multi_class="auto",max_iter=4000), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("LR, CharLevel Vectors: ", accuracy)

# Linear Classifier on Hash Vectors
modelHash, accuracy = train_model(linear_model.LogisticRegression(solver="lbfgs",multi_class="auto",max_iter=4000), xtrain_hash_vectorizer, train_y, xvalid_hash_vectorizer)
print("LR, Hash Vectors: ", accuracy)

LR, Count Vectors:  0.5905292479108635
LR, WordLevel TF-IDF:  0.5821727019498607
LR, N-Gram Vectors:  0.4986072423398329
LR, CharLevel Vectors:  0.4986072423398329
LR, Hash Vectors:  0.5264623955431755


### RandomForestClassifier

In [36]:
# RF on Count Vectors
n_estimators = 1000
RFCV, accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=n_estimators), xtrain_count, train_y, xvalid_count)
print("RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
RFW, accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=n_estimators), xtrain_tfidf, train_y, xvalid_tfidf)
print("RF, WordLevel TF-IDF: ", accuracy)

# RF on Ngram Level TF IDF Vectors
RFNgram, accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=n_estimators), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("RF, N-Gram Vectors: ", accuracy)

# RF on Character Level TF IDF Vectors
RFChar, accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=n_estimators), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("RF, CharLevel Vectors: ", accuracy)

# RF on Hash Vectors
# accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=n_estimators), xtrain_hash_vectorizer, train_y, xvalid_hash_vectorizer)
# print("RF, Hash Vectors: ", accuracy)

RF, Count Vectors:  0.520891364902507
RF, WordLevel TF-IDF:  0.5710306406685237
RF, N-Gram Vectors:  0.45403899721448465
RF, CharLevel Vectors:  0.5626740947075209


### Extreme Gradient Boosting

In [None]:
# Extreme Gradient Boosting on Count Vector
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print("Xgb, Count Vectors: ", accuracy)

# Extreme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print("Xgb, WordLevel TF-IDF: ", accuracy)

# Extreme Gradient Boosting on Ngram Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("Xgb, N-Gram Vectors: ", accuracy)

# Extreme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print("Xgb, CharLevel Vectors: ", accuracy)

# Extreme Gradient Boosting on Hash Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_hash_vectorizer, train_y, xvalid_hash_vectorizer)
print("Xgb, Hash Vectors: ", accuracy)