# A sandbox to test code, post ideas, etc. 

#### Libraries

In [40]:
import pandas as pd
import numpy as np
import pickle

#from sklearn's...
from sklearn.feature_extraction.text import CountVectorizer #Convert a collection of text documents to a matrix of token counts.
from sklearn.feature_extraction.text import TfidfTransformer #Transform a count matrix to a normalized tf or tf-idf representation
from sklearn.feature_extraction.text import TfidfVectorizer #Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import ComplementNB


*We have three datasets we are using; training (**train_ds**), testing (**test_ds**), and testing labels (**test_labels_ds**)*

#### Load training and test dataframes from file; preprocessing completed in 'text_preprocessing' notebook

In [10]:
file_object = open('clean_data.p', 'rb')
clean_data = pickle.load(file_object)
train_ds = clean_data[0]
test_ds = clean_data[1]

In [12]:
train_ds.head(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edit make username hardcore metall...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww match background colour im seemingly stic...,0,0,0,0,0,0
2,000113f07ec002fd,hey man im really try edit war guy constantly ...,0,0,0,0,0,0


In [13]:
test_ds.head(3)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule succesful youll ever whats ha...
1,0000247867823ef7,rfc title fine imo
2,00013b17ad220c46,source zawe ashton lapland —


In [54]:
test_labels = pd.read_csv('test_labels.csv', header = 0)
test_labels.shape

(153164, 7)

In [55]:
test_labels.head(2)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1


Create a subsample of training data for quickened runtime

In [60]:
#subset of testing data; removed the id column

train_ds_subset = train_ds.loc[1:1000,'comment_text']
train_labels_subset = train_ds.loc[1:1000,'toxic']

#### Create a subsample of test data for quickened runtime

In [65]:
#subset of testing data; removed the id column

test_ds_subset = test_ds.loc[1:1000,'comment_text']
test_labels_subset = test_labels.loc[1:1000,'toxic']

#### Exploration

##### Get term count matrix along with terms array

In [33]:
corpus = train_ds['comment_text']
c1 = corpus.head(5)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(c1)
terms = vectorizer.get_feature_names_out()
terms

array(['accident', 'actual', 'appear', 'article', 'background', 'backlog',
       'cant', 'care', 'chance', 'closure', 'colour', 'constantly',
       'date', 'daww', 'delay', 'doll', 'dont', 'edit', 'eg', 'else',
       'etc', 'exact', 'explanation', 'fac', 'fan', 'first', 'form',
       'format', 'gas', 'guess', 'guy', 'hardcore', 'hero', 'hey', 'ie',
       'im', 'improvement', 'info', 'information', 'instead', 'january',
       'know', 'later', 'let', 'list', 'make', 'man', 'match', 'may',
       'metallica', 'need', 'new', 'noone', 'page', 'please',
       'preference', 'real', 'really', 'reference', 'relevant',
       'remember', 'remove', 'retire', 'revert', 'review', 'reviewer',
       'section', 'seem', 'seemingly', 'since', 'sir', 'statistic',
       'stick', 'style', 'subsection', 'suggestion', 'talk', 'template',
       'thank', 'thats', 'think', 'tidy', 'try', 'turn', 'type',
       'username', 'utc', 'vandalism', 'vote', 'want', 'war', 'werent',
       'wikipediagoodarticl

In [26]:
X = pd.DataFrame(X.toarray())
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,85,86,87,88,89,90,91,92,93,94
0,0,0,0,0,0,0,0,0,0,1,...,1,0,1,1,0,0,1,0,0,1
1,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,1,0,1,1,0,1,1,0,0,0,...,0,0,0,0,1,0,0,1,1,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


#### Get TFIDF matrix as dataframe

In [32]:
tfidfTransform = TfidfTransformer()
X_tfidf = tfidfTransform.fit_transform(X)
X_tfidf = pd.DataFrame(X_tfidf.toarray())
X_tfidf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,85,86,87,88,89,90,91,92,93,94
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.208745,...,0.208745,0.0,0.208745,0.208745,0.0,0.0,0.208745,0.0,0.0,0.208745
1,0.0,0.0,0.0,0.0,0.317869,0.0,0.0,0.0,0.0,0.0,...,0.0,0.317869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.222236,0.0,0.0,0.0,0.0,0.0,0.222236,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.222236,0.0,0.0,0.0,0.0
3,0.132673,0.0,0.132673,0.132673,0.0,0.132673,0.132673,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.132673,0.0,0.0,0.132673,0.132673,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.428411,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Parameters for CountVectorizer
* max_df: ignore terms that have a document frquency strictly higher than the given threshold  
* min_df: ignore terms that have a document frequency strictly lower than the given threshold

In [35]:
vectorizer_params = {
    'vect__min_df': [1, 3, 10, 20], 
    'vect__max_df': [0.6, 0.8, 0.9, 1.0],
    'vect__norm': ['l1','l2']
}

In [36]:
pipe_CountVect = Pipeline([
    ('count', CountVectorizer()),
    ('clf', ComplementNB())
    ])

In [37]:
pipe_TfidfVect = Pipeline([
    ('vect', TfidfVectorizer()),
    ("clf", ComplementNB())
    ])

In [39]:
random_search = RandomizedSearchCV(
    estimator = pipe_TfidfVect,
    param_distributions = vectorizer_params,
    n_iter = 10,
    random_state = 123,
    n_jobs = 2,
    verbose = 2
)

In [61]:
random_search.fit(train_ds_subset,train_labels_subset)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(estimator=Pipeline(steps=[('vect', TfidfVectorizer()),
                                             ('clf', ComplementNB())]),
                   n_jobs=2,
                   param_distributions={'vect__max_df': [0.6, 0.8, 0.9, 1.0],
                                        'vect__min_df': [1, 3, 10, 20],
                                        'vect__norm': ['l1', 'l2']},
                   random_state=123, verbose=2)

In [63]:
best_parameters = random_search.best_estimator_.get_params()
for param_name in sorted(vectorizer_params.keys()):
    print(f"{param_name}: {best_parameters[param_name]}")

vect__max_df: 0.8
vect__min_df: 1
vect__norm: l1


In [66]:
test_accuracy = random_search.score(test_ds_subset, test_labels_subset)
print(
    "Accuracy of the best parameters using the inner CV of "
    f"the random search: {random_search.best_score_:.3f}"
)
print(f"Accuracy on test set: {test_accuracy:.3f}")

Accuracy of the best parameters using the inner CV of the random search: 0.908
Accuracy on test set: 0.396
