# A sandbox to test code, post ideas, etc. 

#### Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle

#from sklearn's...
from sklearn.feature_extraction.text import CountVectorizer #Convert a collection of text documents to a matrix of token counts.
from sklearn.feature_extraction.text import TfidfTransformer #Transform a count matrix to a normalized tf or tf-idf representation
from sklearn.feature_extraction.text import TfidfVectorizer #Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import ComplementNB


*We have three datasets we are using; training (**train_ds**), testing (**test_ds**), and testing labels (**test_labels_ds**)*

#### Load training and test dataframes from file; preprocessing completed in 'text_preprocessing' notebook

In [2]:
file_object = open('clean_data.p', 'rb')
clean_data = pickle.load(file_object)
train_ds = clean_data[0]
test_ds = clean_data[1]

In [3]:
train_ds.head(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edit make username hardcore metall...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww match background colour im seemingly stic...,0,0,0,0,0,0
2,000113f07ec002fd,hey man im really try edit war guy constantly ...,0,0,0,0,0,0


In [4]:
test_ds.head(3)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule succesful youll ever whats ha...
1,0000247867823ef7,rfc title fine imo
2,00013b17ad220c46,source zawe ashton lapland —


In [5]:
test_labels = pd.read_csv('test_labels.csv', header = 0)
test_labels.shape

(153164, 7)

In [6]:
test_labels.head(2)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1


## Subsets of data

Subset1: Create a subsample of data for quickened runtime with only 'toxic' as labels(Subset)

In [7]:
#subset of testing data; removed the id column
train_ds_subset = train_ds.loc[1:1000,'comment_text']
train_labels_subset = train_ds.loc[1:1000,'toxic']

#subset of testing data; removed the id column
test_ds_subset = test_ds.loc[1:1000,'comment_text']
test_labels_subset = test_labels.loc[1:1000,'toxic']

Subset2: Create a subsample of data with full training set and test set instances with -1's were dropped. Only used 'toxic' labels

In [8]:
#subset of testing data; removed the id column
train_ds_subset2 = train_ds.loc[:,'comment_text']
train_labels_subset2 = train_ds.loc[:,'toxic']

#remove -1s
non_null_data = test_labels[test_labels.loc[:,'toxic'] != -1]
#subset of testing data; removed the id column
test_ds_subset2 =test_ds.loc[non_null_data.index]
test_ds_subset2 = test_ds_subset2.loc[:,'comment_text']
test_labels_subset2 = non_null_data.loc[:,'toxic']
print(test_ds_subset2.size)
print(test_labels_subset2.size)

63978
63978


In [9]:
train_ds.head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edit make username hardcore metall...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww match background colour im seemingly stic...,0,0,0,0,0,0


In [10]:
train_ds.loc[:,'toxic':'identity_hate'].sum(axis =1).unique()

array([0, 4, 1, 3, 2, 5, 6], dtype=int64)

In [None]:
#Could convert the comment labels to one label

In [11]:
train_ds[(train_ds['toxic'] == 1) & (train_ds['identity_hate'] == 1)]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
42,001810bf8c45bf5f,gay antisemmitian archangel white tiger meow g...,1,0,1,0,1,1
105,00472b8e2d38d1ea,pair jewhating weiner nazi schmuck,1,0,1,0,1,1
176,006b94add72ed61c,think fagget get oife burn hell hate sorry can...,1,0,1,1,1,1
218,008e0818dde894fb,kill nigger hard others say include racist som...,1,0,1,0,1,1
238,0097dd5c29bf7a15,u r tw fuck u gay boyu r smellyfuck ur mum poopie,1,0,1,0,1,1
...,...,...,...,...,...,...,...,...
159281,fb726deec64157bd,lol youre gay never know good feel fuck woman as,1,1,1,0,1,1
159336,fc3efa2f6f025f6d,oh fuck pansy jew would whine bnai brith beat ...,1,0,1,0,1,1
159400,fd052883fa6a8697,shalom semite get fuck kill son bitch dont lea...,1,1,1,1,1,1
159449,fdce660ddcd6d7ca,think gay fag,1,0,0,0,0,1


### Sample how we did it in homework just on 5 comments as corpus

##### Get term count matrix along with terms array

In [12]:
corpus = train_ds['comment_text']
c1 = corpus.head(5)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(c1)
terms = vectorizer.get_feature_names_out()
terms

array(['accident', 'actual', 'appear', 'article', 'background', 'backlog',
       'cant', 'care', 'chance', 'closure', 'colour', 'constantly',
       'date', 'daww', 'delay', 'doll', 'dont', 'edit', 'eg', 'else',
       'etc', 'exact', 'explanation', 'fac', 'fan', 'first', 'form',
       'format', 'gas', 'guess', 'guy', 'hardcore', 'hero', 'hey', 'ie',
       'im', 'improvement', 'info', 'information', 'instead', 'january',
       'know', 'later', 'let', 'list', 'make', 'man', 'match', 'may',
       'metallica', 'need', 'new', 'noone', 'page', 'please',
       'preference', 'real', 'really', 'reference', 'relevant',
       'remember', 'remove', 'retire', 'revert', 'review', 'reviewer',
       'section', 'seem', 'seemingly', 'since', 'sir', 'statistic',
       'stick', 'style', 'subsection', 'suggestion', 'talk', 'template',
       'thank', 'thats', 'think', 'tidy', 'try', 'turn', 'type',
       'username', 'utc', 'vandalism', 'vote', 'want', 'war', 'werent',
       'wikipediagoodarticl

In [13]:
X = pd.DataFrame(X.toarray())
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,85,86,87,88,89,90,91,92,93,94
0,0,0,0,0,0,0,0,0,0,1,...,1,0,1,1,0,0,1,0,0,1
1,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,1,0,1,1,0,1,1,0,0,0,...,0,0,0,0,1,0,0,1,1,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


#### Get TFIDF matrix as dataframe

In [14]:
tfidfTransform = TfidfTransformer()
X_tfidf = tfidfTransform.fit_transform(X)
X_tfidf = pd.DataFrame(X_tfidf.toarray())
X_tfidf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,85,86,87,88,89,90,91,92,93,94
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.208745,...,0.208745,0.0,0.208745,0.208745,0.0,0.0,0.208745,0.0,0.0,0.208745
1,0.0,0.0,0.0,0.0,0.317869,0.0,0.0,0.0,0.0,0.0,...,0.0,0.317869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.222236,0.0,0.0,0.0,0.0,0.0,0.222236,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.222236,0.0,0.0,0.0,0.0
3,0.132673,0.0,0.132673,0.132673,0.0,0.132673,0.132673,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.132673,0.0,0.0,0.132673,0.132673,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.428411,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Using Pipelines for analysis

#### Parameter dictionaries


* max_df: ignore terms that have a document frquency strictly higher than the given threshold  
* min_df: ignore terms that have a document frequency strictly lower than the given threshold

In [15]:
# parameters for CountVectorizer()
countvect_params = {
    'count__min_df': [1, 3, 10, 20], 
    'count__max_df': [0.6, 0.8, 0.9, 1.0]
    
}

# parameters for TfidfVectorizer()
vectorizer_params = {
    'vect__min_df': [1, 3, 10, 20], 
    'vect__max_df': [0.6, 0.8, 0.9, 1.0],
    'vect__norm': ['l1','l2']
}

#### Pipelines

In [16]:
# pipeline using term counts and complimentary Naive Bayes
pipe_CountVect_CompNB = Pipeline([
    ('count', CountVectorizer()),
    ('clf', ComplementNB())
    ])

# pipeline using Tfidf and complimentary Naive Bayes
pipe_TfidfVect_CompNB = Pipeline([
    ('vect', TfidfVectorizer()),
    ("clf", ComplementNB())
    ])

### Subset 2 run; using Term Counts, full instances, minus -1 test; using only 'toxic' comments

#### Grid Search/ Randomized Search using TF_IDF

In [17]:
random_search = RandomizedSearchCV(
    estimator = pipe_CountVect_CompNB,
    param_distributions = countvect_params,
    n_iter = 10,
    random_state = 123,
    n_jobs = 2,
    verbose = 2
)

In [152]:
random_search.fit(train_ds_subset2,train_labels_subset2)

best_parameters = random_search.best_estimator_.get_params()
for param_name in sorted(countvect_params.keys()):
    print(f"{param_name}: {best_parameters[param_name]}")

test_accuracy = random_search.score(test_ds_subset2, test_labels_subset2)
print(
    "Accuracy of the best parameters using the inner CV of "
    f"the random search: {random_search.best_score_:.3f}"
)
print(f"Accuracy on test set: {test_accuracy:.3f}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
count__max_df: 0.8
count__min_df: 1
Accuracy of the best parameters using the inner CV of the random search: 0.936
Accuracy on test set: 0.888


In [157]:
best_parameters = random_search.best_estimator_.get_params()
for param_name in sorted(countvect_params.keys()):
    print(f"{param_name}: {best_parameters[param_name]}")

test_accuracy = random_search.score(test_ds_subset2, test_labels_subset2)
print(
    "Accuracy of the best parameters using the inner CV of "
    f"the random search: {random_search.best_score_:.3f}"
)
print(f"Accuracy on test set: {test_accuracy:.3f}")

count__max_df: 0.8
count__min_df: 1
Accuracy of the best parameters using the inner CV of the random search: 0.936
Accuracy on test set: 0.888


### Subset 2 run; using TFIDF, full instances, minus -1 test; using only 'toxic' comments

#### Grid Search/ Randomized Search using TF_IDF

In [39]:
random_search = RandomizedSearchCV(
    estimator = pipe_TfidfVect,
    param_distributions = vectorizer_params,
    n_iter = 10,
    random_state = 123,
    n_jobs = 2,
    verbose = 2
)

### Reference dictionaries

In [18]:
#dictionary of different subsets of the data
subsets = {
    'subset1': [train_ds_subset, train_labels_subset, test_ds_subset, test_labels_subset],
        # filter: only 1000 instances per df
        # filter: only 'toxic' labels used
    'subset2': [train_ds_subset2, train_labels_subset2, test_ds_subset2, test_labels_subset2]
        # filter: labels with -1 were removed from test set
        # filter: only 'toxic' labels used
}

#list of parameter dictionaries
parameters = [
    countvect_params,
    vectorizer_params
    ]

#list of different pipelines
pipelines = [
    pipe_CountVect_CompNB,
    pipe_TfidfVect_CompNB
    ]

#pipeline with parameter dictionary
pipe_parm_dict = {
    'CountVect_CompNB': [pipe_CountVect_CompNB, countvect_params],
    'TfidfVect_CompNB': [pipe_TfidfVect_CompNB, vectorizer_params]
}

In [19]:
def run_on_subset(pipe_params, subset):
    'perform the pipeline(transform curpus to matrix and use to make model, use on test data) using the parameter dictionary on the data'

    pipe, parameters = pipe_params
    train_data, train_labels, test_data, test_labels = subset
    
    random_search = RandomizedSearchCV(
        estimator = pipe,
        param_distributions = parameters,
        n_iter = 3,
        random_state = 123,
        n_jobs = 2,
        verbose = 2   
    )
    print('Pipeline = {}'.format(pipe))
    random_search.fit(train_data,train_labels)

    best_parameters = random_search.best_estimator_.get_params()
    print('Best parameters = ')
    for param_name in sorted(parameters.keys()):
        print(f"{param_name}: {best_parameters[param_name]}")

    test_accuracy = random_search.score(test_data, test_labels)
    print(
        "Accuracy of the best parameters using the inner CV of "
        f"the random search: {random_search.best_score_:.3f}"
    )
    print(f"Accuracy on test set: {test_accuracy:.3f}")
    return
    

In [20]:
run_on_subset(pipe_parm_dict['CountVect_CompNB'], subsets['subset1'])

Pipeline = Pipeline(steps=[('count', CountVectorizer()), ('clf', ComplementNB())])
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best parameters = 
count__max_df: 0.8
count__min_df: 1
Accuracy of the best parameters using the inner CV of the random search: 0.904
Accuracy on test set: 0.390


In [21]:
run_on_subset(pipe_parm_dict['TfidfVect_CompNB'], subsets['subset1'])

Pipeline = Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', ComplementNB())])
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best parameters = 
vect__max_df: 0.6
vect__min_df: 10
vect__norm: l2
Accuracy of the best parameters using the inner CV of the random search: 0.808
Accuracy on test set: 0.332


In [22]:
run_on_subset(pipe_parm_dict['CountVect_CompNB'], subsets['subset2'])

Pipeline = Pipeline(steps=[('count', CountVectorizer()), ('clf', ComplementNB())])
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best parameters = 
count__max_df: 0.8
count__min_df: 1
Accuracy of the best parameters using the inner CV of the random search: 0.936
Accuracy on test set: 0.888
