In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import torch
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('treebank')

[nltk_data] Downloading package punkt to /Users/jackperry/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jackperry/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/jackperry/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/jackperry/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     /Users/jackperry/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [2]:
df = pd.read_csv('/Users/jackperry/Documents/GitHub/Toxic comment challenge/data/train-1.csv')

In [3]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# Randomly sample 10,000 rows and then do some pre-processing, we only want to build a model to determine whether text is a threat or not.


In [4]:
df_sample = df.sample(n=10000, random_state=1)  # random_state for reproducibility
df_selected = df_sample[['id', 'comment_text', 'threat']]

In [5]:
df_selected[20:60]

Unnamed: 0,id,comment_text,threat
125418,9ed4497c64f3841d,"""\nNo problem making 2 requests - rather, supp...",0
126234,a32be17a2ab3770b,I await my lashings..lol.. maybe they will see...,0
43054,72db634dfe4d23b6,"You must forgive my laggard pace, Splash; been...",0
43409,73de4f62fb8f0de2,"Well then, LooseTheHotButtonS, when are you go...",0
61380,a44d0b87bceaa9fc,"""\n\n Thanks \n\nThanks for reverting my talk ...",0
116638,6f88181c6e4d01f8,Bullshit! \n\nThe section used to be a neat li...,0
114966,66ca321e423ed36c,Unfamiliar word section \n\nBy what standard a...,0
39841,6a5c9833ca7e8093,Extra song? (/doesn't exist) \n\nApparently th...,0
138963,e7ab2600518a1333,Shutters - Use of Socket Covers \n\nI have rev...,0
146754,31215789bd4b1cbe,I actually went ahead and added new figures an...,0


Count % of threats to determine viability of sample.

In [6]:
# Count the number of threats
threat_count = df_selected[df_selected['threat'] == 1].shape[0]

# Calculate the percentage
total_rows = df_selected.shape[0]
threat_percentage = (threat_count / total_rows) * 100

print(f"Percentage of threats in the sample: {threat_percentage}%")

Percentage of threats in the sample: 0.29%


Insufficient %, assume > 20% threat within sample. 
Count total threats in dataset.

In [7]:
# Count the number of threats
threat_count = df[df['threat'] == 1].shape[0]

# Calculate the percentage
total_rows = df.shape[0]
threat_percentage = (threat_count / total_rows) * 100

print(f"Percentage of threats in the total dataset: {threat_percentage}%")

Percentage of threats in the total dataset: 0.2995531769557125%


In [8]:
no_threats_in_whole_dataset = len(df)*(threat_percentage/100)
no_threats_in_whole_dataset

477.99999999999994

# Break out a smaller sample size of 5,000 and include all the threats for training... assuming insufficient threats for a sample of 10,000 (only 478 positive threats)

In [9]:
# Separate the dataset into threats and non-threats
threats = df[df['threat'] == 1]
non_threats = df[df['threat'] == 0]

# Sample 478 threats and 4522 non-threats
threats_sample = threats.sample(n=478, random_state=1)  # random_state for reproducibility
non_threats_sample = non_threats.sample(n=5000-478, random_state=1)

# Combine the samples
final_sample = pd.concat([threats_sample, non_threats_sample])

# Shuffle the final sample to mix threats and non-threats
final_sample = final_sample.sample(frac=1, random_state=1).reset_index(drop=True)

In [10]:
final_sample.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,d713e9b1f6bebf36,Critical Reception \nTop Gear is an acceptable...,0,0,0,0,0,0
1,e74c40f5a9e0cea1,I'm not very fond of you pegging me for editin...,1,0,0,0,0,0
2,fdbd26ddcd879334,"""\n The section """"#Dear Michael Bednarek"""" is ...",0,0,0,0,0,0
3,417b0c66b13ded41,"41, 6 July 2010 (UTC)\n16:",0,0,0,0,0,0
4,0b437857f7e5d02d,"Well, I've been trying to do some google searc...",0,0,0,0,0,0


OK so I need to drop the other headings...

In [11]:
final_sample = final_sample[['id', 'comment_text', 'threat']]

In [12]:
#have a look at 50 features..
final_sample[20:70]

Unnamed: 0,id,comment_text,threat
20,9cb2a703d610e1d5,I made no comment at all on the motivations of...,0
21,80accb846ff0f51b,Anyone editing: Anything I do I guarantee is ...,0
22,a859aa2190710f4a,"""\n\n Page Protection \n\nPer a request filed ...",0
23,aadbf549885edec9,Material from theraputic vaccines - too techni...,0
24,c06ac2ed8014bbf3,It is apparent to me that you are happy to wri...,0
25,5065264585cde12b,"Deletions\nHi, I think you got confused about ...",0
26,47e6223370c67857,"""\nIf by """"the carrier"""" you mean , then she i...",0
27,3641689b545453ee,Please explain how my use was abusive. That us...,0
28,39ddec5f32ddee74,"""\nI was joking. As can be seen from my user p...",0
29,a8491203345903b5,"""One sentence is """"undue weight."""" Please revi...",0


In [13]:
# Count the number of threats
threat_count = df['threat'].sum()
threat_count

478

OK now, we have a sample of 5,000 containing 478 threats and 4,522 non-threats...

go straight to vectorising data, will skip the tokenisation step here.

In [14]:
#import necessary tools?...
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# tokenisation

In [15]:
text = "Hello! This is an example of tokenization."
tokens = sent_tokenize(text)
print(tokens)

['Hello!', 'This is an example of tokenization.']


In [16]:
def nltk_tokenizer(text):
    tokenizer = TreebankWordTokenizer()
    return tokenizer.tokenize(text)

In [17]:
#redfine the vectorizer to also contain the tokenise step ie. integrate above function.
vectorizer = TfidfVectorizer(tokenizer=nltk_tokenizer, stop_words='english', max_features=10000)

# modified to only use training data given, the testing will be done on the training data seperately (therefore no train_test_split required).

In [18]:
#have split the testing and training data into two parts, the testing data is 1/5 the size of the training data.
X_train = final_sample['comment_text']
Y_train = final_sample['threat']

Vectorize the data 

In [19]:
# Vectorize text data
X_train = vectorizer.fit_transform(X_train)

Y_train = Y_train

Y_train.head()

0    0
1    0
2    0
3    0
4    0
Name: threat, dtype: int64

In [20]:
# Try Randomized Search to determine optimum hyper parameters.

# Define the parameter space
param_distributions = {
    'n_estimators':[148],
    'max_depth':[98],
    'min_samples_split': randint(1, 200),  # Number of trees in random forest
    'min_samples_leaf': randint(1, 200),      # Maximum depth of the trees
    # other parameters and their ranges
}

# Initialize the model
model = RandomForestClassifier()

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(model, param_distributions, n_iter=50, cv=5, random_state=42)

# Fit the model
random_search.fit(X_train, Y_train)

# Best parameters
print("Best parameters:", random_search.best_params_)

Best parameters: {'max_depth': 98, 'min_samples_leaf': 103, 'min_samples_split': 180, 'n_estimators': 148}


To manually specify class weights in a Random Forest classifier in scikit-learn, you can set the class_weight parameter to a dictionary where keys are class labels and values are the weights you wish to assign to each class. This allows you to give more importance to the minority class by assigning it a higher weight compared to the majority class.

In [21]:
#Train a Logistic Regression model, using randomised search hyperparameters.
model = RandomForestClassifier(n_estimators=148, max_depth = 98, class_weight='balanced', random_state=42)
model.fit(X_train, Y_train)

# {'max_depth': 98, 'min_samples_leaf': 14, 'min_samples_split': 95, 'n_estimators': 148}

RandomForestClassifier(class_weight='balanced', max_depth=98, n_estimators=148,
                       random_state=42)

# Testing stage on unseen data.

In [22]:
df_test = pd.read_csv('/Users/jackperry/Documents/GitHub/Toxic comment challenge/data/test-1.csv')
df_test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [23]:
df_testlabels = pd.read_csv('/Users/jackperry/Documents/GitHub/Toxic comment challenge/data/test_labels.csv')
df_testlabels.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [24]:
#check for common column name - found 'id'
df_test.head(), df_testlabels.head()

(                 id                                       comment_text
 0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...
 1  0000247867823ef7  == From RfC == \n\n The title is fine as it is...
 2  00013b17ad220c46  " \n\n == Sources == \n\n * Zawe Ashton on Lap...
 3  00017563c3f7919a  :If you have a look back at the source, the in...
 4  00017695ad8997eb          I don't anonymously edit articles at all.,
                  id  toxic  severe_toxic  obscene  threat  insult  \
 0  00001cee341fdb12     -1            -1       -1      -1      -1   
 1  0000247867823ef7     -1            -1       -1      -1      -1   
 2  00013b17ad220c46     -1            -1       -1      -1      -1   
 3  00017563c3f7919a     -1            -1       -1      -1      -1   
 4  00017695ad8997eb     -1            -1       -1      -1      -1   
 
    identity_hate  
 0             -1  
 1             -1  
 2             -1  
 3             -1  
 4             -1  )

In [25]:
#join the dataframes together, pd.merge merges columns that share a common column, in this case: ID.
df_testset = pd.merge(df_test, df_testlabels, on="id")
df_testset.head()

print('length of testing data before removing redundant rows: ', len(df_testset))

length of testing data before removing redundant rows:  153164


In [26]:
# Remove rows where any of the columns have a value of -1
df_testset = df_testset[df_testset != -1].dropna()
df_testset.sample(10)

print('length of testing data after removing redundant rows: ', len(df_testset))

length of testing data after removing redundant rows:  63978


In [27]:
#break out the X and Y test data.
Y_test = df_testset[['threat']]
X_test = df_testset[['comment_text']]

print('X_test is length:', len(X_test))

print('Y_test is length:', len(Y_test))

X_test is length: 63978
Y_test is length: 63978


In [28]:
# Step 3: Vectorize the unseen data
X_test_vectorized = vectorizer.transform(X_test['comment_text'])

In [29]:
new_predictions = model.predict(X_test_vectorized)

# Display the first ten predictions
print(new_predictions[:10])

[0 0 0 0 0 0 0 0 1 0]


In [30]:
#put both test X and Y into the prediction algorithm..

In [31]:
print("Accuracy:", accuracy_score(Y_test, new_predictions)) 

Accuracy: 0.9926224639719904


In [32]:
class_report = classification_report(Y_test, new_predictions)
print(class_report)

              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00     63767
         1.0       0.26      0.65      0.37       211

    accuracy                           0.99     63978
   macro avg       0.63      0.82      0.68     63978
weighted avg       1.00      0.99      0.99     63978



Output was with the model class weights set to 2000... attempting to increase precision on class-1

              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99     63767
         1.0       0.07      0.48      0.12       211

    accuracy                           0.98     63978
   macro avg       0.53      0.73      0.55     63978
weighted avg       1.00      0.98      0.99     63978

Adjusted the model parameter to 'balanced' which means it adjusts weights inversely proportional to class frequencies..

Results...

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     63767
         1.0       0.29      0.59      0.39       211

    accuracy                           0.99     63978
   macro avg       0.64      0.79      0.69     63978
weighted avg       1.00      0.99      0.99     63978

Try: Adjust the model hyper parameters..

Model = RandomForestClassifier(n_estimators=400, class_weight='balanced', random_state=42)

Adjusted n_estimators to 400. parameter determines the number of trees in the forest. Generally, more trees increase the model's accuracy but also the computational cost.

Results...

              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00     63767
         1.0       0.28      0.60      0.38       211

    accuracy                           0.99     63978
   macro avg       0.64      0.80      0.69     63978
weighted avg       1.00      0.99      0.99     63978


Try n-estimators as 200...

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     63767
         1.0       0.29      0.60      0.39       211

    accuracy                           0.99     63978
   macro avg       0.64      0.80      0.69     63978
weighted avg       1.00      0.99      0.99     63978


Try n-estimators as 50...

             precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     63767
         1.0       0.29      0.53      0.37       211

    accuracy                           0.99     63978
   macro avg       0.64      0.76      0.69     63978
weighted avg       1.00      0.99      1.00     63978

Try n-estimators as 10...

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     63767
         1.0       0.22      0.36      0.27       211

    accuracy                           0.99     63978
   macro avg       0.61      0.68      0.63     63978
weighted avg       1.00      0.99      0.99     63978

OK, so will leave as n-estimators hyper-parameter at 50 for now. Now will try setting max_depth to 10.

              precision    recall  f1-score   support

         0.0       1.00      0.75      0.86     63767
         1.0       0.01      0.83      0.02       211

    accuracy                           0.75     63978
   macro avg       0.51      0.79      0.44     63978
weighted avg       1.00      0.75      0.86     63978

OK, leave at the default (max_depth = None which allows all the leaves to grow..)
Now try Gridsearch which tries a range of hyper_parameters automatically.. some adjustments required..

The parameter grid will search for the best max depth, min sample split and min samples leaf..

Define the parameter grid
param_grid = {
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

Create the grid search model
grid_search = GridSearchCV(RandomForestClassifier(n_estimators=100, random_state=42), param_grid, cv=5)

Fit the model
grid_search.fit(X_train, Y_train)

Print the best parameters
print("Best parameters:", grid_search.best_params_)

Grid search is too computationally exhaustive. Try randomized search.
Note: grid search tests all possible combinations of hyperparameters.
Randomized search:randomly samples a specified number of combinations from a given range for each hyperparameter

Randomised search with these parameters provided the following, note the improved recall.

Defined parameter space
param_distributions = {
    'n_estimators': randint(100, 500),  # Number of trees in random forest
    'max_depth': randint(10, 100),      # Maximum depth of the trees
    # other parameters and their ranges
}

the model
model = RandomForestClassifier()

initialise RandomizedSearchCV
random_search = RandomizedSearchCV(model, param_distributions, n_iter=10, cv=5, random_state=42)

              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00     63767
         1.0       0.23      0.63      0.34       211

    accuracy                           0.99     63978
   macro avg       0.61      0.81      0.67     63978
weighted avg       1.00      0.99      0.99     63978

Change to n_iter=40
Best parameters: {'max_depth': 98, 'n_estimators': 148}

              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00     63767
         1.0       0.26      0.65      0.37       211

    accuracy                           0.99     63978
   macro avg       0.63      0.82      0.68     63978
weighted avg       1.00      0.99      0.99     63978

Not bad, but will try randomised search for another parameter.

For:
param_distributions = {
    'min_samples_split': randint(1, 200),  # Number of trees in random forest
    'min_samples_leaf': randint(1, 200),      # Maximum depth of the trees
    # other parameters and their ranges
}

and n_iter = 40

Best parameters: {'min_samples_leaf': 103, 'min_samples_split': 180}

Try now...

              precision    recall  f1-score   support

         0.0       1.00      0.70      0.82     63767
         1.0       0.01      0.82      0.02       211

    accuracy                           0.70     63978
   macro avg       0.50      0.76      0.42     63978
weighted avg       1.00      0.70      0.82     63978

Too many leafs makes the model overly simplistic...
hyperparameter tuning can sometimes yield non-intuitive results, and it's often a matter of trial and error to find the best parameters for your specific dataset and problem.

Now focus on tuning specific hyperparameters while keeping others fixed, based on prior finding.

param_distributions = {
    'n_estimators':[148],
    'max_depth':[98],
    'min_samples_split': randint(1, 200),  # Number of trees in random forest
    'min_samples_leaf': randint(1, 200),      # Maximum depth of the trees
    # other parameters and their ranges
}

Best parameters: {'max_depth': 98, 'min_samples_leaf': 14, 'min_samples_split': 95, 'n_estimators': 148}

At this point, randomised search doesn't seem to be helpful.

              precision    recall  f1-score   support

         0.0       1.00      0.81      0.90     63767
         1.0       0.02      0.90      0.03       211

    accuracy                           0.81     63978
   macro avg       0.51      0.85      0.46     63978
weighted avg       1.00      0.81      0.89     63978


So back to default leaf settings of 1. 
" While this might be good for capturing a lot of detail and variance from the training data, it can also lead to overfitting, where the model becomes too tailored to the training data and performs poorly on unseen data."

              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00     63767
         1.0       0.26      0.65      0.37       211

    accuracy                           0.99     63978
   macro avg       0.63      0.82      0.68     63978
weighted avg       1.00      0.99      0.99     63978



# test on my own text.

In [33]:
def make_prediction(sample_text):
    sample_vectorized = vectorizer.transform([sample_text])
    sample_prediction = model.predict(sample_vectorized)# Display the prediction
    print("Prediction for the given text:")
    print(sample_prediction[0])  # This will display the class label (0 or 1 in binary classification)

In [34]:
make_prediction('sometimes you have to kill them')

Prediction for the given text:
1


# #save model as pickl file

In [35]:
import pickle

In [36]:
#serialise the model and save to file

with open('model.plk','wb') as file:
    pickle.dump(model, file)

In [39]:
#save the vectoriser...

with open('vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)