In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import pickle
from nltk.tokenize import TweetTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import string
import re

from utils import load_to_pd, load_augmented_df

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Malth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
np.random.seed(17) 

In [4]:
# Load in the data:
dataset = 'hate'
df_train = load_to_pd('train', dataset)
df_test = load_to_pd('test', dataset)
df_val = load_to_pd('val', dataset)
# Load offensive dataset
df_offensive = load_augmented_df('offensive_sentences_bigram', dataset)

# Combine train and test
df_all = pd.concat([df_train, df_test])

## Tokenize

In [5]:
tk = TweetTokenizer()
porter = PorterStemmer()
english_stopwords = stopwords.words('english')

def preprocess(tweet_text):
    # Lowercase
    tweet_text = tweet_text.lower()

    # Remove punctuation using regex
    tweet_text = re.sub(r'[^\w\s]', '', tweet_text)

    # Remove stopwords from string
    tweet_text = ' '.join([word for word in tweet_text.split() if word not in english_stopwords])

    # Stem the tokens
    stemmed = [porter.stem(word) for word in tweet_text.split(' ')]

    return ' '.join(stemmed)

def tokenize(tweet_text):
    return tk.tokenize(tweet_text)

In [None]:
# Test the above functions
tokenize(preprocess("Hello @user. This is a very LONG test sentence with. and ! ? #maga"))

In [None]:
# Create pipeline
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(stop_words=None, preprocessor=preprocess, tokenizer=tokenize,)),
        ("tfidf", TfidfTransformer()),
        ("clf", SGDClassifier()),
        # ("clf", MultinomialNB()),
    ]
)

# Fit the pipeline for testing purposes
# pipeline.fit(df_train['text'], df_train['label'])

# print("Pipeline with default arguments:")
# print("Pipeline:", [name for name, _ in pipeline.steps])
# print("Fitting model to training data...")
# pipeline.fit(df_train['text'], df_train['label'])
# print("Model fitted!")

# print("Evaluating model...")
# print(f"Model score: {pipeline.score(df_test['text'], df_test['label'])}")
# print("Model evaluated!")

In [None]:
# Using grid search to find the optimal parameters for the pipeline components
parameters = {
    'vect__max_df': (0.7, 1.0),
    'vect__min_df': (0, 0.05),
    'vect__max_features': (None, 5000, 10000),
    'tfidf__use_idf': (True, False),
    # 'clf__loss': ('hinge', 'log'), # loss function of sgd classifier
    # 'clf__alpha': (0.00001, 0.001),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=3)
print("Performing grid search...")
grid_search.fit(df_train['text'], df_train['label'])
print("Grid search complete!")
print(f"Best score: {grid_search.best_score_}")
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print(f"{param_name}: {best_parameters[param_name]}")

In [None]:
''' 
Below are the best parameters found for the pipeline using the hate dataset:
    Best score: 0.7584444444444445
    Best parameters set:
    clf__alpha: 1e-05
    clf__loss: log
    tfidf__use_idf: False
    vect__max_df: 1.0
    vect__max_features: 10000
    vect__min_df: 0

For the sentiment is is: 
    Best score: 0.6477912967225693
    Best parameters set:
    tfidf__use_idf: True
    vect__max_df: 1.0
    vect__max_features: 10000
    vect__min_df: 0
'''

In [6]:
# A function that takes in a model and outputs the performance
def performance(model, X, y):
    y_pred = model.predict(X)
    if dataset == 'hate':
        target_names = ['not hate', 'hate']
    elif dataset == 'sentiment':
        target_names = ['negative', 'neutral', 'positive']
    print(classification_report(y, y_pred, target_names=target_names))
    return model.score(X, y)
    
performance(grid_search, df_val['text'], df_val['label'])

NameError: name 'grid_search' is not defined

In [None]:
#predicting hate speech

#not hate example
not_hate_string = "This is a test sentence that is not hate."
grid_search.predict([not_hate_string])

#hate example
hate_string = "all immigrants should rot in jail #kill"
grid_search.predict([hate_string])

## Testing with new offensive sentences

In [7]:
# Train the pipeline model with the best found parameters
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(stop_words=None, preprocessor=preprocess, tokenizer=tokenize, max_df=1.0, min_df=0, max_features=10000)),
        ("tfidf", TfidfTransformer(use_idf=False)),
        ("clf", SGDClassifier(alpha=1e-05, loss='log')),
    ]
)

model = pipeline.fit(df_train['text'], df_train['label'])


In [8]:
performance(model, df_val['text'], df_val['label'])

              precision    recall  f1-score   support

    not hate       0.75      0.71      0.73       573
        hate       0.64      0.68      0.66       427

    accuracy                           0.70      1000
   macro avg       0.69      0.70      0.69      1000
weighted avg       0.70      0.70      0.70      1000



0.698

In [9]:
# Generate labels for offensive sentences
df_offensive['label'] = model.predict(df_offensive['text'])

In [None]:
# Print 10 random sentences from the offensive dataset
df_offensive[df_offensive['label'] == 1].sample(10)

In [10]:
# Retrain the model with the offensive dataset

# Concat the offensive dataset with the training dataset
df_all_train = pd.concat([df_train, df_offensive])

model = pipeline.fit(df_all_train['text'], df_all_train['label'])

In [11]:
# Check new performance
performance(model, df_val['text'], df_val['label'])

              precision    recall  f1-score   support

    not hate       0.76      0.71      0.73       573
        hate       0.64      0.70      0.67       427

    accuracy                           0.70      1000
   macro avg       0.70      0.70      0.70      1000
weighted avg       0.71      0.70      0.70      1000



0.703

### Result of adding augmented data
After adding the new augmented bigram data and training a new model with this data, we saw a tiny increase in accuracy (0.698 before and 0.703 with). Also the recall for not-hate stayed the same and precision increased by 0.01.

After adding trigram augmented data accuracy score increased from 0.698 to 0.701. For none-hate recall is improved by 0.07, and precision decreased by 0.03. This could be because of the now broader non hateful training labels.