In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('2020-12-31-DynamicallyGeneratedHateDataset-entries-v0.1.csv', encoding='ISO-8859-1')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,id,text,label,type,model_wrong,db.model_preds,status,round,split,annotator
0,1,10000,It should be legal for a husband to rape his wife,hate,notgiven,True,0.97457,dynabench entry,1,train,gnzuctwed3
1,2,10001,dalits are lowlife scum,hate,notgiven,False,0.08233,dynabench entry,1,test,cagnluiznm
2,3,10002,dalits are lowlives,hate,notgiven,True,0.92319,dynabench entry,1,train,cagnluiznm
3,4,10003,It was a better world when women dare not ques...,hate,notgiven,True,0.99006,dynabench entry,1,test,gnzuctwed3
4,5,10004,I don't work this hard so that those immigrant...,hate,notgiven,True,0.98836,dynabench entry,1,train,cagnluiznm
...,...,...,...,...,...,...,...,...,...,...,...
40618,40736,9995,we should forcibly remove all the foreigners f...,hate,notgiven,True,0.86945,dynabench entry,1,train,cagnluiznm
40619,40737,9996,we should forcibly remove all the immigrants f...,hate,notgiven,True,0.50039,dynabench entry,1,train,cagnluiznm
40620,40738,9997,dalits are unworthy of the Gods love,hate,notgiven,True,0.97652,dynabench entry,1,train,cagnluiznm
40621,40739,9998,dalits are worthless,hate,notgiven,True,0.94326,dynabench entry,1,train,cagnluiznm


In [4]:
df=df[['text','label']]

In [5]:
df

Unnamed: 0,text,label
0,It should be legal for a husband to rape his wife,hate
1,dalits are lowlife scum,hate
2,dalits are lowlives,hate
3,It was a better world when women dare not ques...,hate
4,I don't work this hard so that those immigrant...,hate
...,...,...
40618,we should forcibly remove all the foreigners f...,hate
40619,we should forcibly remove all the immigrants f...,hate
40620,dalits are unworthy of the Gods love,hate
40621,dalits are worthless,hate


In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize



In [8]:
class TextNormalizer(BaseEstimator, TransformerMixin):
    def fit (self, X, y=None, **fit_params):
        return self
    def transform(self, X, y=None, **fit_params):
        X_copy = X.copy()
        for i in range(len(X)):
            X_copy[i] = ' '.join([token.lower()
                                 for token in word_tokenize(X_copy[i])])
        return X_copy

In [9]:
norm= TextNormalizer()

In [10]:
norm.fit_transform(['Ana apple a day keeps the doctor away!'])

['ana apple a day keeps the doctor away !']

In [11]:
class WordExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, stop_words):
        self.stop_words = stop_words
    
    def fit(self, X, y=None, **fit_params):
        self.general_freq = FreqDist()
        for document in X:
            tokens = word_tokenize(document)
            freq = FreqDist(tokens)
            self.general_freq.update(freq)
        self.hapaxes = self.general_freq.hapaxes()
        return self
    
    def transform(self, X, y=None, **fit_params):
        X_copy = X.copy()
        for i in range(len(X)):
            X_copy[i] = ' '.join([token for token in word_tokenize(X[i])
                                 if token not in self.hapaxes and
                                 token not in self.stop_words])
        return X_copy

In [12]:
stop_words=stopwords.words('english')

In [13]:
word_extractor = WordExtractor(stop_words)

In [14]:
corpus = [
    'John is a preaty boy',
    'Ann likes John',
    'Ann likes cherry',
    'Cherry is red'
]

In [15]:
word_extractor.fit_transform(corpus)

['John', 'Ann likes John', 'Ann likes', '']

In [16]:
class ApplyStemmer(BaseEstimator, TransformerMixin):
    def __init__(self, stemmer):
        self.stemmer = stemmer
    
    def fit(self, X, y=None, **fit_tranform):
        return self
    
    def transform(self, X, y=None, **fit_tranform):
        X_copy = X.copy()
        for i in range(len(X)):
            X_copy[i] = ' '.join([self.stemmer.stem(token) 
                                  for token in word_tokenize(X_copy[i])])
        return X_copy

In [17]:
porter_stemmer = PorterStemmer()

In [18]:
apply_stemmer = ApplyStemmer(porter_stemmer)

In [19]:
apply_stemmer.fit_transform(['Ana apple a day keeps the doctor away!'])

['ana appl a day keep the doctor away !']

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
pipe = Pipeline([
    ("norm", TextNormalizer()),
    ("extractor", WordExtractor(stop_words)),
    ("stemmer", ApplyStemmer(PorterStemmer())),
    ("vectorizer", CountVectorizer()),
    ("logic",LogisticRegression())
])

In [25]:
X= df['text'].values
y=df['label'].values

In [26]:
X_train, X_test, y_train, y_test=train_test_split(X,y)

In [27]:
pipe.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('norm', TextNormalizer()),
                ('extractor',
                 WordExtractor(stop_words=['i', 'me', 'my', 'myself', 'we',
                                           'our', 'ours', 'ourselves', 'you',
                                           "you're", "you've", "you'll",
                                           "you'd", 'your', 'yours', 'yourself',
                                           'yourselves', 'he', 'him', 'his',
                                           'himself', 'she', "she's", 'her',
                                           'hers', 'herself', 'it', "it's",
                                           'its', 'itself', ...])),
                ('stemmer', ApplyStemmer(stemmer=<PorterStemmer>)),
                ('vectorizer', CountVectorizer()),
                ('logic', LogisticRegression())])

In [28]:
y_pred= pipe.predict(X_test)

In [29]:
accuracy_score(y_pred, y_test)

0.7291256400157542

In [30]:
import pickle

In [31]:
pickle.dump(pipe, open("nlp_pipe.pkl", 'wb'))