In [1]:
# Importing libraries
import nltk
import pandas
import numpy
import string
import re
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Reading the train data set
# The CSV path is relative to the notebook's working directory.
data_set = pandas.read_csv("hate_speech_train.csv") 
# Columns are picked by position: column 0 is bound to `text`, column 1 to `labels`.
# NOTE(review): presumably column 0 is the 'text' column that later cells access
# by name, and column 1 holds the class labels — confirm against the CSV header.
text = data_set.iloc[:,0]
labels = data_set.iloc[:,1]

In [3]:
# Reading the test data set
# The CSV path is relative to the notebook's working directory.
test_ds = pandas.read_csv("hate_speech_test.csv") 
# First column selected by position.
# NOTE(review): presumably the same 'text' column that later cells access by
# name — confirm against the CSV header.
test_text = test_ds.iloc[:,0]

In [4]:
# Exploring the data set

# print("data set shape :",data_set.shape)
# print("data set columns :",list(data_set.columns))

# Checking if data set has null values
# print(data_set.isnull().sum())

# filt_0 = (data_set['labels'] == 0)
# print(filt_0)
# print("no. of label 0 rows ",data_set.loc[filt_0].shape[0])

# filt_1 = (data_set['labels'] == 1)
# print(filt_1)
# print("no. of label 1 rows ",data_set.loc[filt_1].shape[0])

# Printing the data set
# data_set.head()

In [5]:
# string.punctuation

In [6]:
def remove_punct(txt):
    """Return ``txt`` with every ASCII punctuation character deleted.

    Characters listed in ``string.punctuation`` are dropped outright; nothing
    is inserted in their place, so words separated only by punctuation are
    fused together (``"a...b"`` becomes ``"ab"``). All other characters,
    including whitespace, are kept in their original order.

    Parameters
    ----------
    txt : str
        Text to clean.

    Returns
    -------
    str
        The text without punctuation characters.
    """
    return "".join(ch for ch in txt if ch not in string.punctuation)

# Add a punctuation-free copy of the raw text to both frames. Train and test
# go through the identical cleaning step so their features stay comparable.
for frame in (data_set, test_ds):
    frame['no_punct_txt'] = frame['text'].apply(remove_punct)

In [7]:
def tokenize(txt):
    """Split ``txt`` into tokens on runs of non-word characters.

    Parameters
    ----------
    txt : str
        Text to split. Case is left untouched; callers lower-case beforehand.

    Returns
    -------
    list of str
        The non-empty tokens in their original order. An empty or
        all-separator input yields an empty list.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence,
    # which newer Python versions warn about.
    pieces = re.split(r'\W+', txt)
    # re.split emits '' when the text starts or ends with a separator (or is
    # empty); those are not real tokens, so drop them.
    return [piece for piece in pieces if piece]

# Lower-case the punctuation-free text and split it into tokens, for the
# training frame first and then the test frame.
for frame in (data_set, test_ds):
    frame['tokenized_txt'] = frame['no_punct_txt'].apply(
        lambda raw: tokenize(raw.lower())
    )

In [8]:
# English stop words from NLTK; remove_stpwrds() filters tokens against this.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available
# locally (e.g. via nltk.download('stopwords')) — confirm for fresh environments.
stop_words = nltk.corpus.stopwords.words('english')
# print(stop_words[:179])

In [9]:
def remove_stpwrds(txt):
    """Return the tokens of ``txt`` that are not in the module-level ``stop_words``.

    Parameters
    ----------
    txt : iterable of str
        Tokens to filter. The comparison is exact, so the tokens need to be in
        the same case as the entries of ``stop_words``.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    return [token for token in txt if token not in stop_words]

# Drop stop words from the token lists of the training frame, then the test frame.
for frame in (data_set, test_ds):
    frame['no_stop_words'] = frame['tokenized_txt'].apply(remove_stpwrds)

In [10]:
# One stemmer instance shared by every call to stemming().
ps = PorterStemmer()


def stemming(txt):
    """Stem each token of ``txt`` with the shared Porter stemmer ``ps``.

    Parameters
    ----------
    txt : iterable of str
        Tokens to stem.

    Returns
    -------
    list of str
        One stemmed token per input token, in the same order.
    """
    return list(map(ps.stem, txt))

# Stem the stop-word-free tokens of the training frame, then the test frame.
for frame in (data_set, test_ds):
    frame['stem_txt'] = frame['no_stop_words'].apply(stemming)

In [11]:
# Lemmatizer instance shared by every call to lemmatization().
# NOTE(review): presumably requires the NLTK 'wordnet' corpus to be available
# locally — confirm for fresh environments.
wn = nltk.WordNetLemmatizer()

In [12]:
def lemmatization(txt):
    """Lemmatize each token of ``txt`` with the shared lemmatizer ``wn``.

    ``wn.lemmatize`` is called with the word only, so the lemmatizer's default
    part-of-speech setting applies to every token.

    Parameters
    ----------
    txt : iterable of str
        Tokens to lemmatize.

    Returns
    -------
    list of str
        One lemmatized token per input token, in the same order.
    """
    return [wn.lemmatize(token) for token in txt]

# Lemmatize the stop-word-free tokens of the training frame, then the test frame.
for frame in (data_set, test_ds):
    frame['lemmatized_txt'] = frame['no_stop_words'].apply(lemmatization)

In [13]:
# Preview the stemmed token lists that feed the vectorizer.
# Selected by column name rather than by position: a positional index silently
# picks a different column if the CSV gains a column or a preprocessing cell
# is added, removed or re-ordered.
tmp2 = data_set['stem_txt']
tmp3 = test_ds['stem_txt']
print(tmp2)
print(tmp3)

0       [realdonaldtrump, one, worst, time, american, ...
1       [crowd, oval, today, ausvind, hold, balidan, b...
2       [skroskz, shossy2, joebiden, biden, amp, son, ...
3       [etsi, shop, benedict, donald, call, presid, t...
4       [realdonaldtrump, good, build, wall, around, a...
                              ...                        
5261    [icc, allow, ms, dhoni, keep, glove, attach, i...
5262    [trump, avoid, movi, pirat, cours, illeg, down...
5263    [notic, recent, jami, oliv, restaur, closingi,...
5264    [teamindia, gear, okay, what, glove, arm, forc...
5265    [piec, paper, mccarthi, use, waiv, around, lis...
Name: stem_txt, Length: 5266, dtype: object
0                   [assang, rapist, httpstcom4sfw7csxc]
1      [gandinaaliabus, mp, say, cut, throat, muslim,...
2      [candl, light, silent, protest, mysor, mysor, ...
3      [shameonicc, 1, icc, dhoni, glove, vs, 2icc, p...
4      [icc, look, pak, teamwht, go, onnw, appropriat...
                             ... 

In [704]:
# Build TF-IDF features from the stemmed tokens.
print(test_ds.head())
# Select the stemmed tokens by column name; positional indices break silently
# when the frame's column layout changes.
tmp2 = data_set['stem_txt']
tmp3 = test_ds['stem_txt']
print(tmp3.head())
# The vectorizer is fed one string per document, so re-join each token list.
value1 = [' '.join(row) for row in tmp2]
value2 = [' '.join(row) for row in tmp3]
# Learn the vocabulary and IDF weights from the training text only, then
# project the test text onto that same vocabulary so both matrices share
# their columns.
vectorizer = TfidfVectorizer()
vectorized_ds = vectorizer.fit_transform(value1)
vectorized_ts = vectorizer.transform(value2)
print(vectorized_ds.shape)
print(vectorized_ts.shape)

                                                text  \
0  #Assange is not a #rapist  https://t.co/M4sfW7...   
1  #GandiNaaliAbuse | Where an MP says that he wi...   
2  Candle light silent protest in MYSORE, by Myso...   
3  #ShameOnICC  1. ICC on Dhoni's gloves         ...   
4  #ICC ...look at pak team...wht is going on.......   

                                        no_punct_txt  \
0        Assange is not a rapist  httpstcoM4sfW7csXC   
1  GandiNaaliAbuse  Where an MP says that he will...   
2  Candle light silent protest in MYSORE by Mysor...   
3  ShameOnICC  1 ICC on Dhonis gloves            ...   
4  ICC look at pak teamwht is going onnw this is ...   

                                       tokenized_txt  \
0  [assange, is, not, a, rapist, httpstcom4sfw7csxc]   
1  [gandinaaliabuse, where, an, mp, says, that, h...   
2  [candle, light, silent, protest, in, mysore, b...   
3  [shameonicc, 1, icc, on, dhonis, gloves, vs, 2...   
4  [icc, look, at, pak, teamwht, is, going, on

In [705]:
# Splitting
# Hold out 30% of the vectorized training rows for validation; the fixed
# random_state makes the split repeatable across runs.
train_text, validate_text, train_labels, validate_labels = train_test_split(
    vectorized_ds,
    labels,
    test_size=0.3,
    random_state=42,
)

In [706]:
# Svm classifier
# Linear-kernel SVM fitted on the training split and scored on the validation split.
svclassifier = SVC(kernel = 'linear' , C = 1.0)

svclassifier.fit(train_text, train_labels)

pred_labels = svclassifier.predict(validate_text)
# Binary F1 (the f1_score default), printed in the same form as the
# logistic-regression cell so the two models can be compared directly.
print("f1 score:", f1_score(validate_labels, pred_labels))
# Micro-averaged F1 counts TP/FP/FN over all classes; for single-label
# predictions that is the same number as accuracy, not a per-class F1.
# Kept as the cell's displayed value.
f1_score(validate_labels, pred_labels, average='micro')

0.660759493670886

In [707]:
# accuracy = metrics.accuracy_score(validate_labels, pred_labels)
# print("accuracy",accuracy)

In [708]:
# Logistic regression
classifier = LogisticRegression(random_state=0)

# Sanity check before fitting: print shapes and container types of the split
# pieces, one value per line.
for detail in (
    train_text.shape,
    type(train_text),
    train_labels.shape,
    type(train_labels),
    validate_labels.shape,
    type(validate_text),
    validate_text.shape,
    type(validate_labels),
):
    print(detail)

# Fit on the training split, then score the held-out validation split.
classifier.fit(train_text, train_labels)
pred_labels = classifier.predict(validate_text)
print(type(pred_labels))
print("f1 score:", f1_score(validate_labels, pred_labels))
print("accuracy : ", accuracy_score(validate_labels, pred_labels))

(3686, 16253)
<class 'scipy.sparse.csr.csr_matrix'>
(3686,)
<class 'pandas.core.series.Series'>
(1580,)
<class 'scipy.sparse.csr.csr_matrix'>
(1580, 16253)
<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>
f1 score: 0.7658089838639337
accuracy :  0.660126582278481


In [709]:
# Logistic regression on test data
# print(vectorized_ds.shape)
# print(type(vectorized_ds))
# print(labels.shape)
# print(type(labels))
# print(vectorized_ts.shape)
# print(type(vectorized_ts))
# Refit a fresh model on every labelled training row (no hold-out this time),
# then predict labels for the vectorized test set.
classifier = LogisticRegression(random_state=0).fit(vectorized_ds, labels)
pred_labels = classifier.predict(vectorized_ts)
# print(type(pred_labels))
# print(pred_labels.shape)
# print(pred_labels)
# pred_labels = list(pred_labels)
# print(type(pred_labels))

In [710]:
# Write the predictions to submission.csv as integers ('%d') under a single
# header row 'labels'. comments='' removes the '# ' prefix that numpy.savetxt
# would otherwise put in front of the header text.
# NOTE(review): `pred_labels` is reassigned by several earlier cells; this
# assumes the last one executed was the test-set prediction cell — confirm
# before submitting.
numpy.savetxt("submission.csv",pred_labels,header='labels',fmt='%d',comments='')