In [2]:
# Global vars
data_dir = "../../data/"
train_csv = data_dir + "train.csv"
test_csv = data_dir + "test.csv"
test_labels_csv = data_dir + "test_labels.csv"

In [3]:
import numpy as np
import spacy
import pandas as pd
import texthero as hero
from texthero import preprocessing
import threading

### Setup (run once)

In [4]:
# Uncomment the following lines and run them once
# !python -m spacy download en_core_web_lg # Need to load the large model to get the vectors

## Preparing

In [5]:
# Note: If this fails (after first installation) please restar the kernel of Jupyter
nlp = spacy.load('en_core_web_lg')

In [18]:
# Loading data
comments = pd.read_csv(data_dir + 'train.csv')
# comments = comments.head(1000)

### 1. Cleaning Data

In [19]:
# Prepare the cleaning pipeline
custom_pipeline = [ preprocessing.fillna,
                    preprocessing.lowercase,
                    preprocessing.remove_digits,
                    preprocessing.remove_punctuation,
                    preprocessing.remove_diacritics,
                    #preprocessing.remove_stopwords,
                    preprocessing.remove_whitespace,
                    preprocessing.stem]

# Clean the data
comments['comment_text'] = hero.clean(comments['comment_text'], pipeline = custom_pipeline)
comments.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explan whi the edit made under my usernam hard...,0,0,0,0,0,0
1,000103f0d9cfb60f,d aww he match this background colour i m seem...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i m realli not tri to edit war it s ju...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i can t make ani real suggest on improv i...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero ani chanc you rememb what ...,0,0,0,0,0,0


### Word Embedding

In [20]:
np.array([1,2]).size

2

In [21]:
from math import ceil

def print_arr_for_test(index, arr):
  print(f"[thread#{index}] first={arr[0]} last={arr[arr.size-1]}")

def parallel_exec(nproc=1, thread_func=None, data=np.array([])):
  data_cache = np.array([])
  threads = []
  # Prepare the array dividing
  size = data.size
  div_indeces = []
  div_width = ceil((size / nproc))
  for i in range(nproc + 1):
    current_div =  div_width * i
    if((current_div) > size):
      div_indeces.append(size)
    else:
      div_indeces.append(current_div)
  # Create threads
  for i in range(nproc + 1):
    if i == nproc:
      break
    thread = threading.Thread(target=thread_func, args=(i ,data[div_indeces[i]:div_indeces[i+1]],))
    threads.append(thread)
  # Start all threads
  for i in range(nproc):
    threads[i].start()
  # Wait for all threads to finish
  for i in range(nproc):
    threads[i].join()

# Simple test (uncomment to see)
# parallel_exec(nproc=8, thread_func=print_arr_for_test, data=np.array(list(range(100))))

In [25]:
def vectorizing_thread(_, data_for_thread):
    data_for_thread = np.array([nlp(text).vector for text in data_for_thread])

# with nlp.disable_pipes():
parallel_exec(nproc=144, thread_func=vectorizing_thread, data=comments.comment_text)
doc_vectors = np.array([nlp(text).vector for text in comments.comment_text])

In [16]:
doc_vectors[0:100]

array([[ 0.0158172 ,  0.06881687, -0.10856419, ...,  0.03626214,
         0.02597796,  0.14716129],
       [-0.04313187,  0.28973004, -0.15522698, ...,  0.076267  ,
        -0.01267175,  0.23302144],
       [-0.01319434,  0.13964319, -0.21224962, ...,  0.00339534,
        -0.00679155,  0.10196492],
       ...,
       [-0.14291227,  0.11810604, -0.07171337, ..., -0.16393983,
         0.01586574,  0.07069676],
       [-0.04500954,  0.16452092, -0.12751812, ...,  0.0241076 ,
         0.04981592,  0.12124237],
       [-0.02163135,  0.16488743, -0.25848255, ..., -0.11882128,
         0.18570466,  0.11126722]], dtype=float32)

#### Saving vectors

In [9]:
np.savetxt(data_dir + "dataset_vectors.csv", doc_vectors, delimiter=",")
#doc_vectors = pd.read_csv("toxic_clean_vectors.csv")
doc_vectors.shape

(1000, 300)

### Toxic Classifier

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(doc_vectors, comments_sample.toxic,
                                                    test_size=0.2, random_state=1)

In [11]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

# Set dual=False to speed up training, and it's not needed
svc_toxic = LinearSVC(random_state=1, dual=False, max_iter=10000)
svc_toxic.fit(X_train, y_train)

LinearSVC(dual=False, max_iter=10000, random_state=1)

In [12]:
print(f"roc_auc_score: {roc_auc_score( y_test, svc_toxic.predict(X_test)) * 100:.3f}%", )

roc_auc_score: 72.579%


In [13]:
import pickle
pickle.dump(svc_toxic, open('svc_toxic', 'wb'))

In [14]:
#loaded_model = pickle.load(open('model_svc', 'rb'))
#result = loaded_model.score(X_test, y_test)

In [15]:
#print(result)

### Severe Toxic

In [16]:
#For Severe Toxic
X_train, X_test, y_train, y_test = train_test_split(doc_vectors, comments_sample.severe_toxic,
                                                    test_size=0.2, random_state=1)

In [17]:
svc_severe = LinearSVC(random_state=1, dual=False, max_iter=10000)
svc_severe.fit(X_train, y_train)
print(f"roc_auc_score: {roc_auc_score( y_test, svc_severe.predict(X_test)) * 100:.3f}%", )

roc_auc_score: 50.000%


In [18]:
pickle.dump(svc_severe, open('svc_severe', 'wb'))

### Obscene

In [19]:
#For obscene
X_train, X_test, y_train, y_test = train_test_split(doc_vectors, comments_sample.obscene,
                                                    test_size=0.2, random_state=1)

In [20]:
svc_obscene = LinearSVC(random_state=1, dual=False, max_iter=10000)
svc_obscene.fit(X_train, y_train)

LinearSVC(dual=False, max_iter=10000, random_state=1)

In [21]:
print(f"roc_auc_score: {roc_auc_score( y_test, svc_obscene.predict(X_test)) * 100:.3f}%", )

roc_auc_score: 78.901%


In [22]:
pickle.dump(svc_obscene, open('svc_obscene', 'wb'))

### Threat

In [23]:
#For Threat
X_train, X_test, y_train, y_test = train_test_split(doc_vectors, comments_sample.threat,
                                                    test_size=0.2, random_state=1)

In [24]:
svc_threat = LinearSVC(random_state=1, dual=False, max_iter=10000)
svc_threat.fit(X_train, y_train)

LinearSVC(dual=False, max_iter=10000, random_state=1)

In [25]:
print(f"roc_auc_score: {roc_auc_score( y_test, svc_threat.predict(X_test)) * 100:.3f}%", )

roc_auc_score: 50.000%


In [26]:
pickle.dump(svc_threat, open('svc_threat', 'wb'))

### Insult

In [27]:
#For Insult
X_train, X_test, y_train, y_test = train_test_split(doc_vectors, comments_sample.insult,
                                                    test_size=0.2, random_state=1)

In [28]:
svc_insult = LinearSVC(random_state=1, dual=False, max_iter=10000)
svc_insult.fit(X_train, y_train)

LinearSVC(dual=False, max_iter=10000, random_state=1)

In [29]:
print(f"roc_auc_score: {roc_auc_score( y_test, svc_insult.predict(X_test)) * 100:.3f}%", )

roc_auc_score: 63.372%


In [31]:
pickle.dump(svc_insult, open('svc_insult', 'wb'))

### Identity Hate

In [32]:
#For identity_hate
X_train, X_test, y_train, y_test = train_test_split(doc_vectors, comments_sample.identity_hate,
                                                    test_size=0.2, random_state=1)

In [33]:
svc_identity_hate = LinearSVC(random_state=1, dual=False, max_iter=10000)
svc_identity_hate.fit(X_train, y_train)

LinearSVC(dual=False, max_iter=10000, random_state=1)

In [34]:
print(f"roc_auc_score: {roc_auc_score( y_test, svc_identity_hate.predict(X_test)) * 100:.3f}%", )

roc_auc_score: 50.000%


In [36]:
pickle.dump(svc_identity_hate, open('svc_identity_hate', 'wb'))

In [38]:
test_comments = pd.read_csv(data_dir + 'test.csv')
test_comments.head()


Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [39]:
test_comments_sample = test_comments
test_comments_sample['comment_text'] = hero.clean(test_comments_sample['comment_text'], pipeline = custom_pipeline)
with nlp.disable_pipes():
    test_doc_vectors = np.array([nlp(text).vector for text in test_comments_sample.comment_text])
np.savetxt("test_vectors.csv", test_doc_vectors, delimiter=",")


In [40]:
predictions = pd.DataFrame()
predictions["id"] = test_comments_sample.id
predictions["toxic"] = svc_toxic.predict(test_doc_vectors)
predictions["severe_toxic"] = svc_severe.predict(test_doc_vectors)
predictions["obscene"] = svc_obscene.predict(test_doc_vectors)
predictions["threat"] = svc_threat.predict(test_doc_vectors)
predictions["insult"] = svc_insult.predict(test_doc_vectors)
predictions["identity_hate"] = svc_identity_hate.predict(test_doc_vectors)

In [41]:
predictions.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,1,0,1,0,1,0
1,0000247867823ef7,0,0,0,0,0,0
2,00013b17ad220c46,0,0,0,0,0,0
3,00017563c3f7919a,0,0,0,0,0,0
4,00017695ad8997eb,0,0,0,0,0,0


In [42]:
predictions.to_csv(data_dir + 'submission.csv',index=False)

In [43]:
predictions.shape

(153164, 7)