In [3]:
import os
from google.colab import drive
# Mount Google Drive and switch into the project folder.
drive.mount('/content/drive', force_remount=True)
# Absolute path so this cell can be re-run: a relative chdir only resolves the
# first time, while the working directory is still /content.
os.chdir("/content/drive/MyDrive/Colab Notebooks/CS7650/final")
os.listdir()

Mounted at /content/drive


['data', 'Preprocessing', 'Models', 'resources.gdoc', 'BiLSTM.pt']

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from collections import Counter

In [None]:
# Module-level NLP resources read by clean_text: an English Snowball stemmer
# and the English stopword list from the NLTK corpus.
stemmer = nltk.stem.snowball.SnowballStemmer('english')
nltk.download("stopwords")
stopwords = nltk.corpus.stopwords.words("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Load the training split (path is relative to the project folder) and preview it.
df = pd.read_csv(filepath_or_buffer='data/train.csv')
df.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [10]:
# The six binary label columns.
classes = [
  'toxic',
  'severe_toxic',
  'obscene',
  'threat',
  'insult',
  'identity_hate',
]
# Per-label share of 0s vs. 1s, one section per label.
for c in classes:
  print(c, df[c].value_counts(normalize=True), '------', sep='\n')
# Share of every distinct combination of the six labels.
df[classes].value_counts(normalize=True)

toxic
0    0.904156
1    0.095844
Name: toxic, dtype: float64
------
severe_toxic
0    0.990004
1    0.009996
Name: severe_toxic, dtype: float64
------
obscene
0    0.947052
1    0.052948
Name: obscene, dtype: float64
------
threat
0    0.997004
1    0.002996
Name: threat, dtype: float64
------
insult
0    0.950636
1    0.049364
Name: insult, dtype: float64
------
identity_hate
0    0.991195
1    0.008805
Name: identity_hate, dtype: float64
------


toxic  severe_toxic  obscene  threat  insult  identity_hate
0      0             0        0       0       0                0.898321
1      0             0        0       0       0                0.035508
                     1        0       1       0                0.023814
                                      0       0                0.011017
                     0        0       1       0                0.007614
       1             1        0       1       0                0.006198
       0             1        0       1       1                0.003873
0      0             1        0       0       0                0.001987
                     0        0       1       0                0.001886
1      1             1        0       1       1                0.001661
0      0             1        0       1       0                0.001134
1      1             1        0       0       0                0.000990
       0             0        0       0       1                0.000852
    

In [None]:
# Test comments and their labels live in separate files; join them on `id`.
df_test = pd.read_csv('data/test.csv')
df_test_labels = pd.read_csv('data/test_labels.csv')
df_test = df_test.merge(df_test_labels, on=['id'])
# Drop rows whose `toxic` value is -1 (presumably the marker for rows without
# usable labels - confirm against the dataset description).
df_test = df_test.loc[df_test['toxic'] != -1]
df_test.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0
7,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0


In [None]:
def clean_text(df, remove_stop=True, stem_words=True):
  """Return a cleaned copy of ``df`` with a normalised ``comment_text`` column.

  Steps, in order: collapse whitespace runs to a single space, lowercase,
  optionally drop stopwords, optionally stem, strip punctuation, keep only
  tokens made purely of ASCII letters/digits, and finally drop rows whose
  comment contains no ``[a-z0-9]`` character at all.

  Args:
    df: DataFrame with a ``comment_text`` column. It is not modified.
    remove_stop: if True, drop tokens found in the module-level ``stopwords``.
    stem_words: if True, stem every token with the module-level ``stemmer``.

  Returns:
    A new DataFrame holding the surviving rows; the original index labels are
    kept, so the index may have gaps.
  """
  df = df.copy(deep=True)
  # Missing comments become empty strings so the token-level steps below do not
  # crash on NaN; such rows are removed by the final filter.
  text = df['comment_text'].fillna('')
  # newlines and other excessive whitespace
  # regex=True is explicit: pandas >= 2.0 treats the pattern literally by default.
  text = text.str.replace(r'\s+', ' ', regex=True)
  # lowercase
  text = text.str.lower()
  # honestly not sure if we need to do the stopwords, punctuation, and stuff.
  if remove_stop:
    # Stopwords are matched before punctuation is stripped, so a token with
    # attached punctuation (e.g. "the,") is not removed.
    stop_set = set(stopwords)  # set membership instead of scanning a list per token
    text = text.apply(lambda x: ' '.join(w for w in x.split() if w not in stop_set))
  if stem_words:
    text = text.apply(lambda x: ' '.join(stemmer.stem(w) for w in x.split()))
  # what do I do with conjunctions?
  # punctuation
  text = text.str.replace(r'[^\w\s]', '', regex=True)
  # keep only alnum tokens (drops tokens with underscores or non-ASCII characters)
  pattern = re.compile("[A-Za-z0-9]+")
  text = text.apply(lambda x: ' '.join(w for w in x.split() if pattern.fullmatch(w)))
  df['comment_text'] = text
  # drop rows with empty comments
  return df[text.str.contains('[a-z0-9]')]

def prepare_text(df, count_vec=True, glove=False):
  """Placeholder for feature preparation; currently returns a deep copy of ``df``.

  ``count_vec`` and ``glove`` are accepted but not used yet.
  """
  prepared = df.copy(deep=True)
  return prepared

In [None]:
# Train split: normalise the text only - stopwords are kept and nothing is stemmed.
df = clean_text(df, stem_words=False, remove_stop=False)
df.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww he matches this background colour im seem...,0,0,0,0,0,0
2,000113f07ec002fd,hey man im really not trying to edit war its j...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i cant make any real suggestions on impro...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0


In [None]:
# Test split: remove stopwords, no stemming.
# NOTE(review): the saved output under this cell shows stemmed tokens
# (e.g. "horribl"), so it presumably comes from an earlier run with
# stem_words=True - re-run to refresh it. The train split is cleaned with
# remove_stop=False; confirm that the train/test mismatch is intended.
df_test = clean_text(df_test, stem_words=False, remove_stop=True)
df_test.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
5,0001ea8717f6de06,thank understand think high would revert witho...,0,0,0,0,0,0
7,000247e83dcc1211,dear god site horribl,0,0,0,0,0,0
11,0002f87b16116a7f,somebodi invari tri add religion realli mean w...,0,0,0,0,0,0
13,0003e1cccfd5a40a,say right type type institut need case three l...,0,0,0,0,0,0
14,00059ace3e3e9a53,ad new product list make sure relev ad new pro...,0,0,0,0,0,0


In [None]:
# Create the output folder first: to_csv does not create missing directories.
os.makedirs('data/clean', exist_ok=True)
# NOTE(review): the index is written here, while the test split is saved with
# index=False - confirm downstream readers expect the extra unnamed column.
df.to_csv('data/clean/train_clean.csv')

In [None]:
# Create the output folder first: to_csv does not create missing directories.
os.makedirs('data/clean', exist_ok=True)
df_test.to_csv('data/clean/test_clean_stop.csv', index=False)