In [46]:
import re
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import os
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords

from torchtext.vocab import GloVe, vocab
import torchtext
from torchtext.data.utils import get_tokenizer

In [47]:
import nltk
def data_cleaning(text):
    # 大小写
    text = text.replace('\n', ' ')
    text = text.replace('\r', ' ')
    text = text.replace('\t', ' ')
    text = text.replace("#" , " ")
    text = text.replace("." , "")
    text = text.replace("%", "")


    text = re.sub('https?://[A-Za-z0-9./]+', '', text)
    text = re.sub('http?://[A-Za-z0-9./]+', '', text)
    text = re.sub('www.[A-Za-z0-9./]+', '', text)
    text = re.sub("\d+", "", text)

    # Stopwords
    stopwords = set(nltk.corpus.stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stopwords])


    encoded_string = text.encode("ascii", "ignore")
    decode_string = encoded_string.decode()
    return decode_string

In [48]:
data_folder_path = '/mnt/a/OneDrive/UNSW/COMP9444/9444_toxic_comment_classification/data/'
df = pd.read_csv(data_folder_path + 'train.csv')
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [49]:
df['comment_text'] = df['comment_text'].apply(data_cleaning)

In [50]:
def count_words(comment):
    comment = comment.split()
    return len(comment)
lens = df['comment_text'].apply(count_words)
lens.describe(), np.quantile(lens, 0.98)

(count    159571.000000
 mean         39.459563
 std          60.610472
 min           0.000000
 25%          10.000000
 50%          21.000000
 75%          43.000000
 max        1411.000000
 Name: comment_text, dtype: float64,
 225.0)

In [52]:
df.head()
df.to_csv(data_folder_path + 'train_cleaned.csv', index=False)

In [41]:
num_class = len(classes)
glove_vectors = GloVe('twitter.27B', dim=100)
glove_vocab = vocab(glove_vectors.stoi)
print(glove_vectors['fucccck'])

tensor([-0.0196,  0.2471,  0.0933,  0.9827, -0.4932, -0.5208, -0.5511, -0.2384,
         0.0633,  0.2640, -0.0855, -0.0142, -0.2185,  0.0615, -0.0203, -0.5917,
        -0.5143, -0.4775,  0.0501, -0.5398, -0.4444,  0.5033,  0.5590, -0.1862,
        -0.1925,  0.8136,  0.0792,  0.6563, -0.2354, -0.3717, -0.3733,  0.2547,
         0.5859,  0.2547, -1.5402,  0.4177, -0.4178,  0.6288,  0.1763, -0.1376,
        -0.0983,  0.4113,  0.1282, -0.9423, -0.8857,  0.9278, -0.1266,  0.5425,
         0.6687,  1.1172,  0.1437,  0.1578, -0.1240, -0.2762,  0.0530, -0.0099,
        -0.4356, -0.1290,  0.0104,  0.6156,  0.5624,  0.0147,  0.2363,  0.2392,
        -0.4270,  0.2027,  0.0283,  0.2958,  0.7260,  0.1783,  0.0921,  0.3728,
         0.2069, -0.1167, -0.4945, -0.8139,  0.7111,  0.1753,  0.3670,  1.3172,
         0.4816,  0.0983, -0.3783, -0.1295,  0.2076,  0.6489, -0.7640, -0.6700,
        -0.2590, -0.8446, -0.0727,  0.3674,  0.7326,  0.3010, -0.8171,  0.6957,
         0.1134,  0.1629,  0.4557,  0.01

In [42]:
glove_vocab.insert_token('<unk>', 0)
glove_vocab.set_default_index(0)
pretrained_embeddings = glove_vectors.vectors
pretrained_embeddings = torch.cat((torch.zeros(1,pretrained_embeddings.shape[1]),pretrained_embeddings))

In [43]:
# print(glove_vectors['<unk>'])

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])


In [44]:
# pretrained_embeddings[0]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])

In [57]:
from torchdata.datapipes.iter import FileOpener,IterableWrapper
url_dp = IterableWrapper([data_folder_path + 'train_cleaned.csv'])
data_dp = FileOpener(url_dp, mode="b")
print(data_dp[0])

NotImplementedError: 

In [56]:
class ToxicDataset(Dataset):



IndentationError: expected an indented block (139337640.py, line 3)

In [90]:
comment.shape

torch.Size([18, 200])

In [91]:
padded = pack_sequence(train["comment_text"])

RuntimeError: [enforce fail at alloc_cpu.cpp:66] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 631626054400 bytes. Error code 12 (Cannot allocate memory)