In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import nltk

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

In [46]:
# Read both CSV files from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Pull out the raw comment text of each split as a Series.
train_comments = train_df.loc[:, 'comment_text']
test_comments = test_df.loc[:, 'comment_text']

# Train comments followed by test comments, stacked into one Series.
all_comments = pd.concat([train_comments, test_comments], axis=0)

# Preview the first three training rows.
train_df.head(n=3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0


In [47]:
# Label columns that follow comment_text in the train_df preview.
labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [49]:
# Word-level TF-IDF vectorizer with a vocabulary capped at 10,000 features.
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 1),        # unigrams only
    analyzer='word',
    token_pattern=r'\w{1,}',   # one or more word characters, so 1-char tokens are kept
    strip_accents='unicode',
    sublinear_tf=True,         # scale term frequency as 1 + log(tf)
)

In [50]:
# Fit the vocabulary and IDF weights on the training comments only.
# fit() returns the vectorizer itself, so `tfidf` and `vectorizer` name the same object.
tfidf = vectorizer.fit(raw_documents=train_comments)

In [51]:
# Project the test comments onto the vocabulary fitted on the training comments.
test_comment_features = vectorizer.transform(raw_documents=test_comments)

In [57]:
# Display the test comment Series; the recorded output is a truncated view of its 153164 rows.
test_comments

0         Yo bitch Ja Rule is more succesful then you'll...
1         == From RfC == \n\n The title is fine as it is...
2         " \n\n == Sources == \n\n * Zawe Ashton on Lap...
3         :If you have a look back at the source, the in...
4                 I don't anonymously edit articles at all.
                                ...                        
153159    . \n i totally agree, this stuff is nothing bu...
153160    == Throw from out field to home plate. == \n\n...
153161    " \n\n == Okinotorishima categories == \n\n I ...
153162    " \n\n == ""One of the founding nations of the...
153163    " \n :::Stop already. Your bullshit is not wel...
Name: comment_text, Length: 153164, dtype: object

In [52]:
# Check the container type of the test comments (a pandas Series in the recorded output).
type(test_comments)

pandas.core.series.Series

In [56]:
# Preview a handful of (term, column index) pairs. Printing the whole mapping
# would emit every vocabulary entry (up to max_features=10000) into the notebook.
list(vectorizer.vocabulary_.items())[:10]



In [54]:
# Print the sparse test matrix as "(row, column)  weight" lines for its stored entries.
# NOTE(review): this listing is long; the matrix shape or a slice of a few rows
# would show the same information more compactly.
print(test_comment_features)

  (0, 9970)	0.10452557270364031
  (0, 9966)	0.1663649791113118
  (0, 9923)	0.1046611591824922
  (0, 9853)	0.05592079713991835
  (0, 9769)	0.22877832958370248
  (0, 9751)	0.1706980940964145
  (0, 9685)	0.06076819544736872
  (0, 9433)	0.1627852406319036
  (0, 9415)	0.07864945871408828
  (0, 9095)	0.09510046320845846
  (0, 9071)	0.034952888792526664
  (0, 9048)	0.08115362451332188
  (0, 8990)	0.10253541903129552
  (0, 8966)	0.0813617318660931
  (0, 8951)	0.04303781801309164
  (0, 8267)	0.19908736521161588
  (0, 8158)	0.07259644955760465
  (0, 8138)	0.12988556573044485
  (0, 7850)	0.15049701607500982
  (0, 7822)	0.2193280608376796
  (0, 7726)	0.09429386230593405
  (0, 6979)	0.18197413618951022
  (0, 6342)	0.049849795860249645
  (0, 6110)	0.11993235979970561
  (0, 5996)	0.0627629601415411
  :	:
  (153162, 589)	0.18580372954344918
  (153162, 522)	0.09162839599657487
  (153162, 418)	0.05858083662245296
  (153163, 9974)	0.13071983875659693
  (153163, 9970)	0.15240896573852775
  (153163, 9738)	

In [102]:
# Short sample sentence used to inspect a single transformed row.
st = "explanation why the edits removing"

In [103]:
# Vectorize the single sample sentence; transform expects an iterable of documents,
# so the string is wrapped in a one-element list.
str_vector = vectorizer.transform(raw_documents=[st])

In [105]:
# Check what transform returned (a SciPy CSR sparse matrix in the recorded output).
type(str_vector)

scipy.sparse.csr.csr_matrix

In [106]:
# Check the type after densifying (a numpy.ndarray in the recorded output).
type(str_vector.toarray())

numpy.ndarray

In [107]:
# Print the stored entries of the one-row matrix as "(row, column)  weight" lines.
print(str_vector)

  (0, 9783)	0.36037572858465816
  (0, 8954)	0.14651983710955452
  (0, 7522)	0.5558352083785455
  (0, 3517)	0.5989653182553812
  (0, 3184)	0.4253810529801195


In [104]:
# Read the weight in column 7522 of the only row; 7522 is one of the column
# indices listed when str_vector is printed.
str_vector.toarray()[0][7522]

0.5558352083785455

In [94]:
# Look up the feature column index assigned to the term 'explanation'.
vectorizer.vocabulary_.get('explanation')

3517

In [89]:
# List every (term, column index) pair, sorted by term.
# NOTE(review): this emits the entire vocabulary (max_features=10000); a slice
# such as [:20] would keep the notebook output readable.
sorted(vectorizer.vocabulary_.items())

[('0', 0),
 ('00', 1),
 ('000', 2),
 ('000000', 3),
 ('01', 4),
 ('02', 5),
 ('03', 6),
 ('04', 7),
 ('05', 8),
 ('06', 9),
 ('07', 10),
 ('08', 11),
 ('084080', 12),
 ('09', 13),
 ('0px', 14),
 ('1', 15),
 ('10', 16),
 ('100', 17),
 ('1000', 18),
 ('101', 19),
 ('102', 20),
 ('103', 21),
 ('104', 22),
 ('105', 23),
 ('106', 24),
 ('107', 25),
 ('108', 26),
 ('109', 27),
 ('10th', 28),
 ('11', 29),
 ('110', 30),
 ('111', 31),
 ('112', 32),
 ('113', 33),
 ('114', 34),
 ('115', 35),
 ('116', 36),
 ('117', 37),
 ('118', 38),
 ('119', 39),
 ('11th', 40),
 ('12', 41),
 ('120', 42),
 ('121', 43),
 ('122', 44),
 ('123', 45),
 ('124', 46),
 ('125', 47),
 ('126', 48),
 ('127', 49),
 ('128', 50),
 ('129', 51),
 ('12th', 52),
 ('13', 53),
 ('130', 54),
 ('131', 55),
 ('132', 56),
 ('133', 57),
 ('134', 58),
 ('135', 59),
 ('136', 60),
 ('137', 61),
 ('138', 62),
 ('139', 63),
 ('13th', 64),
 ('14', 65),
 ('140', 66),
 ('141', 67),
 ('142', 68),
 ('143', 69),
 ('144', 70),
 ('145', 71),
 ('146', 7

In [38]:
import pickle

# Persist the fitted vectorizer. The context manager flushes and closes the
# file handle even if pickling raises; a bare open(...) passed to pickle.dump
# is never closed explicitly.
with open("comment_tfidf.pickle", "wb") as tfidf_file:
    pickle.dump(tfidf, tfidf_file)
# pickle.dump(train_comment_features, open("train_comment_features.pickle", "wb"))
# pickle.dump(test_comment_features, open("test_comment_features.pickle", "wb"))

In [39]:
# Reload the vectorizer from comment_tfidf.pickle, closing the file afterwards.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('comment_tfidf.pickle', 'rb') as tfidf_file:
    vect2 = pickle.load(tfidf_file)

In [40]:
# Display the reloaded vectorizer to inspect its parameters after unpickling.
vect2

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=10000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents='unicode',
                sublinear_tf=True, token_pattern='\\w{1,}', tokenizer=None,
                use_idf=True, vocabulary=None)

In [41]:
# One sample document mixing Korean tokens with the English word "best".
data = [
    '한글 best 최고 안녕',
]

In [42]:
# Vectorize the sample with the reloaded vectorizer, then show the matrix summary.
data_result = vect2.transform(raw_documents=data)
data_result

<1x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [43]:
# Convert the sparse result to a dense array for display.
data_result.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [44]:
import joblib

# Load the object stored in the file 'model' in the working directory
# (presumably a trained classifier, since predict() is called on it afterwards).
# NOTE(review): the recorded run raised FileNotFoundError here, and no visible
# cell writes this file. Confirm where 'model' is produced.
model = joblib.load('model')

FileNotFoundError: [Errno 2] No such file or directory: 'model'

In [None]:
 # Run the loaded model on the vectorized sample and display the result.
 # NOTE(review): the recorded output is an AttributeError, and the preceding
 # model load failed, so this cell cannot work until `model` loads.
 predict_result = model.predict(data_result)
 predict_result

AttributeError: ignored