In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import nltk

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

In [4]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

train_comments = train_df['comment_text']
test_comments = test_df['comment_text']

all_comments = pd.concat([train_comments, test_comments])

train_df.head(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0


In [5]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
vectorizer = TfidfVectorizer(
    analyzer='word', 
    sublinear_tf=True,
    strip_accents='unicode',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=10000)

In [8]:
print('Start Fit vectorizer')

tfidf = vectorizer.fit(train_comments)

print('Fit vectorizer')

Start Fit vectorizer
Fit vectorizer


In [9]:
print('Start transform test comments')

test_comment_features = vectorizer.transform(test_comments)

print('Transformed test comments')

Start transform test comments
Transformed test comments


In [10]:
test_comment_features

<153164x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 5622907 stored elements in Compressed Sparse Row format>

In [11]:
test_comment_features[0]

<1x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 54 stored elements in Compressed Sparse Row format>

In [12]:
print('Start transform train comments')

train_comment_features = vectorizer.transform(train_comments)

print('Transformed train comments')

Start transform train comments
Transformed train comments


In [13]:
type(test_comments)

pandas.core.series.Series

In [14]:
import pickle

pickle.dump(tfidf, open("comment_tfidf.pickle", "wb"))
pickle.dump(train_comment_features, open("train_comment_features.pickle", "wb"))
pickle.dump(test_comment_features, open("test_comment_features.pickle", "wb"))

In [15]:
vectorizer2 = TfidfVectorizer(
    analyzer='word', 
    sublinear_tf=True,
    strip_accents='unicode',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=10000)

In [16]:
with open('comment_tfidf.pickle', 'wb') as fin:
  pickle.dump(vectorizer2, fin)

In [17]:
data = ['한글', '영문', 'good', 'smile', 'best']

In [18]:
data_result = vectorizer2.transform(data)
data_result

NotFittedError: TfidfVectorizer - Vocabulary wasn't fitted.

In [None]:
data_result.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])