In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack

In [8]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
sub = pd.read_csv("sample_submission.csv")
test_labels = pd.read_csv("test_labels.csv")

In [9]:
test_labels

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1
5,0001ea8717f6de06,0,0,0,0,0,0
6,00024115d4cbde0f,-1,-1,-1,-1,-1,-1
7,000247e83dcc1211,0,0,0,0,0,0
8,00025358d4737918,-1,-1,-1,-1,-1,-1
9,00026d1092fe71cc,-1,-1,-1,-1,-1,-1


In [17]:
test_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [18]:
test_classes

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [19]:
train_data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [22]:
train_text = train_data["comment_text"]
test_text = test_data["comment_text"]



In [24]:
text = pd.concat([train_text, test_text])

In [25]:
text

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
5         "\n\nCongratulations from me as well, use the ...
6              COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK
7         Your vandalism to the Matt Shirvington article...
8         Sorry if the word 'nonsense' was offensive to ...
9         alignment on this subject and which are contra...
10        "\nFair use rationale for Image:Wonju.jpg\n\nT...
11        bbq \n\nbe a man and lets discuss it-maybe ove...
12        Hey... what is it..\n@ | talk .\nWhat is it......
13        Before you start throwing accusations and warn...
14        Oh, and the girl above started her arguments w...
15        "\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...
16        Bye! \n\nDon't look, come or t

In [41]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(text)
train_word_features = word_vectorizer.transform(train_data)
test_word_features = word_vectorizer.transform(test_data)

In [42]:
word_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [52]:
word_vec = CountVectorizer(stop_words="english", analyzer='word', 
                            ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None)

In [53]:
words_train = word_vec.fit_transform(text)
words_test = word_vec.transform(text)

In [54]:
word_vec.get_feature_names()

[u'00',
 u'000',
 u'0000',
 u'00000',
 u'000000',
 u'0000000',
 u'00000000',
 u'000000000000',
 u'000000000000000',
 u'0000000000000000000000000000',
 u'000000000000000000000000000000000fdgkja',
 u'00000000000111265005605361866087675053350036566001020343907867982125026173889636993408203125',
 u'000000001',
 u'0000000027',
 u'00000001',
 u'00000003',
 u'00000010',
 u'00000020',
 u'00000030',
 u'00000040',
 u'00000050',
 u'000001',
 u'00000102',
 u'0000012',
 u'0000015',
 u'000002',
 u'000002000004000008000016',
 u'000002974001',
 u'000002e4',
 u'000002e5',
 u'0000030',
 u'0000030422',
 u'00000326',
 u'0000035',
 u'0000045',
 u'000005',
 u'00000aab0f6c',
 u'00001',
 u'0000104',
 u'000013',
 u'000014',
 u'000015',
 u'000016',
 u'000018',
 u'000019',
 u'000020',
 u'000022',
 u'000023405011',
 u'000023424',
 u'000024',
 u'0000253111',
 u'0000297',
 u'000030',
 u'000031',
 u'000033',
 u'00004',
 u'000040',
 u'000045',
 u'000046',
 u'000050',
 u'000057',
 u'000060',
 u'000069',
 u'000070',
 u

In [None]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)