In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer


In [3]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('F:/notebook_working/kaggle_compe/train.csv').fillna('No data')
test = pd.read_csv('F:/notebook_working/kaggle_compe/test.csv').fillna('No data')

#train.id = train.id.astype('str')
#train.comment_text = train.comment_text.astype('str')

#test.id = test.id.astype('str')
#test.comment_text = test.comment_text.astype('str')




In [4]:
train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])


In [5]:
print train_text.shape
print test_text.shape
print train.info()
print test.info()
print all_text.shape
#print all_text

(159571,)
(153164,)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 8.5+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153164 entries, 0 to 153163
Data columns (total 2 columns):
id              153164 non-null object
comment_text    153164 non-null object
dtypes: object(2)
memory usage: 1.2+ MB
None
(312735,)


In [6]:
#dont use errors='ignore'
#all_text.comment_text = map(lambda x:unicode(x, 'utf-8'), all_text.comment_text)
all_text = map(lambda x:unicode(x, 'utf-8'), all_text)

In [7]:
#extracting the stopwords from nltk library
sw = stopwords.words('english')
# displaying the stopwords
np.array(sw)

array([u'i', u'me', u'my', u'myself', u'we', u'our', u'ours',
       u'ourselves', u'you', u"you're", u"you've", u"you'll", u"you'd",
       u'your', u'yours', u'yourself', u'yourselves', u'he', u'him',
       u'his', u'himself', u'she', u"she's", u'her', u'hers', u'herself',
       u'it', u"it's", u'its', u'itself', u'they', u'them', u'their',
       u'theirs', u'themselves', u'what', u'which', u'who', u'whom',
       u'this', u'that', u"that'll", u'these', u'those', u'am', u'is',
       u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has',
       u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an',
       u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until',
       u'while', u'of', u'at', u'by', u'for', u'with', u'about',
       u'against', u'between', u'into', u'through', u'during', u'before',
       u'after', u'above', u'below', u'to', u'from', u'up', u'down',
       u'in', u'out', u'on', u'off', u'over', u'under', u'again',
       u'further'

In [8]:
def stopwords(text):
    '''a function for removing the stopword'''
# removing the stop words and lowercasing the selected words
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    # joining the list of words with space separator
    return " ".join(text)

In [9]:
#####all_text['comment_text'] = all_text['comment_text'].apply(stopwords)
#all_text.head(10)
all_text = map(stopwords, all_text)
#all_text

In [10]:
PunctuationToRemove = [".", ",", ":", ";", "!" ,"?", "&", "\"", "\'", "~", "\\"]

In [11]:
def remove_punctuation(text):
    '''a function for removing punctuation'''
    for char in PunctuationToRemove:
        text = text.replace(char,"")
    return text

In [12]:
#all_text['comment_text'] = all_text['comment_text'].apply(remove_punctuation)
#all_text.head(10)
all_text = map(remove_punctuation, all_text)
#all_text

#separate best words

In [13]:
tox_train = train.loc[train.toxic==1, ]
sev_train = train.loc[train.severe_toxic==1, ]
obs_train = train.loc[train.obscene==1, ]
threat_train = train.loc[train.threat==1, ]
ins_train = train.loc[train.insult==1, ]
ident_train = train.loc[train.identity_hate==1, ]


In [14]:
tox_text = map(lambda x:unicode(x, 'utf-8'), tox_train.comment_text)
tox_text = map(stopwords, tox_text)
tox_text = map(remove_punctuation, tox_text)

In [15]:
#for toxic
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    encoding='utf-8',
    lowercase=True,
    min_df=0.0001,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    #use_idf=1, smooth_idf=1,
    max_features=None)

word_vectorizer.fit(tox_text)
tox_word_features = word_vectorizer.transform(tox_text)

msg = "There are {} tokens in Comment_text if we use word"
print(msg.format(len(word_vectorizer.get_feature_names())))
tox_words = word_vectorizer.get_feature_names()[541:]
#tox_words

There are 14867 tokens in Comment_text if we use word


In [21]:
sev_text = map(lambda x:unicode(x, 'utf-8'), sev_train.comment_text)
sev_text = map(stopwords, sev_text)
sev_text = map(remove_punctuation, sev_text)

In [22]:
#for sev_toxic
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    encoding='utf-8',
    lowercase=True,
    min_df=0.001,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    #use_idf=1, smooth_idf=1,
    max_features=None)

word_vectorizer.fit(sev_text)
sev_word_features = word_vectorizer.transform(sev_text)

msg = "There are {} tokens in Comment_text if we use word"
print(msg.format(len(word_vectorizer.get_feature_names())))
sev_words = word_vectorizer.get_feature_names()
#sev_words

There are 1993 tokens in Comment_text if we use word


In [23]:
obs_text = map(lambda x:unicode(x, 'utf-8'), obs_train.comment_text)
obs_text = map(stopwords, obs_text)
obs_text = map(remove_punctuation, obs_text)

In [24]:
#for obs
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    encoding='utf-8',
    lowercase=True,
    min_df=0.001,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    #use_idf=1, smooth_idf=1,
    max_features=None)

word_vectorizer.fit(obs_text)
obs_word_features = word_vectorizer.transform(obs_text)

msg = "There are {} tokens in Comment_text if we use word"
print(msg.format(len(word_vectorizer.get_feature_names())))
obs_words = word_vectorizer.get_feature_names()

There are 2159 tokens in Comment_text if we use word


In [25]:
threat_text = map(lambda x:unicode(x, 'utf-8'), threat_train.comment_text)
threat_text = map(stopwords, threat_text)
threat_text = map(remove_punctuation, threat_text)

In [26]:
#for threat
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    encoding='utf-8',
    lowercase=True,
    min_df=0.0001,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    #use_idf=1, smooth_idf=1,
    max_features=None)

word_vectorizer.fit(threat_text)
threat_word_features = word_vectorizer.transform(threat_text)

msg = "There are {} tokens in Comment_text if we use word"
print(msg.format(len(word_vectorizer.get_feature_names())))
threat_words = word_vectorizer.get_feature_names()[69:]
#threat_words

There are 2838 tokens in Comment_text if we use word


In [124]:
#threat_words

In [37]:
ins_text = map(lambda x:unicode(x, 'utf-8'), ins_train.comment_text)
ins_text = map(stopwords, ins_text)
ins_text = map(remove_punctuation, ins_text)

In [38]:
#for ins
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    encoding='utf-8',
    lowercase=True,
    min_df=0.0003,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    #use_idf=1, smooth_idf=1,
    max_features=None)

word_vectorizer.fit(ins_text)
#tox_word_features = word_vectorizer.transform(tox_train)

msg = "There are {} tokens in Comment_text if we use word"
print(msg.format(len(word_vectorizer.get_feature_names())))
ins_words = word_vectorizer.get_feature_names()
#ins_words

There are 5623 tokens in Comment_text if we use word


In [139]:
#ins_words

In [33]:
ident_text = map(lambda x:unicode(x, 'utf-8'), ident_train.comment_text)
ident_text = map(stopwords, ident_text)
ident_text = map(remove_punctuation, ident_text)

In [34]:
#for identity
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    encoding='utf-8',
    lowercase=True,
    min_df=0.0001,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    #use_idf=1, smooth_idf=1,
    max_features=None)

word_vectorizer.fit(ident_text)
#tox_word_features = word_vectorizer.transform(tox_train)

msg = "There are {} tokens in Comment_text if we use word"
print(msg.format(len(word_vectorizer.get_feature_names())))
ident_words = word_vectorizer.get_feature_names()
#ident_words

There are 7437 tokens in Comment_text if we use word


In [146]:
#ident_words

In [35]:
test_text = map(lambda x:unicode(x, 'utf-8'), test_text)
test_text = map(stopwords, test_text)
test_text = map(remove_punctuation, test_text)

In [36]:
#for test
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    encoding='utf-8',
    lowercase=True,
    min_df=0.0003,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    #use_idf=1, smooth_idf=1,
    max_features=None)

word_vectorizer.fit(test_text)
#tox_word_features = word_vectorizer.transform(tox_train)

msg = "There are {} tokens in Comment_text if we use word"
print(msg.format(len(word_vectorizer.get_feature_names())))
test_words = word_vectorizer.get_feature_names()
#test_words

There are 7870 tokens in Comment_text if we use word


In [39]:
imp_words=[]
for  word in tox_words:
    imp_words.append(word)
    
for  word in sev_words:
    imp_words.append(word)
    
for  word in obs_words:
    imp_words.append(word)

for  word in threat_words:
    imp_words.append(word)
    
for  word in ins_words:
    imp_words.append(word)
    
for  word in ident_words:
    imp_words.append(word)

for  word in test_words:
    imp_words.append(word)

imp_words_set=set(imp_words)
imp_words = list(imp_words_set)
len(imp_words)

17979

In [41]:

def impwords(text):
    '''a function for selecting important words'''

    text = [word for word in text.split() if word in imp_words]
        # joining the list of words with space separator
    return " ".join(text)
 #   

In [42]:
all_text = map(impwords, all_text)

In [45]:
#imp_text = pd.DataFrame.from_dict({'comment': all_text})


In [50]:
#imp_text.head()

In [49]:
#imp_text.to_csv('imptext.csv', index=False)

In [51]:
#whole data
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    encoding='utf-8',
    lowercase=True,
    #min_df=0.00001,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    #use_idf=1, smooth_idf=1,
    max_features=None)

word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

msg = "There are {} tokens in Comment_text if we use word"
print(msg.format(len(word_vectorizer.get_feature_names())))


There are 17765 tokens in Comment_text if we use word


In [52]:
#train_features = hstack([train_char_features, train_word_features])
#test_features = hstack([test_char_features, test_word_features])
#X_train, X_valid, y_train, y_valid = train_test_split(train_features, train_target, test_size=0.3, random_state=42)


scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression()

    cv_score = np.mean(cross_val_score(classifier, train_word_features, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_word_features, train_target)
    #submission[class_name] = classifier.predict_proba(test_word_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

#submission.to_csv('F:\submission.csv', index=False)

CV score for class toxic is 0.969070410186
CV score for class severe_toxic is 0.98554066953
CV score for class obscene is 0.98544702436
CV score for class threat is 0.978086534108
CV score for class insult is 0.976024437433
CV score for class identity_hate is 0.971386806749
Total CV score is 0.977592647061


In [53]:
#train_features = hstack([train_char_features, train_word_features])
#test_features = hstack([test_char_features, test_word_features])


scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    X_train, X_valid, y_train, y_valid = train_test_split(train_word_features, train_target, test_size=0.3, random_state=42)

    classifier = LogisticRegression()

    cv_score = np.mean(cross_val_score(classifier, X_valid, y_valid, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(X_train, y_train)
    #submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

#submission.to_csv('submission.csv', index=False)

CV score for class toxic is 0.958367413029
CV score for class severe_toxic is 0.980475134741
CV score for class obscene is 0.980033299749
CV score for class threat is 0.969080010509
CV score for class insult is 0.968672055661
CV score for class identity_hate is 0.962001265406
Total CV score is 0.969771529849


In [9]:
submission[class_name] = classifier.predict_proba(test_features)[:, 1]
submission.to_csv('submission.csv', index=False)