In [12]:
import numpy as np 
import pandas as pd 
import bz2
import gc
import chardet
import re
import os
# Show the entries of the ./input directory (path is relative to the working directory).
print(os.listdir("./input"))

['test.ft.txt.bz2', 'train.ft.txt.bz2']


In [23]:
# Read every line of both bz2-compressed files into memory as raw bytes
# (they are decoded later). The ``with`` blocks guarantee the underlying
# file handles are closed even if ``readlines`` raises; deleting the names
# alone does not promise that.
with bz2.BZ2File('./input/train.ft.txt.bz2') as train_file:
    train_file_lines = train_file.readlines()

with bz2.BZ2File('./input/test.ft.txt.bz2') as test_file:
    test_file_lines = test_file.readlines()

# The (now closed) file objects are not needed any more.
del train_file, test_file

### Parsing Text

In [24]:
# Patterns are compiled once instead of once per sentence.
_DIGIT_RE = re.compile(r'\d')  # raw string: '\d' in a plain literal is an invalid escape
_URL_RE = re.compile(r"([^ ]+(?<=\.[a-z]{3}))")
# A sentence is only scanned for URLs when it contains one of these markers.
_URL_MARKERS = ('www.', 'http:', 'https:', '.com')


def _parse_lines(raw_lines):
    """Turn raw ``b'<label> <text>\\n'`` lines into parallel label/sentence lists.

    Each line is decoded as UTF-8 and split at the first space. The label is
    0 when the first token is ``__label__1`` and 1 otherwise. The text is
    stripped of its trailing newline, lower-cased, has every digit replaced
    by ``0`` and, if it contains one of ``_URL_MARKERS``, has each match of
    ``_URL_RE`` replaced by ``<url>``.

    Args:
        raw_lines: iterable of ``bytes`` lines.

    Returns:
        ``(labels, sentences)`` -- two lists of equal length.

    Raises:
        ValueError: if a line contains no space separating label and text.
        UnicodeDecodeError: if a line is not valid UTF-8.
    """
    labels, sentences = [], []
    for raw in raw_lines:
        tag, text = raw.decode('utf-8').split(' ', 1)
        labels.append(0 if tag == '__label__1' else 1)
        # rstrip('\n') instead of [:-1]: a final line without a newline
        # must not lose its last real character.
        text = _DIGIT_RE.sub('0', text.rstrip('\n').lower())
        if any(marker in text for marker in _URL_MARKERS):
            text = _URL_RE.sub("<url>", text)
        sentences.append(text)
    return labels, sentences


train_labels, train_sentences = _parse_lines(train_file_lines)
test_labels, test_sentences = _parse_lines(test_file_lines)

# The raw byte lines are large and no longer needed; release them.
del train_file_lines, test_file_lines
gc.collect()

381

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score

In [16]:
def tokenize(text):
    """Return the tokens produced by a freshly built NLTK ``TweetTokenizer`` for ``text``."""
    return TweetTokenizer().tokenize(text)

# ``stem`` reads these two module-level names, which were not defined anywhere
# in the notebook, so any call raised NameError.
# NOTE(review): an English Snowball stemmer and CountVectorizer's default
# analyzer are assumed to be what was intended -- confirm.
stemmer = SnowballStemmer("english")
analyzer = CountVectorizer().build_analyzer()


def stem(doc):
    """Lazily yield the stem of every token that ``analyzer`` extracts from ``doc``.

    Args:
        doc: the document to analyse (passed unchanged to ``analyzer``).

    Returns:
        A generator of ``stemmer.stem(token)`` values, in token order.
    """
    return (stemmer.stem(w) for w in analyzer(doc))

# NLTK's English stop words, de-duplicated into a set.
en_stopwords = set(stopwords.words("english"))

# Unigram bag-of-words counts, tokenised by ``tokenize``.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    ngram_range=(1, 1),
    # scikit-learn documents ``stop_words`` as 'english', a list or None, and
    # recent releases reject a set when validating parameters at fit time.
    # A sorted list is accepted everywhere and keeps the order deterministic.
    stop_words=sorted(en_stopwords))

In [17]:
# Seed NumPy's global random generator for this process.
np.random.seed(1)

# Five shuffled, stratified folds with a fixed shuffle seed.
kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Bag-of-words counts feeding a class-balanced, linear-kernel SVM.
# The "svc__" prefix in the parameter grid below addresses the SVC step.
pipeline_svm = make_pipeline(
    vectorizer,
    SVC(kernel="linear", class_weight="balanced", probability=True),
)

# Search over the regularisation strength C only, scoring candidates by ROC AUC.
grid_svm = GridSearchCV(
    estimator=pipeline_svm,
    param_grid={'svc__C': [0.01, 0.1, 1]},
    scoring="roc_auc",
    cv=kfolds,
    n_jobs=-1,
    verbose=1,
)

In [18]:
X_train, X_cv, train_labels, cv_labels = train_test_split(train_sentences, train_labels, train_size=5000)
X_test, X_cv, test_labels, cv_labels = train_test_split(test_sentences, test_labels, train_size=1000)



In [19]:
grid_svm.fit(X_train, train_labels)
grid_svm.score(X_test, test_labels)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   42.5s finished


0.9027517573777841

In [20]:
# Parameter setting chosen by the most recent grid-search fit (only svc__C is searched).
grid_svm.best_params_

{'svc__C': 0.01}

In [21]:
# Mean cross-validated score (ROC AUC, per the grid's ``scoring``) of the chosen setting.
grid_svm.best_score_

0.9039966164957141

In [25]:
# Experiment 2: 10,000 training and 2,500 evaluation samples.
# NOTE(review): train_labels / test_labels must be as long as
# train_sentences / test_sentences here, i.e. still the full label lists.
# X_cv / cv_labels end up holding the remainder of the test-side split.
X_train, X_cv, y_train, cv_labels = train_test_split(
    train_sentences,
    train_labels,
    train_size=10000,
)
X_test, X_cv, y_test, cv_labels = train_test_split(
    test_sentences,
    test_labels,
    train_size=2500,
)



In [26]:
# Run the grid search on the training sample, then evaluate the refit best
# model on the held-out sample; ``fit`` returns the estimator, so the calls chain.
grid_svm.fit(X_train, y_train).score(X_test, y_test)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  2.3min finished


0.9359710807408257

In [27]:
# Parameter setting chosen by the most recent grid-search fit (only svc__C is searched).
grid_svm.best_params_

{'svc__C': 0.01}

In [28]:
# Mean cross-validated score (ROC AUC, per the grid's ``scoring``) of the chosen setting.
grid_svm.best_score_

0.9206001716612753

In [29]:
# Experiment 3: 20,000 training and 3,000 evaluation samples.
# NOTE(review): train_labels / test_labels must be as long as
# train_sentences / test_sentences here, i.e. still the full label lists.
# X_cv / cv_labels end up holding the remainder of the test-side split.
X_train, X_cv, y_train, cv_labels = train_test_split(
    train_sentences,
    train_labels,
    train_size=20000,
)
X_test, X_cv, y_test, cv_labels = train_test_split(
    test_sentences,
    test_labels,
    train_size=3000,
)



In [30]:
# Run the grid search on the training sample, then evaluate the refit best
# model on the held-out sample; ``fit`` returns the estimator, so the calls chain.
grid_svm.fit(X_train, y_train).score(X_test, y_test)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 16.0min finished


0.9278201557250672

In [31]:
# Parameter setting chosen by the most recent grid-search fit (only svc__C is searched).
grid_svm.best_params_

{'svc__C': 0.01}

In [32]:
# Mean cross-validated score (ROC AUC, per the grid's ``scoring``) of the chosen setting.
grid_svm.best_score_

0.930887632675267