In [41]:
import numpy as np
import pandas as pd
import string
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
import spacy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier


# Read data
train_data = pd.read_csv("train.csv", sep = '\t')
test_data = pd.read_csv("test.csv", sep = '\t')
test_label = pd.read_csv("sample_submission.csv", sep = ',')

In [43]:
# Data Preprocessing
def remove_punctuation(text):
    text = "".join([word for word in text if word not in string.punctuation])
    return text

def scrub_text(text):
    # To lower case
    text = text.lower()
    
    # Remove html markup
    text = re.sub("(<.*?>)", "", text)
    
    # Remove non-ascii and digits
    text = re.sub("(\\W|\\d)", " ", text)
    
    # Remove whitespace
    text = text.strip()
    
    # Remove punctuation
    text = remove_punctuation(text)
    
    return text

train_data.text = train_data.text.apply(lambda x: scrub_words(x))
test_data.text = test_data.text.apply(lambda x: scrub_words(x))

In [46]:
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
porter_stemmer = PorterStemmer()

def stem_text(text):
    # Tokenize
    text = tokenizer.tokenize(text)
    
    # Stemming
    text = " ".join([porter_stemmer.stem(word = word) for word in text])
    
    return text

train_data.text = train_data.text.apply(lambda x: stem_text(x))
test_data.text = test_data.text.apply(lambda x: stem_text(x))

In [48]:
# Import Spacy Stopwords
# sp = spacy.load('en_core_web_sm')
# spacy_stopwords = sp.Defaults.stop_words
spacy_stopwords = ['few', 'third', 'other', 'among', 're', 'really', 'four', 'hence', 'whatever', 'already', 'quite', 'ours', 'even', 'than', 'about', 'mine', 'he', 'through', 'are', 'sometime', 'full', 'with', 'besides', 'becomes', 'rather', 'per', 'on', 'used', 'anyway', 'his', 'after', 'everywhere', 'whence', 'see', 'next', 'just', 'though', 'yet', 'nine', 'most', 'made', 'never', 'out', 'themselves', 'thereafter', 'eleven', 'moreover', 'please', 'its', 'fifty', 'himself', 'myself', 'so', 'an', 'keep', 'could', '‘d', 'using', 'wherever', 'around', 'once', 'in', 'unless', 'somehow', 'whereupon', 'cannot', 'namely', 'behind', 'to', 'last', 'however', 'me', 'beyond', 'of', 'were', 'being', 'put', 'or', 'because', 'another', 'herself', 'same', 'since', 'become', 'nowhere', 'they', 'my', 'whither', 'anyone', 'why', 'those', 'therefore', 'onto', 'doing', 'perhaps', 'there', 'i', 'amongst', 'sixty', 'while', 'up', 'although', 'get', 'she', 'these', 'latterly', 'no', 'nor', 'mostly', '’d', 'had', 'should', 'whenever', 'go', 'six', 'where', 'whom', 'became', 'again', 'until', 'under', 'yourselves', 'your', 'has', 'twenty', 'hereby', 'elsewhere', 'done', 'beside', 'you', 'not', 'below', 'along', 'make', 'but', 'someone', 'her', 'still', 'their', 'first', 'ever', 'forty', 'will', 'herein', '‘ve', 'would', 'many', 'thence', 'indeed', 'which', 'enough', 'nevertheless', 'towards', 'whereas', 'several', 'for', 'itself', 'noone', '’re', 'empty', 'our', 'ca', 'all', 'latter', '’s', 'such', 'thru', "'ve", 'and', 'been', 'upon', 'seemed', 'before', 'above', 'by', 'one', 'various', 'hundred', 'less', 'them', 'amount', 'neither', 'might', 'us', 'say', 'toward', 'some', 'hers', 'now', 'can', 'only', '’m', 'own', '‘re', 'be', 'least', 'thereupon', 'well', 'throughout', 'often', 'always', "n't", 'ten', 'am', 'ourselves', 'fifteen', 'is', 'formerly', 'name', 'then', 'alone', 'seems', 'whereby', 'nothing', 'across', 'via', '‘ll', '‘m', 'the', "'m", 'as', 'n‘t', 'we', 'call', 'except', 'every', 'whereafter', 'further', 'five', 'former', 'any', 'against', 'serious', 'without', 'hereafter', 'each', 'anywhere', "'re", 'do', 'did', 'else', 'down', 'almost', 'due', 'also', 'at', 'wherein', 'from', 'anyhow', 'afterwards', 'twelve', 'otherwise', "'s", 'too', 'give', 'something', '’ve', 'two', 'side', 'within', 'him', 'front', 'hereupon', 'here', 'thereby', 'may', 'this', 'seem', 'top', 'becoming', 'beforehand', 'either', "'d", 'during', 'a', 'when', 'off', 'thus', 'move', '’ll', 'whose', 'back', 'was', 'everything', 'whole', 'everyone', 'show', 'between', 'it', 'anything', 'that', 'bottom', 'part', 'over', 'somewhere', 'more', 'whoever', 'n’t', 'into', 'take', 'who', 'whether', 'does', 'seeming', 'both', 'none', 'much', 'yourself', 'meanwhile', 'regarding', 'must', '‘s', 'have', 'if', "'ll", 'therein', 'very', 'what', 'how', 'others', 'yours', 'nobody', 'eight', 'three', 'together', 'sometimes']

In [54]:
# Word to TfidfVec
tv = TfidfVectorizer(stop_words = spacy_stopwords, token_pattern = "(?u)\\b\\w+\\b", smooth_idf = True, max_features = 10000)
x_train = tv.fit_transform(train_data.text).toarray()
x_test = tv.fit_transform(test_data.text).toarray()
y_train = train_data.label
y_test = test_label.label.astype(str)

In [55]:
# Evaluation Function
def Evaluation(y_test, preds):
    print("Confusion Matrix:")
    print(metrics.confusion_matrix(y_test, preds))
    print("Accuracy: %.5g" %(metrics.accuracy_score(y_test, preds)))
    print("Precision: %.5g" %(metrics.precision_score(y_test, preds, pos_label = '1')))
    print("Recall: %.5g" %(metrics.recall_score(y_test, preds, pos_label = '1')))
    print("F1 Score: %.5g" %(metrics.f1_score(y_test, preds, pos_label = '1')))

In [59]:
# XGBoost
XGBoost_model = XGBClassifier(n_estimators = 100, max_features = 100, max_depth = 5, learning_rate = 0.1)
XGBoost_model.fit(x_train, y_train)
preds = XGBoost_model.predict(x_test)
Evaluation(y_test, preds)

Confusion Matrix:
[[614  16]
 [602  15]]
Accuracy: 0.50441
Precision: 0.48387
Recall: 0.024311
F1 Score: 0.046296


In [60]:
# GBDT
GBDT_model = GradientBoostingClassifier(n_estimators = 100, max_features = 100, max_depth = 5, learning_rate = 0.1)
GBDT_model.fit(x_train, y_train)
preds = GBDT_model.predict(x_test)
Evaluation(y_test,preds)

Confusion Matrix:
[[601  29]
 [595  22]]
Accuracy: 0.4996
Precision: 0.43137
Recall: 0.035656
F1 Score: 0.065868


In [61]:
# LightGBM
LightGBM_model = LGBMClassifier(n_estimators = 100, max_features = 100, max_depth = 5, learning_rate = 0.1)
LightGBM_model.fit(x_train, y_train)
prediction = LightGBM_model.predict(x_test)
Evaluation(y_test,preds)

Confusion Matrix:
[[601  29]
 [595  22]]
Accuracy: 0.4996
Precision: 0.43137
Recall: 0.035656
F1 Score: 0.065868
