In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# List the contents of the sarcasm dataset directory (IPython shell escape).
!ls ../input/sarcasm/

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from nltk.corpus import stopwords
# English stop-word list from NLTK's stopwords corpus.
stop_words = stopwords.words('english')

After importing and doing basic data exploration using "train_df.info()", we notice a few NULL entries in comments that can be dropped.


In [None]:
# Load the balanced sarcasm dataset and keep only the rows that actually have a comment.
train_df = (
    pd.read_csv('../input/sarcasm/train-balanced-sarcasm.csv')
      .dropna(subset=['comment'])
)

Split data (huge dataset, 1m rows) into train and val sets.

In [None]:
# 60% of the rows go to training, the rest to validation; seeded so the split is repeatable.
train_texts, valid_texts, ytrain, yvalid = train_test_split(
    train_df['comment'],
    train_df['label'],
    train_size=0.6,
    random_state=17,
)

Start with **data exploration** by looking at things like:
<br> - Length distribution of comments for each class
<br> - Word cloud of most common words for each class (not very useful)
<br> - Group by other categories (Subreddit, author,..) to get rankings and details sorted by 'sum', 'mean', etc.
<br> To get an idea whether other features will be helpful in classification, not just the comments.

In [None]:
# Distribution of log(1 + comment length) for each class.
# Series.hist returns the matplotlib Axes it drew on; passing that Axes to the second
# call overlays both classes on one plot and removes the need for `plt`, which this
# notebook never imports (the original `plt.legend()` raised NameError).
log_comment_len = train_df['comment'].str.len().apply(np.log1p)
ax = log_comment_len[train_df['label'] == 1].hist(label='sarcastic', alpha=.5)
log_comment_len[train_df['label'] == 0].hist(label='normal', alpha=.5, ax=ax)
ax.set_xlabel('log(1 + comment length in characters)')
ax.set_ylabel('number of comments')
ax.legend();

In [None]:
# View stats/details when grouped by other categories like subreddit:
# number of comments, mean of the label (share of sarcastic comments) and label sum
# (count of sarcastic comments) per subreddit.
# String aggregator names produce the same 'size' / 'mean' / 'sum' columns as the NumPy
# callables, without the FutureWarning recent pandas raises for np.mean / np.sum.
sub_df = train_df.groupby('subreddit')['label'].agg(['size', 'mean', 'sum'])
sub_df.sort_values(by='sum', ascending=False).head(10)

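Creating a log loss metric function. Log loss is the average negative log of the probability the model assigned to the true class, so confident wrong predictions are penalised heavily and lower is better. scikit-learn also provides this metric as `sklearn.metrics.log_loss`.
<br>Can also use Accuracy as a metric.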


In [None]:
# Create a multi-class log loss metric function
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric (lower is better).

    :param actual: True classes. Either a 1-D sequence of integer class indices
        (NumPy array, pandas Series or plain list) or an already one-hot encoded
        matrix with one row per sample and one column per class.
    :param predicted: Matrix with class predictions, one probability per class
    :param eps: Probabilities are clipped to [eps, 1 - eps] so log(0) never occurs
    :return: Mean negative log of the probability assigned to the true class
    """
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    # Convert 'actual' to a one-hot matrix if it holds class indices. A single
    # fancy-indexed assignment replaces a Python loop over every row.
    if actual.ndim == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual.astype(np.intp)] = 1
        actual = one_hot

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

Starting from basic models as a sanity check.
<br>First vectorize words in comments from the train/validation data (i.e. turn them into numbers).
<br>After which they are passed to a simple LR model and evaluated on logloss.
<br><br>*Term frequency–inverse document frequency measures 'importance':<br> it converts words into numbers that account for both how often a word appears in a comment and how rare it is across all comments.*

In [None]:
# Baseline: TF-IDF features -> Logistic Regression

# A heavier alternative configuration that works (almost) every time as a starting point.
# Still to check how it handles stop words / punctuation / stemming.
# tfv = TfidfVectorizer(min_df=3, max_features=None,
#                       strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
#                       ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
#                       stop_words='english')
tfv = TfidfVectorizer(ngram_range=(1, 2), max_features=50000, min_df=2)  # from the 'suggested answer'

# Learn the vocabulary and IDF weights from the training and validation comments together
# (semi-supervised: no labels involved), then vectorise each split.
tfv.fit([*train_texts, *valid_texts])
xtrain_tfv, xvalid_tfv = (tfv.transform(texts) for texts in (train_texts, valid_texts))

# Simple Logistic Regression on the TF-IDF features; fit() returns the estimator itself.
clf = LogisticRegression(C=1, n_jobs=4, solver='lbfgs', random_state=17, verbose=1)  # from the 'suggested answer'
predictions = clf.fit(xtrain_tfv, ytrain).predict_proba(xvalid_tfv)

print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

CountVectorizer is like tf-idf but without the idf portion that accounts for rarity.

In [None]:
# Baseline: raw n-gram counts -> Logistic Regression
ctv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

# Build the vocabulary from the training and validation comments together
# (semi-supervised: no labels involved), then vectorise each split.
ctv.fit([*train_texts, *valid_texts])
xtrain_ctv, xvalid_ctv = (ctv.transform(texts) for texts in (train_texts, valid_texts))

# Simple Logistic Regression on the counts; fit() returns the estimator itself.
clf = LogisticRegression(C=1.0)
predictions = clf.fit(xtrain_ctv, ytrain).predict_proba(xvalid_ctv)

print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

Now try Naive Bayes, which is a probabilistic model that accounts for probability of terms but not their order.
<br> A simplistic approach that takes the product of multiple likelihoods, e.g. p(Class 1) x p(Word 1 | Class 1) x p(Word 2 | Class 1) ...

In [None]:
# Baseline: Multinomial Naive Bayes on the TF-IDF features.
# fit() returns the fitted estimator, so construction and training are chained.
clf = MultinomialNB().fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

Now use Singular Value Decomposition (dimensionality reduction, too many features), 
<br>then use standard scaler and then pass it to a Support Vector Classifier model to train.

In [None]:
# Apply SVD, I chose 120 components. 120-200 components are good enough for SVM model.
# TruncatedSVD's default solver is randomized, so it is seeded (17, like the other cells)
# to make the reduced features identical across kernel restarts.
svd = decomposition.TruncatedSVD(n_components=120, random_state=17)
svd.fit(xtrain_tfv)                                     #SVD and Scale -> Fit on train, transform on train and val
xtrain_svd = svd.transform(xtrain_tfv)                  #Whereas TF-IDF or CountVectorizer -> Fit on BOTH together
xvalid_svd = svd.transform(xvalid_tfv)

# Scale the data obtained from SVD. The scaled copies get their own names so the
# unscaled SVD features stay available for reuse.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

In [None]:
# Fitting a simple SVM on the scaled SVD features.
# probability=True is required for predict_proba; the probability estimates are fitted
# with internal shuffling, so random_state is set to keep the log loss repeatable.
clf = SVC(C=1.0, probability=True, random_state=17)
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

Now we try Grid Search by initializing a pipeline and a parameter grid to iterate through,
<br> until we find the best set of hyperparameters that yields the *lowest* metric score.
<br><br> Can also look at Hyperopt for hyperparameter tuning

In [None]:
# Preparing for Grid Search: Scoring Function.
# greater_is_better=False makes the scorer return the NEGATED log loss, so the search
# (which always maximises) ends up picking the lowest log loss.
# NOTE(review): scikit-learn >= 1.4 deprecates needs_proba in favour of
# response_method='predict_proba'; switch if the environment is upgraded.
mll_scorer = metrics.make_scorer(multiclass_logloss, greater_is_better=False, needs_proba=True)

# Pipeline Preparation: Initialize model for 1st step in pipeline
nb_model = MultinomialNB()

# Create the pipeline, inserting steps inside
clf = pipeline.Pipeline([('nb', nb_model)])

# Initialize Parameter Grid ('<step name>__<parameter>' addresses a pipeline step's parameter)
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialize Grid Search Model.
# The `iid` argument was dropped: it was removed from GridSearchCV in scikit-learn 0.24
# and passing it raises TypeError there.
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                     verbose=10, n_jobs=-1, refit=True, cv=2)

# Actually run the search (the cell previously only constructed it) on the TF-IDF
# training features, then report the winner. best_score_ is the negated log loss.
model.fit(xtrain_tfv, ytrain)
print("Best score: %0.3f" % model.best_score_)
print("Best parameters: %r" % model.best_params_)

Next we try using pre-trained word vectors with a larger 'map space' compared to employing tf-idf or countvec.
<br> We need to first convert the text file of vectors to a dict, and then to two matrices that contain the key and weights respectively.
<br> Now match the words in our dataset to the key values in GloVe, and obtain a mapping of each comment's words.
<br> After vectorizing, convert to array and pass train data to an XGBoost model for fitting and evaluation.
<br><br> The GloVe vectors used here are of dimension N=300 (as seen from the filename '..300d'). There are smaller N values available.
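<br> GloVe vectors are not obtained through SVD: they are learned by fitting word vectors to the log of global word co-occurrence counts with a weighted least-squares objective, which is a related matrix-factorization idea.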

In [None]:
# Using pre trained word vectors eg GloVe, word2vec, fasttext

# load the GloVe vectors in a dictionary - preprocess text file into dict of word: vector
embeddings_index = {}
# `with` closes the file even if parsing fails; the encoding is explicit so the result
# does not depend on the platform's default locale.
with open('glove.840B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        # Split from the right into the token plus its 300 coefficients: a few tokens in
        # the 840B file contain spaces, which made a plain split() feed text into the
        # float conversion and raise ValueError.
        values = line.rstrip().rsplit(' ', 300)
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

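sent2vec turns a whole comment into one 300-dimensional vector: it sums the GloVe vectors of the comment's words (after lower-casing and dropping stop words and non-alphabetic tokens) and divides the sum by its length. The normalization gives every comment a unit-length vector, so long comments do not get larger feature values just because they contain more words.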

In [None]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    """Turn one piece of text into a single unit-length 300-dimensional vector.

    The text is lower-cased and tokenized; stop words and non-alphabetic tokens are
    dropped; the GloVe vectors of the remaining known words are summed and the sum is
    divided by its Euclidean norm.

    :param s: the text to embed (anything accepted by ``str()``)
    :return: array of length 300; all zeros when no token has an embedding
    """
    # str has no .decode() in Python 3 (the old call raised AttributeError on every
    # input); the text is already unicode.
    words = word_tokenize(str(s).lower())
    words = [w for w in words if w not in stop_words and w.isalpha()]

    # Explicit membership test instead of a bare `except:` that hid every error.
    vectors = [embeddings_index[w] for w in words if w in embeddings_index]
    if not vectors:
        return np.zeros(300)

    v = np.sum(vectors, axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Word vectors cancelled out exactly; avoid dividing by zero.
        return np.zeros(300)
    return v / norm

In [None]:
# Embed every training and validation comment with sent2vec and stack the resulting
# sentence vectors into one array per split (tqdm only adds a progress bar).
xtrain_glove = np.array([sent2vec(text) for text in tqdm(train_texts)])
xvalid_glove = np.array([sent2vec(text) for text in tqdm(valid_texts)])

In [None]:
# Fitting a simple xgboost on glove features.
# n_jobs / verbosity replace the deprecated nthread / silent keywords, and random_state
# pins the row (subsample) and column (colsample_bytree) sampling so runs are repeatable.
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                        subsample=0.8, n_jobs=10, learning_rate=0.1,
                        verbosity=1, random_state=17)

clf.fit(xtrain_glove, ytrain)
predictions = clf.predict_proba(xvalid_glove)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

**Improving The Model**
<br>Can add other features by fitting/transforming on a separate vectorizer from the comments, then stack the features.

In [None]:
# Improving the model by adding additional features like subreddit, author, etc
from scipy.sparse import hstack

subreddits = train_df['subreddit']
# The subreddit split must mirror the comment split exactly: same train_size and same
# random_state on the same rows. With the default train_size the two feature matrices
# had different row counts and hstack failed.
train_subreddits, valid_subreddits = train_test_split(subreddits, train_size=0.6, random_state=17)
assert train_subreddits.index.equals(train_texts.index), "subreddit/comment train rows differ"
assert valid_subreddits.index.equals(valid_texts.index), "subreddit/comment valid rows differ"

# Initialize 2 different vectorizer instances
tf_idf_texts = TfidfVectorizer(ngram_range=(1, 2), max_features=50000, min_df=2)
tf_idf_subreddits = TfidfVectorizer(ngram_range=(1, 1))

# Use the 2 to fit/transform their corresponding data (fit on train only)
X_train_texts = tf_idf_texts.fit_transform(train_texts)
X_valid_texts = tf_idf_texts.transform(valid_texts)

X_train_subreddits = tf_idf_subreddits.fit_transform(train_subreddits)
X_valid_subreddits = tf_idf_subreddits.transform(valid_subreddits)

# Finally, stack the vectorized data column-wise before passing to a model
X_train = hstack([X_train_texts, X_train_subreddits])
X_valid = hstack([X_valid_texts, X_valid_subreddits])

Here we try out eli5 to display the ranking of weights in our trained model.
<br> It will show at the top the most important word used in classification.

In [None]:
import eli5

# tf-idf vectorize preprocessing step. The vectorizer has to be fitted on the texts the
# model is trained on, otherwise eli5 cannot map weights back to feature names.
# NOTE(review): the original cell used train_X / train_y without defining them; they are
# assumed to be the vectorised training comments and their labels - confirm.
tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
train_X = tfidf_vec.fit_transform(train_texts)
train_y = ytrain

# initialize and fit/train a model. `linear_model` is never imported in this notebook,
# so the already-imported LogisticRegression class is used; 'sag' is a stochastic
# solver, hence the seed.
model = LogisticRegression(C=5., solver='sag', random_state=17)
model.fit(train_X, train_y)

# Alternative when the vectorizer and the model live in a fitted Pipeline named
# tfidf_logit_pipeline with steps 'tf_idf' and 'logit' (no such pipeline is built in
# this notebook, so it is kept as a reference only):
# eli5.show_weights(estimator=tfidf_logit_pipeline.named_steps['logit'],
#                   vec=tfidf_logit_pipeline.named_steps['tf_idf'])

# the important words used for classifying, top features by weight rank
eli5.show_weights(model, vec=tfidf_vec, top=10, feature_filter=lambda x: x != '<BIAS>')