In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
import nltk
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import numpy as np
import gensim
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
from tqdm.auto import tqdm
tqdm.pandas()

In [3]:
#loading data
data = pd.read_csv("../data/train.csv")

In [4]:
EMBEDDINGS = '../data/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
embeddings_index = KeyedVectors.load_word2vec_format(EMBEDDINGS, binary=True)

In [5]:
import re
import operator

def build_dictionary(questions):
    d = {}
    for sentence in tqdm(questions):
        for word in sentence:
            try:
                d[word] += 1
            except KeyError:
                d[word] = 1
    return d

def check_coverage(vocab,embeddings_index):
    a = {}
    oov = {}
    covered_word_count = 0
    oov_word_count = 0
    for word in tqdm(vocab):
        try:
            a[word] = embeddings_index[word]
            covered_word_count += vocab[word]
        except:

            oov[word] = vocab[word]
            oov_word_count += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(covered_word_count / (covered_word_count + oov_word_count)))
    return sorted(oov.items(), key=operator.itemgetter(1))[::-1]

def clean_text(x):
    x = str(x)
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, f' {punct} ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
    return x

def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium'}

mispellings, mispellings_re = _get_mispell(mispell_dict)

def correct_mispelling(text):
    def replace(match):
        return mispellings[match.group(0)]

    return mispellings_re.sub(replace, text)

to_remove = ['a','to','of','and']
def remove_stop_words(x):
    return [word for word in x if word not in to_remove]

In [6]:
def print_datasets_info(df):
    print("{} rows loaded...".format(df.shape[0]))
    print("columns are: {}".format(list(df.columns)))
    print("{0:.2f}% of sincere questions".format(len(df[df['target'] == 0])*100/df.shape[0]))
    print("{0:.2f}% of insincere questions".format(len(df[df['target'] == 1])*100/df.shape[0]))
print_datasets_info(data)

1306122 rows loaded...
columns are: ['qid', 'question_text', 'target']
93.81% of sincere questions
6.19% of insincere questions


In [7]:
reducted_data = data.sample(frac=0.01)
print_datasets_info(reducted_data)

13061 rows loaded...
columns are: ['qid', 'question_text', 'target']
93.93% of sincere questions
6.07% of insincere questions


In [8]:
#dictionary and text coverage with cleaned text
print("Cleaning and tokenizing questions...")
cleaned_questions = reducted_data['question_text']\
    .progress_apply(lambda x: clean_text(x))\
    .progress_apply(lambda x: clean_numbers(x))\
    .progress_apply(lambda x: correct_mispelling(x))\
    .progress_apply(lambda x: word_tokenize(x))
print("Removing stop words...")
cleaned_questions = [remove_stop_words(sentence) for sentence in tqdm(cleaned_questions)]
print("Building dictionary...")
dictionary = build_dictionary(cleaned_questions)
print("Checking coverage...")
out_of_dict = check_coverage(dictionary, embeddings_index)

Cleaning and tokenizing questions...


HBox(children=(IntProgress(value=0, max=13061), HTML(value='')))




HBox(children=(IntProgress(value=0, max=13061), HTML(value='')))




HBox(children=(IntProgress(value=0, max=13061), HTML(value='')))




HBox(children=(IntProgress(value=0, max=13061), HTML(value='')))


Removing stop words...


HBox(children=(IntProgress(value=0, max=13061), HTML(value='')))


Building dictionary...


HBox(children=(IntProgress(value=0, max=13061), HTML(value='')))


Checking coverage...


HBox(children=(IntProgress(value=0, max=19143), HTML(value='')))


Found embeddings for 92.79% of vocab
Found embeddings for  99.01% of all text


In [21]:
def text_to_array(text, max_text_len = 30):
    empyt_emb = np.zeros(300)
    embeds = [embeddings_index[x] if x in embeddings_index else empyt_emb for x in text[:max_text_len]]
    embeds+= [empyt_emb] * (max_text_len - len(embeds))
    return np.array(embeds)

print("Converting sentence in embedded vectors")
X = np.array([text_to_array(sentence) for sentence in tqdm(cleaned_questions)])
y = reducted_data['target']
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

Converting sentence in embedded vectors


HBox(children=(IntProgress(value=0, max=13061), HTML(value='')))




In [46]:
default_stemmer = PorterStemmer()
default_stopwords = stopwords.words('english')

def textPreProcessing(text):
    def tokenize_text(text):
        return [w for s in sent_tokenize(text) for w in word_tokenize(s)]

    def remove_special_characters(text, characters=string.punctuation.replace('-', '')):
        tokens = tokenize_text(text)
        pattern = re.compile('[{}]'.format(re.escape(characters)))
        return ' '.join(filter(None, [pattern.sub('', t) for t in tokens]))

    def stem_text(text, stemmer=default_stemmer):
        tokens = tokenize_text(text)
        return ' '.join([stemmer.stem(t) for t in tokens])

    def remove_stopwords(text, stop_words=default_stopwords, pos=True):
        tokens = [w for w in tokenize_text(text) if w[:w.rfind('/')] not in stop_words]
        return ' '.join(tokens)
    
    def addPOS(text):
        tokens = ["{}/{}".format(word[0], word[1]) for word in nltk.pos_tag(tokenize_text(text))]
        return ' '.join(tokens)
    
    text = remove_special_characters(text)
    text = text.lower() # lowercase
    text = addPOS(text) #add part-of-speech
    text = remove_stopwords(text) # remove stopwords
    text = text.strip(' ') # strip whitespaces
    text = stem_text(text) # stemming
    #text.strip(' ') #strip whitespaces again?
    
    return text
    

In [7]:
textPreProcessing(X.iloc[1])

'$ boyfriend/nn says/vbz loves/vbz crush/nn another/dt girl/nn'

In [28]:
from textblob import TextBlob
from pattern.en import parse, Sentence, parse
from pattern.en import modality, mood

def calculate_features(X):    
    def polarity_subj(text):
        t = TextBlob(text)
        return t.sentiment.polarity, t.sentiment.subjectivity
    
    def feature_mood(text):
        print(text)
        t = parse(text, lemmata=True)
        t = Sentence(t)
        return mood(t)
    
    def build_features_dict(x):
        d = {'polarity': polarity_subj(x)[0], 
             'subjectivity': polarity_subj(x)[1],
             'indicative': 0,
             'imperative': 0,
             'conditional': 0,
             'subjunctive': 0}
        d[mood(x).lower()] = 1
        return d
    
    v = DictVectorizer(sparse=True)
    return v.fit_transform([build_features_dict(x) for x in X])

In [63]:
from scipy.sparse import hstack

features_vect = calculate_features(X)

vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                             preprocessor=textPreProcessing, max_features=10000)
dict_vect = vectorizer.fit_transform(X)

X_vect = hstack([dict_vect, features_vect])
#X_vect = calculate_features(X)

In [None]:
#from sklearn.neural_network import MLPClassifier
#clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(30,), random_state=1, verbose=True)

In [None]:
#clf.fit(training_vectors, y_train[:50000])

In [62]:
X_vect

<13061x5006 sparse matrix of type '<class 'numpy.float64'>'
	with 194485 stored elements in COOrdinate format>

In [34]:
from sklearn import svm
svm = svm.LinearSVC(C=1, max_iter=10000)

In [35]:
from sklearn.model_selection import cross_validate
scoring = ['accuracy', 'precision', 'recall', 'f1']
reshaped = X.reshape(len(X),300*30)
scores = cross_validate(svm, reshaped, y, cv=5, verbose=2, scoring=scoring)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total=   5.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.5s remaining:    0.0s


[CV] ................................................. , total=   6.0s
[CV]  ................................................................
[CV] ................................................. , total=   6.1s
[CV]  ................................................................
[CV] ................................................. , total=   5.4s
[CV]  ................................................................
[CV] ................................................. , total=   5.0s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   29.8s finished


In [37]:
def print_cv_results(scores):
    print("TEST RESULT OF {}-FOLD CV: ".format(len(scores['fit_time'])))
    print("\tAccuracy: {:.4}".format(scores['test_accuracy'].mean()))
    print("\tPrecision: {:.4}".format(scores['test_precision'].mean()))
    print("\tRecall: {:.4}".format(scores['test_recall'].mean()))
    print("\tF1: {:.4}".format(scores['test_f1'].mean()))
    
print_cv_results(scores)

TEST RESULT OF 5-FOLD CV: 
	Accuracy: 0.9318
	Precision: 0.431
	Recall: 0.377
	F1: 0.4017
