In [1]:
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
tqdm.pandas()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate, train_test_split
from sklearn import svm

from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
# Load the training data from the CSV file into a DataFrame.
df = pd.read_csv("../data/train.csv")

In [24]:
def print_datasets_info(df):
    """Print a short summary of a labelled question DataFrame.

    Prints the row count, the list of column names, and the percentage of
    rows whose ``target`` column equals 0 (reported as sincere) and 1
    (reported as insincere), each with two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``target`` column.

    Returns
    -------
    None
    """
    n_rows = df.shape[0]
    column_names = list(df.columns)
    print("{} rows loaded...".format(n_rows))
    print("columns are: {}".format(column_names))

    # One (message template, target value) pair per reported class.
    class_reports = (
        ("{0:.2f}% of sincere questions", 0),
        ("{0:.2f}% of insincere questions", 1),
    )
    for template, label in class_reports:
        subset = df[df['target'] == label]
        print(template.format(len(subset) * 100 / n_rows))
# Work on a 1% random sample of the data. The seed is pinned (same value as the
# train/test split further down) so that every run draws the same rows; without
# it the fixed split seed would still be applied to a different sample each time.
reducted_df = df.sample(frac=0.01, random_state=1)
print_datasets_info(reducted_df)
# Plain Python list of the sampled question strings, in sample order.
questions = reducted_df['question_text'].tolist()

13061 rows loaded...
columns are: ['qid', 'question_text', 'target']
94.17% of sincere questions
5.83% of insincere questions


In [35]:
def iob_annotator(doc):
    """Turn a parsed document into ``lemma_tag_entity`` feature strings.

    Tokens whose ``is_stop`` attribute is true are skipped. For every other
    token the entity part is the bare IOB code when that code is ``'O'``,
    and ``"<iob>-<entity type>"`` otherwise.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``lemma_``, ``tag_``, ``ent_iob_``,
        ``ent_type_`` and ``is_stop``.

    Returns
    -------
    list of str
        One feature string per kept token, in document order.
    """
    annotations = []
    for tok in doc:
        if tok.is_stop:
            continue
        iob_code = tok.ent_iob_
        if iob_code == 'O':
            entity_part = iob_code
        else:
            entity_part = "{0}-{1}".format(iob_code, tok.ent_type_)
        annotations.append("{}_{}_{}".format(tok.lemma_, tok.tag_, entity_part))
    return annotations

In [36]:
# NOTE(review): the 'en' shortcut name is only resolvable on spaCy 2.x setups
# that created the shortcut link; spaCy 3 expects a full package name such as
# 'en_core_web_sm'. Confirm against the installed version.
nlp = spacy.load('en')

# Extra stop words; iob_annotator drops every token whose is_stop flag is set.
my_stop_words = ["what", "?", "if", "how", "difficult", "can", "why"]
for w in my_stop_words:
    # The flag is stored on the vocabulary entry of one exact spelling, so
    # flagging only "what" would leave a sentence-initial "What" in the
    # features. Flag the usual casings of each word as well.
    for spelling in {w, w.lower(), w.capitalize(), w.upper()}:
        nlp.vocab[spelling].is_stop = True
    print(nlp.vocab[w].is_stop)

True
True
True
True
True
True
True


In [41]:
# Annotate every sampled question. `features` (one list of feature strings per
# question) is consumed by the TF-IDF cell further down, so it has to be
# defined here for the notebook to run from a fresh kernel.
features = [iob_annotator(nlp(question)) for question in tqdm(questions)]

# Sanity check: a word that was not added to the custom stop list.
nlp.vocab["hello"].is_stop

False

In [None]:
def identity_tokenizer(text):
    """Return ``text`` unchanged.

    Pass-through tokenizer for documents that are already split into
    tokens, so the vectorizer does not tokenize them again.
    """
    return text

# `features` is expected to hold one list of token strings per question, so the
# vectorizer gets a pass-through tokenizer. Lowercasing is switched off because
# the documents are lists rather than strings.
tfidf = TfidfVectorizer(
    lowercase=False,
    tokenizer=identity_tokenizer,
)
X = tfidf.fit_transform(features)

In [None]:
# Hold out 20% of the rows with a fixed seed, then fit a linear SVM on the rest.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    reducted_df['target'],
    test_size=0.2,
    random_state=1,
)

svm2 = svm.LinearSVC(C=1, max_iter=10000)
svm2.fit(X_train, y_train)

def plot_coefficients(classifier, feature_names, top_features=80):
    """Plot and print the most negative and most positive model coefficients.

    Draws one bar per selected coefficient (red below zero, blue otherwise),
    labelled with the matching feature name, then prints a table that pairs
    the i-th most negative coefficient with the i-th most positive one.

    Parameters
    ----------
    classifier : object
        Fitted model exposing ``coef_``; the array is flattened with
        ``ravel()`` before ranking.
    feature_names : sequence
        Names indexed by coefficient position.
    top_features : int, default 80
        Number of coefficients to show on each side. Values larger than the
        number of coefficients are clamped to it.

    Returns
    -------
    None
    """
    coef = classifier.coef_.ravel()
    feature_names = np.asarray(feature_names)

    # Never ask for more coefficients per side than exist; otherwise the bar
    # positions and the bar heights would have different lengths.
    top_features = max(0, min(top_features, coef.size))

    # Sort once and slice both ends. The explicit start index avoids the
    # `[-0:]` pitfall, which would select every element when top_features is 0.
    order = np.argsort(coef)
    top_negative_coefficients = order[:top_features]
    top_positive_coefficients = order[order.size - top_features:]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    top_values = coef[top_coefficients]
    top_names = feature_names[top_coefficients]

    # create plot
    positions = np.arange(top_coefficients.size)
    plt.figure(figsize=(15, 5))
    colors = ['red' if c < 0 else 'blue' for c in top_values]
    plt.bar(positions, top_values, color=colors)
    # Ticks sit on the same x positions as the bars so that each label belongs
    # to the bar it is drawn under.
    plt.xticks(positions, top_names, rotation=60, ha='right')
    plt.show()

    # print coeffs: i-th most negative next to i-th most positive
    last = top_coefficients.size - 1
    for i in range(top_features):
        print("{0:20} {1:20} {2:30} {3}".format(top_names[i], top_values[i],
                                                 top_names[last - i], top_values[last - i]))

# get_feature_names() is gone from recent scikit-learn releases; prefer its
# replacement get_feature_names_out() and fall back on older versions.
if hasattr(tfidf, "get_feature_names_out"):
    vocabulary_terms = tfidf.get_feature_names_out()
else:
    vocabulary_terms = tfidf.get_feature_names()
plot_coefficients(svm2, vocabulary_terms)

In [None]:
# Fresh LinearSVC with the same hyperparameters as svm2; it is the estimator
# passed to cross_validate below.
svm1 = svm.LinearSVC(C=1, max_iter=10000)

In [None]:
# 5-fold cross-validation of svm1 on the whole TF-IDF matrix, collecting four
# metrics per fold.
# NOTE(review): X was produced by fitting the vectorizer on all rows before the
# folds are cut, so the IDF weights have seen the validation rows; wrap the
# vectorizer and the SVM in a Pipeline if that matters for the reported scores.
scoring = ['accuracy', 'precision', 'recall', 'f1']
scores = cross_validate(svm1, X, reducted_df['target'], cv=5, verbose=2, scoring=scoring)

In [None]:
def print_cv_results(scores):
    """Print the mean of each cross-validation test metric.

    Parameters
    ----------
    scores : mapping
        Must provide ``'fit_time'`` (its length is reported as the number of
        folds) and the per-fold arrays ``'test_accuracy'``,
        ``'test_precision'``, ``'test_recall'`` and ``'test_f1'``, each
        supporting ``.mean()``.

    Returns
    -------
    None
    """
    fold_count = len(scores['fit_time'])
    print("TEST RESULT OF {}-FOLD CV: ".format(fold_count))

    # (line template, key in `scores`) in the order they are reported.
    metric_lines = (
        ("\tAccuracy: {:.4}", 'test_accuracy'),
        ("\tPrecision: {:.4}", 'test_precision'),
        ("\tRecall: {:.4}", 'test_recall'),
        ("\tF1: {:.4}", 'test_f1'),
    )
    for template, key in metric_lines:
        print(template.format(scores[key].mean()))
       
# Report the mean test metrics from the `scores` returned by cross_validate.
print_cv_results(scores)