In [1]:
%matplotlib inline
%pylab inline
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import cross_validate
from sklearn import svm
from inforet_package.preprocessing import *

from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize

#from sklearn.model_selection import train_test_split
#from sklearn import svm

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Training questions; the columns are qid, question_text and target.
df = pd.read_csv("../data/train.csv")

# Pre-trained word2vec vectors stored in binary format.
EMBEDDINGS = '../data/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
embeddings_index = KeyedVectors.load_word2vec_format(
    EMBEDDINGS,
    binary=True,
)

In [3]:
def print_datasets_info(df):
    """Print the row count, column names and class balance of a questions frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``target`` column. Rows whose target equals 0 are
        reported as sincere, rows whose target equals 1 as insincere.

    Notes
    -----
    An empty frame reports both shares as ``nan%`` instead of raising
    ``ZeroDivisionError``.
    """
    n_rows = df.shape[0]
    print("{} rows loaded...".format(n_rows))
    print("columns are: {}".format(list(df.columns)))
    for label, kind in ((0, "sincere"), (1, "insincere")):
        # Summing the boolean mask counts matches without copying the frame.
        matches = int((df['target'] == label).sum())
        share = matches * 100 / n_rows if n_rows else float("nan")
        print("{0:.2f}% of {1} questions".format(share, kind))

# Work on a 5% sample of the training frame. The fixed random_state makes the
# sample (and the class balance printed below) identical on every re-run.
reducted_df = df.sample(frac=0.05, random_state=42)
print_datasets_info(reducted_df)

65306 rows loaded...
columns are: ['qid', 'question_text', 'target']
93.81% of sincere questions
6.19% of insincere questions


In [4]:
# Baseline: embedding coverage of the vocabulary built from the raw question text.
print("NOT CLEANED COVERAGE: ", "Building dictionary...", sep="\n")
vocab_dictionary = build_dictionary(
    df['question_text'].tolist()
)
out_of_dict = check_coverage(
    vocab_dictionary,
    embeddings_index,
)

NOT CLEANED COVERAGE: 
Building dictionary...


HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2047), HTML(value='')))


Found embeddings for 22.42% of vocab
Found embeddings for  74.15% of all text


In [None]:
# Preview the first 20 entries only; rendering the whole vocabulary mapping
# would flood the notebook output.
dict(list(vocab_dictionary.items())[:20])

In [8]:
# Run the project's cleaning_questions helper over the full training frame.
# The result feeds the vocabulary rebuild and the IDF computation in later cells.
# NOTE(review): the IDF cell carries an earlier execution count than this one,
# so it only works after this cell has run — execute the notebook top to bottom.
cleaned_questions = cleaning_questions(df)

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))




In [9]:
# Rebuild the vocabulary from the cleaned questions (this overwrites the
# vocab_dictionary built from the raw text) and re-check embedding coverage.
vocab_dictionary = build_dictionary(cleaned_questions)
out_of_dict = check_coverage(vocab_dictionary, embeddings_index)

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))




HBox(children=(IntProgress(value=0, max=238754), HTML(value='')))


Found embeddings for 61.06% of vocab
Found embeddings for  90.97% of all text


In [None]:
# Preview the first 20 entries only; rendering the whole vocabulary mapping
# would flood the notebook output.
dict(list(vocab_dictionary.items())[:20])

In [6]:
# idf_dictionary_builder returns three values; only idf_dict is inspected here.
# In the saved output, idf_dict is a list of (token, idf) pairs with the
# highest idf values first.
# NOTE(review): this displays the entire list — consider slicing, e.g. idf_dict[:30].
tfidf_vocab, TfIdfMatrix, idf_dict = idf_dictionary_builder(cleaned_questions)
idf_dict

[('￼￼Assuming', 14.389426584528286),
 ('Ｘ', 14.389426584528286),
 ('Ｈow', 14.389426584528286),
 ('？such', 14.389426584528286),
 ('？please', 14.389426584528286),
 ('？How', 14.389426584528286),
 ('＞', 14.389426584528286),
 ('：if', 14.389426584528286),
 ('，Can', 14.389426584528286),
 ('）and', 14.389426584528286),
 ('（海南人の日本', 14.389426584528286),
 ('（・・）', 14.389426584528286),
 ('（The', 14.389426584528286),
 ('（', 14.389426584528286),
 ('＄', 14.389426584528286),
 ('\ufeffWhat', 14.389426584528286),
 ('ﬁle', 14.389426584528286),
 ('\uf0d8what', 14.389426584528286),
 ('\uf02d', 14.389426584528286),
 ('흡', 14.389426584528286),
 ('호', 14.389426584528286),
 ('혀', 14.389426584528286),
 ('행복하게', 14.389426584528286),
 ('했다', 14.389426584528286),
 ('할', 14.389426584528286),
 ('한국어를', 14.389426584528286),
 ('한국', 14.389426584528286),
 ('한', 14.389426584528286),
 ('하다한다', 14.389426584528286),
 ('하기를', 14.389426584528286),
 ('하', 14.389426584528286),
 ('포경수술', 14.389426584528286),
 ('키', 14.389426584

In [7]:
# Same list in reverse order: the lowest-idf tokens come first.
# NOTE(review): this also displays the entire list — consider slicing the result.
idf_dict[::-1]

[('the', 1.9443311330332378),
 ('What', 2.1171051624314554),
 ('to', 2.3537034476110748),
 ('a', 2.381472615853867),
 ('in', 2.3851397003596775),
 ('is', 2.42628574271182),
 ('of', 2.500131887524957),
 ('How', 2.6092296842029588),
 ('I', 2.6996905537509166),
 ('and', 2.771600709732687),
 ('are', 2.8529603896420754),
 ('do', 2.8717673415855445),
 ('for', 2.9415357637443176),
 ('you', 3.097122853328757),
 ('Why', 3.21375866445311),
 ('can', 3.314401613309491),
 ('it', 3.3347738259280937),
 ('Is', 3.491817267164156),
 ('#', 3.5468794880653003),
 ('that', 3.5911477662312388),
 ('my', 3.6810492447116827),
 ('with', 3.6851152699267153),
 ('on', 3.691861132853771),
 ('be', 3.7064192200638058),
 ('or', 3.7122377360883343),
 ('have', 3.7823218536490972),
 ('s', 3.9472404923121225),
 ('from', 3.9890196534007356),
 ('an', 4.021230647843353),
 ('does', 4.030462283797879),
 ('if', 4.037685863338778),
 ('best', 4.049233688147936),
 ('your', 4.0534101410013434),
 ('get', 4.065995904703339),
 ('some',

In [None]:
# Restrict the corpus to questions labelled insincere (target == 1), then
# clean them and count their tokens with the same helpers used for the full set.
is_insincere = df['target'] == 1
insincere_questions = df.loc[is_insincere]
insincere_cleaned_questions = cleaning_questions(insincere_questions)
insincere_vocab_dictionary = build_dictionary(insincere_cleaned_questions)

In [None]:
def get_specificity(vocab_corpus_A, vocab_corpus_B):
    """Score how over-represented each token of corpus B is relative to corpus A.

    Occurrence counts are normalised to relative frequencies within each
    corpus; every token of ``vocab_corpus_B`` is then scored as
    ``(freq_B - freq_A) / sqrt(freq_A)``.

    Parameters
    ----------
    vocab_corpus_A : mapping
        token -> occurrence count for the reference corpus.
    vocab_corpus_B : mapping
        token -> occurrence count for the corpus being scored.

    Returns
    -------
    dict
        token -> specificity score for every token of B that occurs at least
        once in A. Tokens that are missing from A (or have a non-positive
        count there) are skipped, because the score would divide by zero.
        An empty dict is returned when either corpus has a total count of 0.
    """
    # Imported locally so the function does not depend on `math` leaking into
    # the notebook namespace through a magic or a star import.
    import math

    total_A = sum(vocab_corpus_A.values())
    total_B = sum(vocab_corpus_B.values())
    if not total_A or not total_B:
        return {}

    spec = {}
    for token, count_B in vocab_corpus_B.items():
        count_A = vocab_corpus_A.get(token, 0)
        if count_A <= 0:
            # No reference frequency to compare against.
            continue
        freq_A = count_A / total_A
        spec[token] = (count_B / total_B - freq_A) / math.sqrt(freq_A)
    return spec
        
# Score the tokens of the insincere-question vocabulary against the vocabulary
# of the whole cleaned corpus.
spec = get_specificity(vocab_dictionary, insincere_vocab_dictionary)

In [None]:
# Tokens ranked from the highest specificity score down to the lowest.
list(reversed(sorted(spec.items(), key=lambda token_score: token_score[1])))