In [41]:
%matplotlib inline
%pylab inline
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import cross_validate
from sklearn import svm
from inforet_package.preprocessing import *

from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize

#from sklearn.model_selection import train_test_split
#from sklearn import svm

Populating the interactive namespace from numpy and matplotlib
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
#loading data
EMBEDDINGS = '../data/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
df = pd.read_csv("../data/train.csv")

embeddings_index = KeyedVectors.load_word2vec_format(EMBEDDINGS, binary=True)

In [3]:
def print_datasets_info(df):
    print("{} rows loaded...".format(df.shape[0]))
    print("columns are: {}".format(list(df.columns)))
    print("{0:.2f}% of sincere questions".format(len(df[df['target'] == 0])*100/df.shape[0]))
    print("{0:.2f}% of insincere questions".format(len(df[df['target'] == 1])*100/df.shape[0]))

reducted_df = df.sample(frac=0.05)
print_datasets_info(reducted_df)
#questions = reducted_df['question_text'].tolist()

65306 rows loaded...
columns are: ['qid', 'question_text', 'target']
93.91% of sincere questions
6.09% of insincere questions


In [36]:
print("NOT CLEANED COVERAGE: ")
print("Building dictionary...")
vocab_dictionary = build_dictionary(df['question_text'].tolist())
out_of_dict = check_coverage(vocab_dictionary, embeddings_index)

NOT CLEANED COVERAGE: 
Building dictionary...


HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2047), HTML(value='')))

Found embeddings for 22.42% of vocab
Found embeddings for  74.15% of all text


In [37]:
vocab_dictionary

{'H': 335973,
 'o': 5982041,
 'w': 1359867,
 ' ': 15416812,
 'd': 2536343,
 'i': 5172726,
 'Q': 25902,
 'u': 2006942,
 'e': 8262611,
 'b': 1026991,
 'c': 2182249,
 'n': 4941624,
 'a': 6170667,
 't': 6090023,
 'l': 2775391,
 's': 4504441,
 'h': 3515364,
 'r': 4310239,
 'p': 1463297,
 'v': 714218,
 '1': 101658,
 '9': 20498,
 '6': 20285,
 '0': 107880,
 '?': 1381192,
 'D': 147530,
 'y': 1576610,
 'g': 1453078,
 ',': 235096,
 'W': 761319,
 'f': 1491363,
 'm': 1865095,
 'O': 44181,
 'G': 69931,
 'k': 585414,
 'M': 137704,
 'C': 214086,
 'I': 653841,
 'j': 96881,
 'z': 58752,
 'A': 217720,
 'T': 135920,
 'P': 111277,
 '.': 134056,
 '/': 42137,
 'F': 50532,
 'x': 157411,
 'S': 216609,
 '3': 32458,
 'B': 106238,
 'R': 65351,
 'X': 7314,
 'U': 66833,
 "'": 163763,
 'E': 100677,
 'q': 62794,
 'V': 30712,
 '2': 77956,
 '8': 24711,
 '-': 67094,
 '(': 56618,
 'N': 67591,
 'Y': 15990,
 ')': 56594,
 '7': 25697,
 'J': 50501,
 '"': 73272,
 'L': 53377,
 'K': 43541,
 '5': 35019,
 '$': 4242,
 '=': 4248,
 '

In [42]:
cleaned_questions = cleaning_questions(df)

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))

In [43]:
vocab_dictionary = build_dictionary(cleaned_questions)
out_of_dict = check_coverage(vocab_dictionary, embeddings_index)

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))

HBox(children=(IntProgress(value=0, max=238749), HTML(value='')))

Found embeddings for 61.06% of vocab
Found embeddings for  99.00% of all text


In [39]:
vocab_dictionary

{'How': 263116,
 'did': 34677,
 'Quebec': 161,
 'nationalists': 131,
 'see': 9562,
 'their': 34879,
 'province': 170,
 'as': 55734,
 'nation': 1423,
 'in': 367649,
 'the': 655277,
 '#': 384385,
 's': 73604,
 'Do': 40149,
 'you': 199472,
 'have': 85523,
 'an': 67003,
 'adopted': 363,
 'dog': 3490,
 'how': 26994,
 'would': 51616,
 'encourage': 392,
 'people': 55501,
 'adopt': 485,
 'not': 51021,
 'shop': 869,
 'Why': 145161,
 'does': 64106,
 'velocity': 810,
 'affect': 4420,
 'time': 23785,
 'Does': 23211,
 'space': 3427,
 'geometry': 219,
 'Otto': 55,
 'von': 87,
 'Guericke': 4,
 'used': 13173,
 'Magdeburg': 7,
 'hemispheres': 15,
 'Can': 53062,
 'I': 325976,
 'convert': 1557,
 'montra': 3,
 'helicon': 1,
 'D': 1325,
 'mountain': 346,
 'bike': 1072,
 'by': 42288,
 'just': 16204,
 'changing': 1141,
 'tyres': 92,
 'Is': 108974,
 'Gaza': 105,
 'slowly': 361,
 'becoming': 2271,
 'Auschwitz': 29,
 'Dachau': 4,
 'or': 92349,
 'Treblinka': 1,
 'for': 201932,
 'Palestinians': 595,
 'Quora': 158

In [15]:
tfidf_vocab, TfIdfMatrix, idf_dict = idf_dictionary_builder(cleaned_questions)

In [45]:
insincere_questions = df[df['target'] == 1]
insincere_cleaned_questions = cleaning_questions(insincere_questions)
insincere_vocab_dictionary = build_dictionary(insincere_cleaned_questions)

HBox(children=(IntProgress(value=0, max=80810), HTML(value='')))

HBox(children=(IntProgress(value=0, max=80810), HTML(value='')))

HBox(children=(IntProgress(value=0, max=80810), HTML(value='')))

HBox(children=(IntProgress(value=0, max=80810), HTML(value='')))

HBox(children=(IntProgress(value=0, max=80810), HTML(value='')))

HBox(children=(IntProgress(value=0, max=80810), HTML(value='')))

In [48]:
def get_specificity(vocab_corpus_A, vocab_corpus_B):
    total_A = sum([v for v in vocab_corpus_A.values()])
    total_B = sum([v for v in vocab_corpus_B.values()])
    freqA = dict((key, value/total_A) for key, value in vocab_corpus_A.items())
    freqB = dict((key, value/total_B) for key, value in vocab_corpus_B.items())
    spec = {}
    for token, f in freqB.items():
        spec[token] = (f - freqA[token]) / math.sqrt(freqA[token])
    return spec
        
spec = get_specificity(vocab_dictionary, insincere_vocab_dictionary)

In [51]:
sorted(spec.items(), key=operator.itemgetter(1))[::-1]

[('Why', 0.16559954902282073),
 ('Muslims', 0.13107139302346155),
 ('Trump', 0.12860782688272374),
 ('women', 0.1259173645863908),
 ('they', 0.11874143478196796),
 ('Americans', 0.10514714651610678),
 ('men', 0.10437643890186349),
 ('people', 0.10383667258280138),
 ('liberals', 0.10307299110164787),
 ('white', 0.10206153057081462),
 ('Indians', 0.0965714811006675),
 ('so', 0.09397992627496193),
 ('their', 0.08900792758357925),
 ('Jews', 0.07979166765324631),
 ('black', 0.07908314350338902),
 ('Muslim', 0.07907828708600269),
 ('hate', 0.07436623193177108),
 ('Democrats', 0.07380131671670201),
 ('girls', 0.07169180156432131),
 ('Hindus', 0.07037958275742844),
 ('Obama', 0.06903564965181039),
 ('racist', 0.06832736552700674),
 ('that', 0.0676469291508543),
 ('Christians', 0.06761547994755299),
 ('gay', 0.0663272778045591),
 ('t', 0.06198352150396479),
 ('Chinese', 0.060653228578266494),
 ('Donald', 0.059531578091839746),
 ('stupid', 0.058588432484150875),
 ('Hillary', 0.058214314527994596