In [596]:
import pandas as pd
import numpy as np
from nltk import word_tokenize, bigrams, ngrams
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, scale
from nltk.corpus import stopwords
import us
import pycountry
import re
import json
import random
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 255 ms


In [2]:
question_data = pd.read_csv('darpa_problems/r_32/data/raw_data/questions.csv')

time: 49.9 ms


In [3]:
sentence_data = pd.read_csv('darpa_problems/r_32/data/raw_data/sentences.csv')

time: 97.5 ms


In [4]:
train_data = pd.read_csv('darpa_problems/r_32/data/trainData.csv')

time: 38.9 ms


In [5]:
train_targets = pd.read_csv('darpa_problems/r_32/data/trainTargets.csv')

time: 12.5 ms


In [494]:
def mynonrandom():
 return 0.33

time: 2.47 ms


In [499]:
xx

[7, 2, 4, 1, 7]

time: 4.15 ms


In [500]:
def remove_data(data, perc_no):
    num_samples = len(data['response'])
    num_yes = num_samples*np.mean(data['response'])
    num_no = int(num_yes/(1-perc_no) - num_yes)
    yes_idxs = [idx for idx, response in enumerate(data['response']) if response == 1]
    no_idxs = [idx for idx, response in enumerate(data['response']) if response == 0]
    random.shuffle(no_idxs, mynonrandom)
    new_no_idxs = no_idxs[0:int(num_no)]
    new_idxs = yes_idxs + new_no_idxs
    new_data = {}
    for key, value in data.items():
        new_data[key] = [value[idx] for idx in new_idxs]
    return [new_data, num_yes+num_no]

time: 16.4 ms


In [283]:
with open('/Users/JasonKatz/Desktop/darpa_problems/r_32/data/data.json', 'r') as f:
    data = json.load(f)

time: 36.1 ms


In [591]:
def model_with_subset(data, max_perc=.95, model=LogisticRegression, iters = 5):
    new_data, num_samples = remove_data(data, max_perc)
    encoder = LabelBinarizer()
    input_variables = encoder.fit_transform(new_data['first_word'])
    for variable in ['matches', 'last_word', 'length', 'number', 'place', 'what_year', 'what_is', 'is', 'how_many']:
        input_variables = np.hstack((input_variables, np.array(new_data[variable])[:, np.newaxis]))
    accuracies = 0
    naive = 0
    confusion_mat = np.zeros((2,2))
    for state in range(iters):
        xtrain, xtest, ytrain, ytest = train_test_split(input_variables, new_data['response'], random_state=state)
        clf = model()
        clf.fit(xtrain, ytrain)
        ypred_test = clf.predict(xtest)
        accuracies += accuracy_score(ytest, ypred_test)
        naive += 1-np.mean(ytest)
        confusion_mat += confusion_matrix(ytest, ypred_test)
    model_accuracy = accuracies/iters
    naive_accuracy = naive/iters
    confusion_mat = (confusion_mat/iters).astype(int)
    model_total = 0
    naive_total = 0
    for state in range(iters):
        xtrain, xtest, ytrain, ytest = train_test_split(input_variables, new_data['response'], random_state=state)
        ave_yes = np.mean(ytest)
        clf = model()
        clf.fit(xtrain, ytrain)
        ypred_test = clf.predict_proba(xtest)
        single_naive = np.array([[1-ave_yes, ave_yes], [1-ave_yes, ave_yes]])
        naive_predictions = np.repeat(single_naive, [0, len(ytest)], axis=0)
        model_total += log_loss(ytest, ypred_test)
        naive_total += log_loss(ytest, naive_predictions)
    model_log_loss = model_total/iters
    naive_log_loss = naive_total/iters
    perc_no = 1 - np.mean(new_data['response'])
    return pd.DataFrame([[model.__name__, num_samples, perc_no, model_accuracy, naive_accuracy, model_log_loss, 
                          naive_log_loss, confusion_mat]], columns=['Model', 'Number_Samples', 'Percent_No', 
                                                                           'Model_Accuracy', 'Naive_Accuracy', 
                                                                           'Model_Log_Loss', 'Naive_Log_Loss', 
                                                                           'Confusion_Matrix'])

time: 260 ms


In [592]:
model_table = pd.DataFrame(columns=['Model', 'Number_Samples', 'Percent_No', 'Model_Accuracy', 'Naive_Accuracy', 
                                    'Model_Log_Loss', 'Naive_Log_Loss', 'Confusion_Matrix'])
for perc_no in [.95, .85, .75, .65, .55, .5]:
    for model in [LogisticRegression, GaussianNB, RandomForestClassifier, KNeighborsClassifier, MLPClassifier]:
        model_table = model_table.append(model_with_subset(data, perc_no, model))
model_table

Unnamed: 0,Model,Number_Samples,Percent_No,Model_Accuracy,Naive_Accuracy,Model_Log_Loss,Naive_Log_Loss,Confusion_Matrix
0,LogisticRegression,23599.0,0.948902,0.946623,0.946761,0.186489,0.207929,"[[5464, 2], [306, 1]]"
0,GaussianNB,23599.0,0.948902,0.935123,0.946761,0.288796,0.207929,"[[5373, 93], [281, 26]]"
0,RandomForestClassifier,23599.0,0.948902,0.931001,0.946761,1.113961,0.207929,"[[5354, 112], [285, 21]]"
0,KNeighborsClassifier,23599.0,0.948902,0.944475,0.946761,1.232065,0.207929,"[[5446, 20], [300, 7]]"
0,MLPClassifier,23599.0,0.948902,0.946727,0.946761,0.187652,0.207929,"[[5465, 1], [306, 0]]"
0,LogisticRegression,7866.0,0.849987,0.855211,0.851449,0.365906,0.420039,"[[1650, 24], [260, 31]]"
0,GaussianNB,7866.0,0.849987,0.837417,0.851449,0.572781,0.420039,"[[1579, 95], [224, 67]]"
0,RandomForestClassifier,7866.0,0.849987,0.823386,0.851449,1.871334,0.420039,"[[1554, 120], [227, 64]]"
0,KNeighborsClassifier,7866.0,0.849987,0.837824,0.851449,2.083036,0.420039,"[[1606, 68], [250, 41]]"
0,MLPClassifier,7866.0,0.849987,0.854906,0.851449,0.36733,0.420039,"[[1645, 29], [256, 36]]"


time: 49.5 s


In [593]:
"Mean Model Accuracy: {}%, Mean Naive Accuracy: {}%".format(100*round(np.mean(model_table['Model_Accuracy']),4), 
                                                            100*round(np.mean(model_table['Naive_Accuracy']),4))

'Mean Model Accuracy: 76.35%, Mean Naive Accuracy: 70.65%'

time: 19.1 ms


In [594]:
model_table.groupby('Percent_No').mean()

Unnamed: 0_level_0,Number_Samples,Model_Accuracy,Naive_Accuracy,Model_Log_Loss,Naive_Log_Loss
Percent_No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.5,2360.0,0.652,0.494576,1.504002,0.692644
0.549962,2622.0,0.673659,0.556402,1.371466,0.686698
0.649956,3371.0,0.71535,0.646738,1.321889,0.648699
0.75,4720.0,0.757186,0.743051,1.303871,0.569562
0.849987,7866.0,0.841749,0.851449,1.052077,0.420039
0.948902,23599.0,0.94079,0.946761,0.601793,0.207929


time: 192 ms


In [595]:
model_table.groupby('Model').mean()

Unnamed: 0_level_0,Number_Samples,Percent_No,Model_Accuracy,Naive_Accuracy,Model_Log_Loss,Naive_Log_Loss
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GaussianNB,7423.0,0.708134,0.734005,0.706496,1.091869,0.537595
KNeighborsClassifier,7423.0,0.708134,0.751064,0.706496,2.053243,0.537595
LogisticRegression,7423.0,0.708134,0.793282,0.706496,0.446879,0.537595
MLPClassifier,7423.0,0.708134,0.793226,0.706496,0.447017,0.537595
RandomForestClassifier,7423.0,0.708134,0.7457,0.706496,1.923574,0.537595


time: 24.7 ms


In [291]:
new_data = remove_data(data, .95)[0]

time: 65.2 ms


In [293]:
encoder = LabelBinarizer()
first_word_encoded = encoder.fit_transform(new_data['first_word'])
both = np.hstack((first_word_encoded, np.array(new_data['matches'])[:, np.newaxis]))
thrice = np.hstack((both, np.array(new_data['last_word'])[:, np.newaxis]))
cuatro = np.hstack((thrice, np.array(new_data['length'])[:, np.newaxis]))
cinco = np.hstack((cuatro, np.array(new_data['number'])[:, np.newaxis]))
seis = np.hstack((cinco, np.array(new_data['place'])[:, np.newaxis]))
siete = np.hstack((seis, np.array(new_data['what_country'])[:, np.newaxis]))

time: 110 ms


In [294]:
xtrain, xtest, ytrain, ytest = train_test_split(seis, new_data['response'], random_state=state)
clf = LogisticRegression()
clf.fit(xtrain, ytrain)
ypred_test = clf.predict(xtest)

time: 97.7 ms


In [190]:
accuracies = 0
naive = 0
iters = 5
for state in range(iters):
    xtrain, xtest, ytrain, ytest = train_test_split(seis, new_data['response'], random_state=state)
    clf = LogisticRegression()
    clf.fit(xtrain, ytrain)
    ypred_test = clf.predict(xtest)
    accuracies += accuracy_score(ytest, ypred_test)
    naive += 1-np.mean(ytest)
print("Model Accuracy: {}, Naive Accuracy: {}".format(accuracies/iters, naive/iters))

Model Accuracy: 0.7638983050847458, Naive Accuracy: 0.7430508474576272
time: 87.1 ms


In [189]:
model_total = 0
naive_total = 0
iters = 10
for state in range(iters):
    xtrain, xtest, ytrain, ytest = train_test_split(seis, new_data['response'], random_state=state)
    ave_yes = np.mean(ytest)
    clf = LogisticRegression()
    clf.fit(xtrain, ytrain)
    ypred_test = clf.predict_proba(xtest)
    single_naive = np.array([[1-ave_yes, ave_yes], [1-ave_yes, ave_yes]])
    naive_predictions = np.repeat(single_naive, [0, len(ytest)], axis=0)
    model_total += log_loss(ytest, ypred_test)
    naive_total += log_loss(ytest, naive_predictions)
print("Model Accuracy: {}, Naive Accuracy: {}".format(model_total/iters, naive_total/iters))

Model Accuracy: 0.5051833492733436, Naive Accuracy: 0.5674004312670509
time: 235 ms


In [191]:
Counter(data['first_word'])

Counter({'how': 949,
         'what': 2400,
         'whatever': 2,
         'when': 428,
         'where': 362,
         'who': 579})

time: 8.57 ms


In [422]:
Counter(bigram_list).most_common(50)

[(('what', 'is'), 4549),
 (('is', 'the'), 2167),
 (('how', 'many'), 1848),
 (('in', 'the'), 1226),
 (('what', 'are'), 1107),
 (('was', 'the'), 985),
 (('is', 'a'), 793),
 (('when', 'was'), 773),
 (('of', 'the'), 772),
 (('when', 'did'), 765),
 (('are', 'the'), 736),
 (('where', 'is'), 718),
 (('what', 'does'), 712),
 (('who', 'is'), 672),
 (('did', 'the'), 627),
 (('how', 'did'), 512),
 (('what', 'was'), 477),
 (('how', 'does'), 421),
 (('how', 'much'), 416),
 (('who', 'was'), 412),
 (('does', 'the'), 388),
 (('the', 'first'), 373),
 (('on', 'the'), 315),
 (('how', 'is'), 307),
 (('does', 'a'), 285),
 (('where', 'did'), 274),
 (('what', 'did'), 261),
 (('are', 'in'), 259),
 (('what', 'year'), 255),
 (('where', 'was'), 245),
 (('the', 'world'), 232),
 (('civil', 'war'), 231),
 (('in', 'a'), 223),
 (('how', 'old'), 218),
 (('what', 'country'), 211),
 (('used', 'for'), 204),
 (('how', 'do'), 201),
 (('the', 'us'), 197),
 (('do', 'you'), 194),
 (('what', 'do'), 194),
 (('the', 'civil'), 19

time: 64.8 ms


In [604]:
list(ngrams(word_tokenize('My name is Jason Katz and I like football'), 3))

[('My', 'name', 'is'),
 ('name', 'is', 'Jason'),
 ('is', 'Jason', 'Katz'),
 ('Jason', 'Katz', 'and'),
 ('Katz', 'and', 'I'),
 ('and', 'I', 'like'),
 ('I', 'like', 'football')]

time: 5.35 ms


In [590]:
all_places = ''
for state in us.states.STATES_AND_TERRITORIES:
    all_places = all_places + ' ' + state.name
for country in pycountry.countries:
    all_places = all_places + ' ' + country.name
all_places = re.sub('[(),]', '', all_places)
places_words = word_tokenize(all_places.lower())
places = set(places_words)
for special in ["'s", 'the', 'part', 'of', 'and']:
    places.remove(special)
    
countries = ''
for country in pycountry.countries:
    countries = countries + ' ' + country.name
countries = re.sub('[(),]', '', countries)
countries_words = word_tokenize(countries.lower())
countries = set(countries_words)
for special in ["'s", 'the', 'part', 'of', 'and']:
    countries.remove(special)

# Iterate through all question-sentence pairs
data = {'what_country': [], 'country': [], 'first_word': [], 'length': [], 'matches': [], 'number': [], 
        'last_word': [], 'place': [], 'what_year': [], 'what_is': [], 'is': [], 'how_many': [], 'what_are': [], 
        'are': [], 'in_the': [], 'in': [], 'response': []}
for index, row in train_data[0:].iterrows():
    question = question_data.iloc[row['qIndex']]['question']
    sentence = sentence_data.iloc[row['sIndex']]['sentence']

    # Break the text into individual words
    tokenized_question = [word.lower() for word in word_tokenize(question)]
    tokenized_sentence = [word.lower() for word in word_tokenize(sentence)]

    # Get first word of the question and number of words in the sentence
    first_word_question = tokenized_question[0]
    length_sentence = len(tokenized_sentence)
    
    
    question_bigrams = list(bigrams(tokenized_question))
    sentence_bigrams = list(bigrams(tokenized_sentence))
    what_country = ('what', 'country') in question_bigrams
    what_year = ('what', 'year') in question_bigrams
    what_is = ('what', 'is') in question_bigrams
    how_many = ('how', 'many') in question_bigrams
    what_are = ('what', 'are') in question_bigrams
    in_the = ('in', 'the') in question_bigrams
    contains_is = "is" in tokenized_sentence
    contains_in = "in" in tokenized_sentence
    contains_are = "are" in tokenized_sentence

    # Words to ignore (prepositions, pronouns, etc.)
    common_words = set(stopwords.words('english'))

    # Count how many words from the question appear in the sentence
    match = 0
    for word in tokenized_question:
        match += word in tokenized_sentence and word not in common_words

    # Check if the sentence contains a number
    contains_number = 0
    for word in tokenized_sentence:
        if word.isdigit():
            contains_number = 1
            break

    # Check if the last word in the question appears in the sentence
    last_word = tokenized_question[-1]
    if last_word == '?':
        last_word = tokenized_question[-2]

    # Check if the sentence contains a place (country name or US state name)
    contains_place = 0
    for word in tokenized_sentence:
        if word in places:
            contains_place = 1
            break
            
    # Check if the sentence contains a country
    contains_country = 0
    for word in tokenized_sentence:
        if word in countries:
            contains_country = 1
            break

    # Append metrics from question-sentence pair
    data['response'].append(int(train_targets.iloc[index]['isAnswer']))
    data['what_country'].append(int(what_country))
    data['what_year'].append(int(what_year))
    data['first_word'].append(first_word_question)
    data['length'].append(length_sentence)
    data['matches'].append(match)
    data['number'].append(contains_number)
    data['last_word'].append(int(last_word in tokenized_sentence))
    data['place'].append(contains_place)
    data['country'].append(contains_country)
    data['what_is'].append(what_is)
    data['is'].append(int(contains_is))
    data['how_many'].append(int(how_many))
    data['what_are'].append(int(what_are))
    data['are'].append(int(contains_are))
    data['in_the'].append(int(in_the))
    data['in'].append(int(contains_in))

# Write data to file
with open('/Users/JasonKatz/Desktop/darpa_problems/r_32/data/data.json', 'w') as fd:
    fd.write(json.dumps(data, indent=4))

time: 40.2 s


In [410]:
bigram_list = []
for index, row in train_data[0:].iterrows():
    question = question_data.iloc[row['qIndex']]['question']
    tokenized_question = [word.lower() for word in word_tokenize(question)]
    bigram_list.extend(list(bigrams(tokenized_question)))

Counter({('how', 'are'): 131,
         ('are', 'glacier'): 5,
         ('glacier', 'caves'): 5,
         ('caves', 'formed'): 5,
         ('formed', '?'): 5,
         ('are', 'the'): 736,
         ('the', 'directions'): 7,
         ('directions', 'of'): 7,
         ('of', 'the'): 772,
         ('the', 'velocity'): 7,
         ('velocity', 'and'): 7,
         ('and', 'force'): 7,
         ('force', 'vectors'): 7,
         ('vectors', 'related'): 7,
         ('related', 'in'): 7,
         ('in', 'a'): 223,
         ('a', 'circular'): 7,
         ('circular', 'motion'): 7,
         ('how', 'did'): 512,
         ('did', 'apollo'): 8,
         ('apollo', 'creed'): 8,
         ('creed', 'die'): 8,
         ('how', 'long'): 189,
         ('long', 'is'): 50,
         ('is', 'the'): 2167,
         ('the', 'term'): 42,
         ('term', 'for'): 7,
         ('for', 'federal'): 7,
         ('federal', 'judges'): 7,
         ('how', 'a'): 46,
         ('a', 'beretta'): 3,
         ('beretta', 'mode

time: 11.1 s
