In [None]:
import pandas as pd

In [None]:
# Load the labelled training questions and the unlabelled test questions.
# Both paths are relative to the notebook's working directory.
original_train = pd.read_csv('QuoraQuestions/train.csv')
original_test = pd.read_csv('QuoraQuestions/test.csv')

In [None]:
# Preview the first rows of the training data.
original_train.head()

In [None]:
# Preview the first rows of the test data.
original_test.head()

In [None]:
# Number of training questions labelled insincere (target == 1).
len(original_train[original_train['target'] == 1])

In [None]:
# Pull one example (by index label) out of the head() of the insincere rows; the trailing note is the author's reading of it.
original_train[original_train['target'] == 1].head()['question_text'][30] # definitively insincere

In [None]:
# Another example (by index label) from the head() of the insincere rows; the trailing note is the author's reading of it.
original_train[original_train['target'] == 1].head()['question_text'][110] # on the cusp - could be considered insincere bc "blacks" is not necessarily PC

In [None]:
# Another example (by index label) from the head() of the insincere rows; the trailing note is the author's reading of it.
original_train[original_train['target'] == 1].head()['question_text'][114] # lascivious and this is borderline incest

In [None]:
# Another example (by index label) from the head() of the insincere rows; the trailing note is the author's reading of it.
original_train[original_train['target'] == 1].head()['question_text'][115] # definitely insincere

In [None]:
# Example (by index label) from the tail() of the insincere rows; the trailing note is the author's reading of it.
original_train[original_train['target'] == 1].tail()['question_text'][1306093] # part 2 - incest

In [None]:
# Example (by index label) from the tail() of the insincere rows; the trailing note is the author's reading of it.
original_train[original_train['target'] == 1].tail()['question_text'][1306099] # racist towards pakistani people

In [None]:
# Example (by index label) from the tail() of the insincere rows; the trailing note is the author's reading of it.
original_train[original_train['target'] == 1].tail()['question_text'][1306094] # provocative and trying to make a statement

In [None]:
# Lower-case every question and split it on whitespace, keeping the two classes apart:
# split_train1 holds the insincere questions (target == 1), split_train0 the sincere ones (target == 0).
split_train1 = [
    text.lower().split()
    for text in original_train.loc[original_train['target'] == 1, 'question_text']
]
split_train0 = [
    text.lower().split()
    for text in original_train.loc[original_train['target'] == 0, 'question_text']
]

## Proportion of Different Question Types

In [None]:
# determine question type counts among insincere and sincere questions
search1 = 'why'
search2 = 'how'
search3 = 'what'
search4 = 'do'


def _bucket_by_first_word(tokenized_questions, keywords):
    """Group tokenized questions by the keyword they start with.

    tokenized_questions: iterable of token lists (one list per question).
    keywords: sequence of first words to look for.

    Returns a list with one bucket per keyword, in the same order as
    ``keywords``; each bucket holds the token lists whose first token equals
    that keyword. Questions with no tokens, or starting with any other word,
    are left out.
    """
    slot_of = {}
    for slot, word in enumerate(keywords):
        slot_of.setdefault(word, slot)  # first occurrence wins, like an if/elif chain
    buckets = [[] for _ in keywords]
    for tokens in tokenized_questions:
        if not tokens:
            # an empty or whitespace-only question has no first word to inspect
            continue
        slot = slot_of.get(tokens[0])
        if slot is not None:
            buckets[slot].append(tokens)
    return buckets


insincere_questions = _bucket_by_first_word(split_train1, [search1, search2, search3, search4])
sincere_questions = _bucket_by_first_word(split_train0, [search1, search2, search3, search4])

In [None]:
# Share of each question type within its own class.
# The class totals are derived here so the cell works on a fresh kernel.
num_insincere = len(split_train1)
num_sincere = len(split_train0)
# float() forces true division; on Python 2 int / int would truncate every share to 0
insincere_question_prop = [len(group) / float(num_insincere) for group in insincere_questions]
sincere_question_prop = [len(group) / float(num_sincere) for group in sincere_questions]
question_names = [search1, search2, search3, search4]
print(question_names)
print(insincere_question_prop)
print(sincere_question_prop)

### Plot Question Proportions

In [None]:
# Imported here because this is the first plotting cell; without it the cell
# raises NameError on a fresh kernel.
import matplotlib.pyplot as plt

# Horizontal bars: one per question type, length = share of insincere questions starting with it.
plt.barh(question_names, insincere_question_prop, color = 'teal')
plt.title('Proportion of Question Types Among Insincere Qs', fontsize = 15)
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 13)
plt.xlabel('Density', fontsize = 13)
plt.ylabel('Question', fontsize = 14)
plt.show()

In [None]:
# Horizontal bars: one per question type, length = share of sincere questions starting with it.
plt.barh(question_names, sincere_question_prop, color='orange')
axes = plt.gca()  # the axes the bars were just drawn on
axes.set_title('Proportion of Question Types Among Sincere Qs', fontsize=15)
plt.xticks(fontsize=12)
plt.yticks(fontsize=13)
axes.set_xlabel('Density', fontsize=13)
axes.set_ylabel('Question', fontsize=14)
plt.show()

#### Comment: 
People who ask insincere questions use significantly more "why" questions than any other question type. The "why" question assumes a statement to be true, which can be problematic in some cases. For many insincere "why" questions, the underlying assumption is inconclusive or is subjective, so the individual is more likely to be asserting an opinion rather than genuinely seeking the answer to a question.

In [None]:
# use this to determine common n-grams: collect the insincere questions that
# contain a given run of consecutive words
# alternative check that the words merely appear somewhere in the sentence, in any order:
#val = all(x in sublist for x in [search1])
from nltk.util import ngrams
# Kept in its own name so the question-type keywords in search1..search4 are not
# overwritten (re-running the proportions cell would otherwise mislabel its bars).
target_ngram = ('jews', 'and', 'blacks')
questions = [
    tokens
    for tokens in split_train1
    if target_ngram in ngrams(tokens, len(target_ngram))
]

In [None]:
# Number of insincere questions that matched the n-gram search above.
len(questions)

## Plot Common Word Frequencies

In [None]:
import itertools
# Flatten the per-question token lists into one long token list per class.
concatenated_split_train1 = [token for question in split_train1 for token in question]
concatenated_split_train0 = [token for question in split_train0 for token in question]
# peek at the first few insincere tokens
concatenated_split_train1[:5]

In [None]:
from nltk.corpus import stopwords
s = stopwords.words('english')
# A set gives constant-time membership tests; scanning the list for every token is needlessly slow.
stop_word_set = frozenset(s)
# filter out all stop words (e.g. pronouns, articles)
concatenated_split_train1 = [word for word in concatenated_split_train1 if word not in stop_word_set]
concatenated_split_train0 = [word for word in concatenated_split_train0 if word not in stop_word_set]

In [None]:
import matplotlib.pyplot as plt
# The 40 most common remaining tokens in insincere questions (value_counts sorts by count, largest first).
top = pd.Series(concatenated_split_train1).value_counts().iloc[:40]
plt.figure(figsize=(15, 15))
plt.title("Top 40 Most Frequent Words - Insincere Questions", fontsize=20)
plt.ylabel('Word', fontsize=17)
plt.xlabel('Count', fontsize=17)
plt.yticks(fontsize=14)
plt.xticks(fontsize=15)
top.plot(kind='barh', color='teal')
plt.show()

In [None]:
import matplotlib.pyplot as plt
# Recompute the counts from the SINCERE tokens. With this line commented out the
# chart silently re-plotted the insincere counts under a "Sincere" title.
top = pd.Series(concatenated_split_train0).value_counts()[:40] # top 40 words
plt.figure(figsize=(15,15))
plt.title("Top 40 Most Frequent Words - Sincere Questions", fontsize = 20)
plt.yticks(fontsize=14)
plt.xticks(fontsize=15)
plt.ylabel('Word', fontsize = 17)
plt.xlabel('Count', fontsize = 17)
top.plot(kind='barh', color = 'orange')
plt.show()

# Create Features for Training/Testing Data
Our input values will be constructed as follows:
* Features = columns
* Samples = rows

Let's concatenate the questions from the training and testing data so that the features are built from the entire dataset. If we built the features for each dataset separately, each would get its own vocabulary and therefore a different set of columns, and predicting on the testing data would fail because its number of columns would not match the training data.

In [None]:
# concatenate train and test
import copy  # not needed in this cell any more, but a later cell calls copy.deepcopy
X = original_train['question_text'].copy()
Z = original_test['question_text'].copy()
# pd.concat replaces Series.append, which was removed in pandas 2.0;
# ignore_index=True renumbers the rows 0..n-1 (train rows first, then test rows)
X = pd.concat([X, Z], ignore_index=True)
print(X.shape) # number of samples

In [None]:
# Parenthesised single-argument print behaves the same on Python 2 and also runs on Python 3.
print(len(Z)) # length of test dataset
print(float(len(Z))/len(X)) # proportion of the entire dataset that is testing data

In [None]:
# Parenthesised single-argument print behaves the same on Python 2 and also runs on Python 3.
print(float(len(X) - len(Z))/len(X)) # proportion of the dataset that is training data

#### Let's split our training dataset into train and validation

In [None]:
from IPython.display import Image
# Render the data-split diagram; 'datasplit.png' is resolved relative to the notebook's working directory.
Image(filename='datasplit.png')
# this is how our data will be split

In [None]:
# Rows [0, train_size) of X are used for fitting.
train_size = 1230000
print(float(train_size)/len(X)) # first 90% will be train data

# Despite its name, valid_size is the END index of the validation slice (the number of
# labelled rows), so the validation rows are X[train_size:valid_size] and the test rows follow.
valid_size = len(X) - len(Z)
print(float(valid_size)/len(X) -  float(train_size)/len(X)) # the remaining ~6% will be validation data

#### Now, create features through word counts

In [None]:
# gets word counts of all unique words in the dataset - Bag of Words Representation (order doesn't matter)
# NOTE(review): the vocabulary is fitted on X, which holds the training AND test questions,
# so the feature columns are shared by both; the result is one row per question in X.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words = 'english') # remove stop_words (e.g. the, a, in, pronouns, etc.)
X_train_counts = count_vect.fit_transform(X)

In [None]:
# (number of questions, number of distinct words); parenthesised so it also runs on Python 3
print(X_train_counts.shape)

In [None]:
# Re-weights the raw word counts with TF-IDF.
# NOTE(review): with TfidfTransformer's documented defaults (use_idf=True, norm='l2') the values are
# idf-weighted and each row is L2-normalised, so they are not plain per-question word frequencies,
# and the idf part does depend on the whole dataset - confirm against the scikit-learn version in use.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape # the shape here is the (num_samples, num_features) where num_features == unique words

In [None]:
# The raw text of the first question, before vectorisation.
X[0] # initial format of question

In [None]:
# The same question as a single row of the TF-IDF matrix.
X_train_tfidf[0] # 1 sample - the question converted into a vector of counts and then a vector of word frequencies 

# Cross Validation

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_validate
# Metrics computed on every fold; passed to cross_validate below instead of repeating the list inline.
scorer = ['accuracy', 'precision', 'recall', 'f1']
algorithm = BernoulliNB()
num_splits = 5
# Only the first len(X) - len(Z) rows of the feature matrix are labelled training
# questions; the rows after them belong to the unlabelled test set.
cv_results = cross_validate(algorithm, X_train_tfidf[:len(X) - len(Z)], original_train['target'], scoring = scorer, cv = num_splits, return_train_score = True)

In [None]:
# List which timing/score entries cross_validate returned.
cv_results.keys()

In [None]:
# Average each test metric over the folds. The divisor comes from the number of
# scores actually returned, so it cannot drift from the cv setting used above.
for label, key in (("Accuracy", 'test_accuracy'),
                   ("Precision", 'test_precision'),
                   ("Recall", 'test_recall'),
                   ("Harmonic Mean", 'test_f1')):
    fold_scores = cv_results[key]
    print(label + ": " + str(sum(fold_scores)/len(fold_scores)))

# Hihi! Don't worry about anything below here - I'm still making changes:)

# Fit Model 
#### Bernoulli Naive Bayes Classifier 
Note: I used a BernoulliNB Classifier since other classifiers have a much longer run time or have poorer performance.

In [None]:
# This function displays the metrics for evaluation of the model.
from sklearn import metrics
def results(expected, actual):
    """Print evaluation metrics for a set of predictions.

    expected: the true target values (1 = insincere, 0 = sincere).
    actual:   the predicted target values, aligned position-by-position with
              ``expected``; must support element-wise comparison (e.g. a numpy array).

    Prints accuracy, recall, precision and their harmonic mean (F1), followed by
    class counts and the number of false negatives. Returns nothing.
    """
    # false negatives: predicted sincere (0) where the prediction disagrees with the truth
    fn_count = int(((actual != expected) & (actual == 0)).sum())
    accuracy = metrics.accuracy_score(expected, actual)
    recall = metrics.recall_score(expected, actual)
    precision = metrics.precision_score(expected, actual)
    # When nothing is predicted insincere both precision and recall are 0 and the
    # usual formula is 0/0; report 0.0 in that case instead of nan or an exception.
    denominator = precision + recall
    if denominator:
        harmonic_mean = 2 * (precision * recall) / denominator
    else:
        harmonic_mean = 0.0

    print('Accuracy Score: ' + str(accuracy)) # accuracy score based on our validation data
    print('Recall Score: ' + str(recall))
    print('Precision Score: ' + str(precision))
    print('Harmonic Mean: ' + str(harmonic_mean) + '\n')
    print('Expected Insincere: ' + str(len(expected[expected == 1]))) # insincere
    print('Actual Number Insincere: ' + str(len(actual[actual == 1]))) # predicted insincere
    print('Number of False Negatives: ' + str(fn_count))
    print('Actual Number Sincere: ' + str(len(actual[actual == 0]))) # predicted sincere
    print('Total: ' + str(len(expected)))

In [None]:
# This function fits the data to a model and yields fitted values from the model
def model(alg, x, y, training_size, validation_size):
    """Fit ``alg`` on the training rows and predict the validation and test rows.

    alg:             an estimator exposing ``fit`` and ``predict``.
    x:               feature matrix whose rows are ordered train, validation, test.
    y:               labels used for fitting; only ``y[:training_size]`` is read.
    training_size:   rows ``[0, training_size)`` are used for fitting.
    validation_size: END index of the validation slice, i.e. the validation rows
                     are ``x[training_size:validation_size]``.

    Prints the validation metrics via ``results`` and returns the tuple
    ``(validation_predictions, test_predictions)``.

    The metrics are always scored against the notebook-level
    ``original_train['target']`` rather than ``y``, so that a model fitted on
    substitute labels is still judged against the real ones.
    """
    classifier = alg.fit(x[:training_size], y[:training_size])
    validation = classifier.predict(x[training_size:validation_size])
    # Predict from the matrix that was passed in rather than from the global
    # X_train_tfidf, so the function honours its own ``x`` argument.
    predicted = classifier.predict(x[validation_size:]) # no target data available
    results(original_train['target'][training_size:validation_size], validation)
    return validation, predicted

In [None]:
# This function displays some of the sample questions and their predicted target values
import numpy as np
def display_samples(question_type, x_data, v_size, pred, max_samples=10):
    """Print a few questions together with their predicted target values.

    question_type: truthy to show questions predicted as 1, falsy for those predicted as 0.
    x_data:        indexable collection of question texts.
    v_size:        offset added to a position in ``pred`` to find its question in ``x_data``.
    pred:          predicted target values.
    max_samples:   maximum number of samples to print (default 10).
    """
    pred = np.asarray(pred)  # allow plain lists as well as arrays
    wanted = 1 if question_type else 0
    matching = np.where(pred == wanted)[0]
    print('Here are a few samples with their target values.')
    # slicing caps the output at max_samples; the old `count > 10` check let an 11th sample through
    for i in matching[:max_samples]:
        print(x_data[v_size + i])
        print(pred[i])

#### Base Model 
For the base model, let's set all target values to 0, where 0 is the class for sincere questions, since the majority of questions are sincere. You'll see that the number of insincere predicted values is 0. This is because we didn't train on any insincere values. Also, the recall and precision scores are all 0. This is because we did not train on any data where the target value == 1, so it's not possible to have a true positive (value == 1), which is the numerator of the recall and precision scores. 
* recall = TP / (TP + FN)
* precision = TP / (TP + FP)

In [None]:
from sklearn.naive_bayes import BernoulliNB
algorithm = BernoulliNB()
# Baseline labels: one zero ("sincere") for every training row.
y_base = [label * 0 for label in original_train['target'][:train_size]]
y_valid, y_pred = model(algorithm, X_train_tfidf, y_base, train_size, valid_size)

#### Model With Observed Data
Now, let's see how the model performs on our observed data.

In [None]:
from sklearn.naive_bayes import BernoulliNB
algorithm = BernoulliNB()
# Fit on the real labels this time; the metrics printed should each be as close to 1 as possible.
y_valid, y_pred = model(
    alg=algorithm,
    x=X_train_tfidf,
    y=original_train['target'],
    training_size=train_size,
    validation_size=valid_size,
)

In [None]:
# This function displays questions which were predicted to be sincere but Quora deems them to be insincere 
def display_false_negatives(expected, actual, input_questions, index):
    """Print one question that was predicted sincere (0) although its label differs.

    expected:        true target values.
    actual:          predicted target values, aligned position-by-position with ``expected``.
    input_questions: the question texts, aligned with ``expected`` and ``actual``.
    index:           which false negative to print (0 is the first; negative values count from the end).

    Raises IndexError when ``index`` does not select an existing false negative,
    including the case where there are none at all.
    """
    expected_arr = np.asarray(expected)
    actual_arr = np.asarray(actual)
    fn_indices = np.where((actual_arr != expected_arr) & (actual_arr == 0))[0]
    if not -len(fn_indices) <= index < len(fn_indices):
        raise IndexError('false-negative index %d is out of range: only %d false negative(s) found'
                         % (index, len(fn_indices)))
    print(list(input_questions)[fn_indices[index]])

In [None]:
# Print one validation question that the model predicted as sincere although its label differs;
# all three inputs are sliced to the validation rows [train_size, valid_size).
display_false_negatives(original_train['target'][train_size:valid_size], y_valid, X[train_size:valid_size], index = 0)
# change the index to see different samples

#### Display Results
Let's see some of our samples from the testing data (includes input and output values)

In [None]:
# display insincere samples
# (a truthy first argument selects test questions predicted as 1; valid_size is the offset of the first test row in X)
display_samples(1, X, valid_size, y_pred)

In [None]:
# display sincere samples
# (a falsy first argument selects test questions predicted as 0; valid_size is the offset of the first test row in X)
display_samples(0, X, valid_size, y_pred)

# Future Work

#### To improve performance:

TODO: cross validate the data by training and testing on different chunks of the data
* example: x = [1, 2, 3, 4, 5, 6, 7, 8, 9 , 10]
    define validation size = 2
    run the model on the following splits and compare the results
    * train = x[0:8]; validation = x[8:10]
    * train = x[1:9]; validation = [x[0], x[9]]
    * train = x[2:10], validation = [x[0:2]]
    * validation = [x[0], x[8]]; train = the rest
    * validation = [x[1], x[4]]; train = the rest
    * finish for all combinations

The final accuracy = the average of the accuracies from each split.
It's the same for recall and precision.

In [None]:
# TODO: create features with n-grams (instead of using words as feature columns, use phrases with n words)

In [None]:
# TODO: try different models if the runtime isn't too long
# try SVM, it generally has good performance for most classification problems

#### To explore and visualize the data:

In [None]:
# bigram & trigram plots

In [None]:
# implement ROC curve for each of our cross validation datasets

In [None]:
# how many insincere/sincere questions contain a particular word?

In [None]:
# which words produce the highest precision? recall? as in search for all the sentences with 'blacks' - call them insincere

In [None]:
# TODO: group questions into themes: e.g. racism, sexism, homophobic, sexuality, religious intolerance, 
# prejudice towards immigrants, unproductive political criticism
# Visualize this data
# Show percentages of each type

# what we define these themes to be: 
    # look for all insincere questions that include a word denoting an ethnic group (e.g. blacks, jews, muslims)
    # look for all insincere questions that include a word denoting a sex(e.g. woman, women, man, men)
    # look for all insincere questions that include a word denoting a sexual preference(e.g gay, homo, fag, trans, transgender, LGBTQ)
    # look for all insincere questions that include a word denoting a religious group*( e.g. religious people, jews, muslims, christians)
    # look for all insincere questions that include a word denoting a sexual term (sexy, gay, kiss, etc.)
    # look for all insincere questions that include a word denoting immigrants (immigrants, fabs, fresh off the boat)
    # look for all insincere questions that include word(s) or phrase(s) denoting a political topic (liberals, libs, libtards, democrats, republicans, conservatives, nazis, neo-nazis, president's name, political commentators, etc.)

In [None]:
# larger question for discussion section: although people make such discriminatory remarks, should we censor the online public?
# what are the implications of online censorship? 
    # Argument against censorship: people have dark, deep thoughts that may be contentious but perhaps they truly believe 
    # those thoughts and are genuinely curious. The online public could offer a medium to express and discuss those thoughts. 
    # One key advantage of online forums is the option to be anonymous, which makes one more open to share dark ideas. 