In [1]:
# Perform necessary imports and load training dataset
import numpy as np
import pandas as pd
df_data = pd.read_csv('train.csv')
df_data.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [2]:
# Drop question ID column as it does not provide useful data
del df_data['qid']
df_data.head()

Unnamed: 0,question_text,target
0,How did Quebec nationalists see their province...,0
1,"Do you have an adopted dog, how would you enco...",0
2,Why does velocity affect time? Does velocity a...,0
3,How did Otto von Guericke used the Magdeburg h...,0
4,Can I convert montra helicon D to a mountain b...,0


In [3]:
# Check dataset size
len(df_data)

1306122

In [4]:
# Get equal amounts sincere and insincere questions and create new dataframe
sincere = df_data[df_data['target'] == 0].head(1000)
insincere = df_data[df_data['target'] == 1].head(1000)
df_data = pd.concat([sincere, insincere])
df_data

Unnamed: 0,question_text,target
0,How did Quebec nationalists see their province...,0
1,"Do you have an adopted dog, how would you enco...",0
2,Why does velocity affect time? Does velocity a...,0
3,How did Otto von Guericke used the Magdeburg h...,0
4,Can I convert montra helicon D to a mountain b...,0
...,...,...
15906,Where can I find Trump toilet paper to wipe my...,1
15928,Why are Pakistani citizens so ignorant? Are th...,1
15929,Will the US Navy ever admit to utilizing Navy ...,1
15973,"Which girls are more pretty, Punjabi or Gujarati?",1


In [5]:
# Check for any missing values
df_data.isnull().sum().sum()

0

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorize and remove stop words
vectorizer = CountVectorizer(ngram_range=(1,3), stop_words='english')
X = vectorizer.fit_transform(df_data['question_text'])
bag_of_words = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names())
bag_of_words

Unnamed: 0,00,00 morning,000,000 reading,000 reading white,000 year,000 years,000 years mid,000 years old,000 years religious,...,zone,zoosadism,zoroastrians,zoroastrians living,zoroastrians living peacefully,zuckerberg,zuckerberg really,zuckerberg really half,zx,zx 10r
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
from sklearn.model_selection import train_test_split
# Separate into training and testing datasets, 80/20
train_x, test_x, train_y, test_y = train_test_split(bag_of_words, df_data['target'], test_size=0.2, random_state=523)
# Check lengths
print(len(train_x))
print(len(train_y))
print(len(test_x))
print(len(test_y))

1600
1600
400
400


In [8]:
from sklearn.feature_selection import *
# Compute the f value and p value of the chi-squared test between each attribute and the class
f_val, p_val = chi2(train_x, train_y)

# Print the chi-squared values and p values
df_scores = pd.DataFrame(zip(train_x.columns, f_val, p_val), columns=['feature', 'chi2', 'p'])
df_scores['chi2'] = df_scores['chi2'].round(2)
df_scores['p'] = df_scores['p'].round(3)
print(df_scores.sort_values('chi2', ascending=False))

# Use features with p < 0.05
sel_cols = df_scores[df_scores['p']<0.05]['feature'].values
print('\nSelected features: %d' % len(sel_cols))
print(sel_cols)

                      feature   chi2    p
18488                  people  76.92  0.0
25448                   trump  69.67  0.0
2671                     best  39.78  0.0
27340                   women  33.16  0.0
16893                 muslims  26.70  0.0
...                       ...    ...  ...
27867                    zero    NaN  NaN
27868          zero integrity    NaN  NaN
27869  zero integrity honesty    NaN  NaN
27887                      zx    NaN  NaN
27888                  zx 10r    NaN  NaN

[27889 rows x 3 columns]

Selected features: 253
['100' '2017' '2018' 'account' 'admit' 'african' 'ago' 'aliens' 'america'
 'american' 'americans' 'animals' 'answers' 'anti' 'appear' 'apply' 'aren'
 'asian' 'ass' 'atheist' 'atheists' 'aunt' 'away' 'believe' 'best' 'bike'
 'bitcoin' 'black' 'black people' 'blacks' 'blind' 'blood' 'body' 'boys'
 'branch' 'breaks' 'called' 'campaign' 'car' 'cars' 'causes' 'chances'
 'characteristics' 'child' 'children' 'chinese' 'christians' 'clinton'
 'cold' 

In [9]:
# Output significant features, sorted by p value, to csv
df_scores[df_scores['p'] < 0.05].sort_values('p').to_csv(path_or_buf='df_scores.csv')

In [10]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

classifier = []
fs = []
acc = []
prec = []
rec = []
f1 = []

# Function to train a given model with/without feature selection, test model, and store results in dataframe
def train(clf, fs_flag, clf_name):
    if (fs_flag):
        model = clf.fit(train_x[sel_cols], train_y)
        pred_y = model.predict(test_x[sel_cols])
        fs.append('yes')
    else:
        model = clf.fit(train_x, train_y)
        pred_y = model.predict(test_x)
        fs.append('no')
    classifier.append(clf_name)
    acc.append(round(accuracy_score(pred_y, test_y), 2))
    prec.append(round(precision_score(pred_y, test_y), 2))
    rec.append(round(recall_score(pred_y, test_y), 2))
    f1.append(round(f1_score(pred_y, test_y), 2))

In [11]:
# k-nearest neighbors
from sklearn.neighbors import KNeighborsClassifier
# With feature selection
clf = KNeighborsClassifier()
train(clf, True, 'k-nearest neighbors')
# With all features
clf = KNeighborsClassifier()
train(clf, False, 'k-nearest neighbors')

# Decision tree
from sklearn import tree
# With feature selection
clf = tree.DecisionTreeClassifier()
train(clf, True, 'decision tree')
# With all features
clf = tree.DecisionTreeClassifier()
train(clf, False, 'decision tree')

# Naive bayes
from sklearn.naive_bayes import MultinomialNB
# With feature selection
clf = MultinomialNB()
train(clf, True, 'naive bayes')
# With all features
clf = MultinomialNB()
train(clf, False, 'naive bayes')

# Support vector machine
from sklearn import svm
# With feature selection
clf = svm.SVC()
train(clf, True, 'support vector machine')
# With all features
clf = svm.SVC()
train(clf, False, 'support vector machine')

# print all scores
scores = pd.DataFrame({'classifier': classifier, 'feature selection': fs, 'accuracy': acc, 'precision': prec, 'recall': rec, 'f1':f1})
scores

Unnamed: 0,classifier,feature selection,accuracy,precision,recall,f1
0,k-nearest neighbors,yes,0.68,0.47,0.84,0.6
1,k-nearest neighbors,no,0.49,0.01,1.0,0.03
2,decision tree,yes,0.74,0.62,0.85,0.72
3,decision tree,no,0.79,0.74,0.84,0.79
4,naive bayes,yes,0.79,0.79,0.8,0.8
5,naive bayes,no,0.74,0.71,0.77,0.74
6,support vector machine,yes,0.8,0.76,0.84,0.8
7,support vector machine,no,0.73,0.72,0.75,0.73


In [12]:
# Sort by F1
sorted_results = scores.sort_values(by='f1', ascending=False)
sorted_results

Unnamed: 0,classifier,feature selection,accuracy,precision,recall,f1
4,naive bayes,yes,0.79,0.79,0.8,0.8
6,support vector machine,yes,0.8,0.76,0.84,0.8
3,decision tree,no,0.79,0.74,0.84,0.79
5,naive bayes,no,0.74,0.71,0.77,0.74
7,support vector machine,no,0.73,0.72,0.75,0.73
2,decision tree,yes,0.74,0.62,0.85,0.72
0,k-nearest neighbors,yes,0.68,0.47,0.84,0.6
1,k-nearest neighbors,no,0.49,0.01,1.0,0.03


In [13]:
# Output results to csv
sorted_results.to_csv(path_or_buf='sorted_results.csv')