In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm import tqdm

In [2]:
# Load the training and test CSV files from the current working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Extract the underlying array of raw question strings from each frame.
train_q, test_q = (
    frame['question_text'].values for frame in (train_data, test_data)
)

In [4]:
import nltk
from nltk.corpus import stopwords

In [5]:
# Build the English stop-word set used by the filtering cells further down.
# The corpus reader is imported here under a private alias so that binding the
# name `stopwords` to a set does not destroy the only handle on the reader:
# with `stopwords = set(stopwords.words('english'))` a second run of this cell
# fails with AttributeError because `stopwords` is already a set by then.
from nltk.corpus import stopwords as _stopwords_corpus
stopwords = set(_stopwords_corpus.words('english'))

In [6]:
def _tokenize_lowercased(questions):
    """Lower-case every question and split it into tokens with ``word_tokenize``.

    Progress is reported through ``tqdm``; the result is one token list per
    input question, in input order.
    """
    tokenized = []
    for ques in tqdm(questions):
        tokenized.append(word_tokenize(ques.lower()))
    return tokenized


train_q_tokenized = _tokenize_lowercased(train_q)
test_q_tokenized = _tokenize_lowercased(test_q)

100%|██████████| 1306122/1306122 [02:36<00:00, 8358.53it/s]
100%|██████████| 56370/56370 [00:06<00:00, 8342.22it/s]


In [7]:
def _drop_stopwords(tokenized_questions):
    """Return each token list with every token found in ``stopwords`` removed."""
    kept = []
    for tokens in tqdm(tokenized_questions):
        kept.append([token for token in tokens if token not in stopwords])
    return kept


filtered_train = _drop_stopwords(train_q_tokenized)
filtered_test = _drop_stopwords(test_q_tokenized)

100%|██████████| 1306122/1306122 [00:05<00:00, 229598.37it/s]
100%|██████████| 56370/56370 [00:00<00:00, 472348.14it/s]


In [8]:
# Re-assemble each filtered token list into a single space-separated string.
train_ques = list(map(" ".join, filtered_train))
test_ques = list(map(" ".join, filtered_test))

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
# One TF-IDF vectorizer per maximum n-gram length, 1 through 6. Each one covers
# n-grams from length 1 up to its maximum and is capped at 300000 features.
(
    vectorizer_uni,
    vectorizer_bi,
    vectorizer_tri,
    vectorizer_4gram,
    vectorizer_5gram,
    vectorizer_6gram,
) = (
    TfidfVectorizer(ngram_range=(1, max_n), max_features=300000)
    for max_n in range(1, 7)
)

In [21]:
def _fit_on_train(vectorizer):
    """Fit ``vectorizer`` on ``train_ques`` and return the transformed matrix."""
    return vectorizer.fit_transform(train_ques)


# The matrices keep their "BoW" names although they come from TF-IDF vectorizers.
BoW_unigram = _fit_on_train(vectorizer_uni)
BoW_bigram = _fit_on_train(vectorizer_bi)
BoW_trigram = _fit_on_train(vectorizer_tri)
BoW_4gram = _fit_on_train(vectorizer_4gram)
BoW_5gram = _fit_on_train(vectorizer_5gram)
BoW_6gram = _fit_on_train(vectorizer_6gram)

In [22]:
from sklearn.decomposition import TruncatedSVD
# SVD reducer used for every feature matrix: 25 components, n_iter=100.
svd = TruncatedSVD(
    n_components=25,
    n_iter=100,
)

In [26]:
def _reduce_with_own_svd(bow_matrix):
    """Fit a fresh SVD, configured exactly like ``svd``, on ``bow_matrix``.

    Returns ``(fitted_model, reduced_matrix)``.

    Calling ``svd.fit_transform`` on six different matrices refits the same
    estimator each time, so only the last projection survives and the earlier
    feature spaces can never be applied to new data (e.g. the test questions).
    Building one model per matrix keeps every fitted projection available.
    """
    model = TruncatedSVD(**svd.get_params())
    return model, model.fit_transform(bow_matrix)


# NOTE: the shared `svd` object is used only as a parameter template here and
# is left unfitted; use the per-n-gram models below for `.transform(...)`.
svd_uni, truncated_uni_bow = _reduce_with_own_svd(BoW_unigram)
svd_bi, truncated_bi_bow = _reduce_with_own_svd(BoW_bigram)
svd_tri, truncated_tri_bow = _reduce_with_own_svd(BoW_trigram)
svd_4gram, truncated_4_bow = _reduce_with_own_svd(BoW_4gram)
svd_5gram, truncated_5_bow = _reduce_with_own_svd(BoW_5gram)
svd_6gram, truncated_6_bow = _reduce_with_own_svd(BoW_6gram)

In [19]:
import numpy as np
# (rows, columns) of the unigram matrix.
BoW_unigram.shape

(1306122, 194974)

In [27]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [29]:
# Shuffle the tri-gram features and their labels together.
# A single index permutation is applied to both arrays, which keeps rows and
# labels aligned without materialising and shuffling a Python list of
# (row, label) tuples. The permutation is drawn from a seeded generator so the
# split taken from X / Y further down is the same on every run.
# X and Y are NumPy arrays (previously lists); slicing, zip() and arithmetic
# against predictions work on them unchanged.
targets = train_data['target'].values
shuffle_idx = np.random.RandomState(42).permutation(len(targets))
X = truncated_tri_bow[shuffle_idx]
Y = targets[shuffle_idx]

In [30]:
def _new_mlp():
    """Build one unfitted MLP: three hidden layers of 100 units, ReLU, Adam, initial learning rate 0.0001."""
    return MLPClassifier(
        hidden_layer_sizes=(100, 100, 100),
        activation='relu',
        solver='adam',
        learning_rate_init=0.0001,
    )


# Six identically configured, independent classifiers.
nn_class1, nn_class2, nn_class3, nn_class4, nn_class5, nn_class6 = (
    _new_mlp() for _ in range(6)
)

In [None]:
# Only nn_class3 is fitted here; nn_class1, 2, 4, 5 and 6 are left unfitted by
# this cell. Training uses the first 1,000,000 rows of X / Y.
train_rows = slice(None, 1000000)
mlp_classifier3 = nn_class3.fit(X[train_rows], Y[train_rows])

In [None]:
# 80% of the number of labelled training rows.
0.8 * len(train_data['target'])

In [None]:
import pickle as pkl

# Persist the two partitions of the shuffled data as lists of
# (features, label) pairs: the first 1,000,000 rows and the remainder.
# Each file is opened in a `with` block so the handle is flushed and closed
# even if pickling fails; passing a bare open(...) to pkl.dump never closes it.
with open('10k_train.pkl', 'wb') as train_file:
    pkl.dump(list(zip(X[:1000000], Y[:1000000])), train_file)
with open('10k_test.pkl', 'wb') as test_file:
    pkl.dump(list(zip(X[1000000:], Y[1000000:])), test_file)

In [None]:
# Save the fitted classifier. The name `mlp_classifier` is never assigned in
# the cells shown in this notebook (the only fitted model is `mlp_classifier3`),
# so dumping it raised NameError on a fresh kernel. The file is opened in a
# `with` block so the handle is closed once the dump finishes.
with open('model_10k_mlp_100100100.pkl', 'wb') as model_file:
    pkl.dump(mlp_classifier3, model_file)

In [None]:
# Predict with the model that was actually fitted (`mlp_classifier3`); the name
# `mlp_classifier` is not assigned in the cells shown in this notebook.
# These are the same first 1,000,000 rows the model was trained on, so any
# error computed from `y_pred` is a training error, not a held-out one.
y_pred = mlp_classifier3.predict(X[:1000000])

In [None]:
# Fraction of the first 1,000,000 rows whose prediction differs from the label.
labels = Y[:1000000]
n_mismatched = np.count_nonzero(y_pred - labels)
error = n_mismatched / len(labels)

In [None]:
# Summary statistics of the label column.
train_data.loc[:, 'target'].describe()

In [None]:
# Split the training frame by label value: 0 -> sincere_q, 1 -> insincere_q.
target_col = train_data['target']
sincere_q = train_data[target_col == 0]
insincere_q = train_data[target_col == 1]

In [None]:
# Report how many rows fall in each class.
print(f"sincere questions: {len(sincere_q)}")
print(f"insincere questions: {len(insincere_q)}")

In [None]:
# Number of rows with target == 0.
sincere_q.shape[0]

In [None]:
from nltk import ngrams, FreqDist

In [None]:
# Punctuation tokens that are discarded before counting n-grams.
_PUNCTUATION = (',', '.', '?', ';', '!')


def _tokens_without_punctuation(text):
    """Lower-case ``text``, tokenize it with ``cust_tokenize`` and drop the tokens listed in ``_PUNCTUATION``."""
    # NOTE(review): `cust_tokenize` is not defined in the cells shown in this
    # notebook - confirm where it comes from before running on a fresh kernel.
    return [tok for tok in cust_tokenize(text.lower()) if tok not in _PUNCTUATION]


# Every question of a class is concatenated into one space-separated string
# before tokenizing, so n-grams longer than one token can span the boundary
# between two adjacent questions.
all_insincere = " ".join(insincere_q['question_text'].values)
all_sincere = " ".join(sincere_q['question_text'].values)
in_tokenized = _tokens_without_punctuation(all_insincere)
sin_tokenized = _tokens_without_punctuation(all_sincere)

# Frequency distributions of 1- to 4-grams for each class.
(
    sincere_uni_counts,
    sincere_bi_counts,
    sincere_tri_counts,
    sincere_4gram_counts,
) = (FreqDist(ngrams(sin_tokenized, n)) for n in range(1, 5))
(
    insincere_uni_counts,
    insincere_bi_counts,
    insincere_tri_counts,
    insincere_4gram_counts,
) = (FreqDist(ngrams(in_tokenized, n)) for n in range(1, 5))

In [None]:
in_keys = insincere_uni_counts.keys()
sin_keys = sincere_uni_counts.keys()


def _prune_unigrams(counts, keys):
    """Delete from ``counts`` every unigram whose token has length below 2 or is in ``stopwords``."""
    # Iterate over a snapshot of the keys so entries can be deleted in the loop.
    for key in list(keys):
        token = key[0]
        if len(token) < 2 or token in stopwords:
            del counts[key]


_prune_unigrams(insincere_uni_counts, in_keys)
_prune_unigrams(sincere_uni_counts, sin_keys)

In [None]:
# Display the unigram frequency distribution for sincere questions.
sincere_uni_counts

In [None]:
# Display the bigram frequency distribution for sincere questions.
sincere_bi_counts

In [None]:
# Display the trigram frequency distribution for sincere questions.
sincere_tri_counts

In [None]:
# Display the 4-gram frequency distribution for sincere questions.
sincere_4gram_counts

In [None]:
# Display the unigram frequency distribution for insincere questions.
insincere_uni_counts

In [None]:
# Display the bigram frequency distribution for insincere questions.
insincere_bi_counts

In [None]:
# Display the trigram frequency distribution for insincere questions.
insincere_tri_counts

In [None]:
# Display the 4-gram frequency distribution for insincere questions.
insincere_4gram_counts