In [1]:
#The Binary One (term presence)
import pandas as pd 
import numpy as np
import nltk
import re

#imports stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jhbow\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#The instructor told use to just use the training data set and use the highest amount of data our systems could handle.
#I used the first 30,000 rows of the train data set.
data_file = "C:\\Users\\jhbow\\Desktop\\0-senProj\\train30k.csv"

#create data frame
df_data = pd.read_csv(data_file, sep=",", encoding ='utf-8')

#a look at the data types
df_data.dtypes

id               int64
qid1             int64
qid2             int64
question1       object
question2       object
is_duplicate     int64
dtype: object

In [3]:
#show a visual rep of the first 50 rows of data frame
df_data.head(50)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


In [4]:
#creates a column of both questions combined seperated with a space
df_data["both"]=df_data["question1"]+str(" ")+df_data["question2"]

#show data frame with the new "both" column
df_data

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,both
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,What is the step by step guide to invest in sh...
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,How can I increase the speed of my internet co...
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,Why am I mentally very lonely? How can I solve...
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"Which one dissolve in water quikly sugar, salt..."
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1,Astrology: I am a Capricorn Sun Cap moon and c...
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0,Should I buy tiago? What keeps childern active...
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1,How can I be a good geologist? What should I d...
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0,When do you use シ instead of し? When do you us...
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0,Motorola (company): Can I hack my Charter Moto...


In [5]:
# Creating a 'bag of words' with a count vectorizer this will extract every word used in the questions into an array. 
# You can see I am using a binary parameter which will give everything a 1 or 0.
# So in the dense matrix for every "both" varible, it will check each word and if that term
# is present in in that row it will place 1 if not it will return 0.
# You can also see I used other parameters. These are for cleaning to make it more efficient.
# the stop_words parameter ignores common words such as the, he, she, what, how, etc.
# the other ones were to get rid of non-English text
vectorizer = CountVectorizer(binary=True, stop_words='english', encoding='utf-8', strip_accents='ascii')
x = vectorizer.fit_transform(df_data['both'].values.astype('U'))
df_tf = pd.DataFrame(x.todense(), columns=vectorizer.get_feature_names())
df_tf

Unnamed: 0,00,000,000k,0021,003sc,008,01,013j,02,03,...,zoroastrian,zr,ztc,zte,zuckerberg,zuma,zx,zygote,zyl,zynga
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
#Partitioning the data set for testing
train_x, test_x, train_y, test_y = train_test_split(df_tf, df_data["is_duplicate"], test_size=0.3, random_state=123)


# convert numpy arrays to data frames
df_train_x = pd.DataFrame(train_x, columns=df_tf.columns)
df_test_x = pd.DataFrame(test_x, columns=df_tf.columns)
df_train_y = pd.DataFrame(train_y, columns=["is_duplicate"])
df_test_y = pd.DataFrame(test_y, columns=["is_duplicate"])

print "shapes"
print df_train_x.shape
print df_test_x.shape
print df_train_y.shape
print df_test_y.shape
print 
print "class counts"
print df_data["is_duplicate"].value_counts()
print df_train_y["is_duplicate"].value_counts()
print df_test_y["is_duplicate"].value_counts()

shapes
(21000, 25776)
(9000, 25776)
(21000, 1)
(9000, 1)

class counts
0    18843
1    11157
Name: is_duplicate, dtype: int64
0    13169
1     7831
Name: is_duplicate, dtype: int64
0    5674
1    3326
Name: is_duplicate, dtype: int64


In [8]:
#Modeling the data and prediction
#Using the Logistic Regression model
clf = LogisticRegression()
clf = clf.fit(df_train_x, train_y)
pred_y = clf.predict(df_test_x)
print ("f1:" + str(f1_score(pred_y, test_y)))
print ("accuracy:" + str(accuracy_score(pred_y, test_y)))
print ("precision:" + str(precision_score(pred_y, test_y)))
print ("recall:" + str(recall_score(pred_y, test_y)))



f1:0.5878442545109213
accuracy:0.7106666666666667
precision:0.55832832230908
recall:0.6206550802139037


In [None]:
#I tried many different methods and combos of preprocessing and models and this was the best results.
#Due to time constraints, I was unable to improve it further.
#Perhaps, a little more cleaning could be done to yield even better results.