# 元データ読み込み

In [4]:
import pandas as pd

# Load the raw question-pair CSV files.
train_data = pd.read_csv("/Users/user/documents/quora/train.csv")
test_data = pd.read_csv("/Users/user/documents/quora/test.csv")

# DataFrame.fillna returns a new frame by default, so the result has to be
# re-bound; otherwise the missing cells are left untouched.
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

# Empty frames that the feature cells below fill in column by column.
train_df = pd.DataFrame()
test_df = pd.DataFrame()

# questionの文の長さを比較

In [7]:
# Feature "word_len": absolute difference between the character lengths of
# question1 and question2 in each row. A falsy cell counts as length 0; any
# other cell is measured after conversion with str().
# NOTE(review): iterating the columns directly visits the same cells as the
# label lookup [0..len-1] as long as the frames keep the default integer index
# that read_csv produces.
train_word_len = [
    abs(
        (len(str(first)) if first else 0)
        - (len(str(second)) if second else 0)
    )
    for first, second in zip(train_data['question1'], train_data['question2'])
]

test_word_len = [
    abs(
        (len(str(first)) if first else 0)
        - (len(str(second)) if second else 0)
    )
    for first, second in zip(test_data['question1'], test_data['question2'])
]

train_df['word_len'] = train_word_len
test_df['word_len'] = test_word_len

# 文章の単語の一致率

In [66]:
from nltk.corpus import stopwords

def read_txt(file):
    """Read a text file and return its lines without newline characters.

    Args:
        file: Path of the text file to read.

    Returns:
        A list with one string per line, in file order.
    """
    with open(file, 'r') as f:
        # str.replace returns a new string; the stripped value must be the
        # one that is collected.
        return [line.replace('\n', "") for line in f]

_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if "english" not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORD_CACHE["english"]


def _content_words(text, stop_words):
    """Return the distinct lower-cased whitespace tokens of text not in stop_words."""
    return {word for word in str(text).lower().split() if word not in stop_words}


def match_ratio(t1, t2, stop_words=None):
    """Return the share of distinct non-stop-words that two texts have in common.

    The score is ``2 * |common| / (|words1| + |words2|)`` computed over the
    distinct, lower-cased, whitespace-separated tokens of each text after
    stop words are removed.

    Args:
        t1: First text; converted with ``str()`` before tokenising.
        t2: Second text; converted with ``str()`` before tokenising.
        stop_words: Optional container of words to ignore. When omitted, the
            NLTK English stop-word list is used (loaded once and cached).

    Returns:
        A float in [0, 1], or 0 when either text has no remaining words.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    # Tokenise the text itself; wrapping the token *list* in str() would make
    # the loop walk over single characters of its repr.
    q1 = _content_words(t1, stop_words)
    q2 = _content_words(t2, stop_words)
    if not q1 or not q2:
        return 0
    return 2 * len(q1 & q2) / (len(q1) + len(q2))

# Feature "word_match": match_ratio of every two consecutive lines of the
# stemmed text files.
# NOTE(review): presumably the files hold question1 / question2 on alternating
# lines -- confirm against the script that writes them.
train_text = read_txt("stem_train.txt")
test_text = read_txt("stem_test.txt")

# Pulling twice from one shared iterator yields (line0, line1), (line2, line3), ...
# and silently drops a trailing unpaired line.
train_lines = iter(train_text)
train_match = [
    match_ratio(first, second)
    for first, second in zip(train_lines, train_lines)
]
train_df["word_match"] = train_match

test_lines = iter(test_text)
test_match = [
    match_ratio(first, second)
    for first, second in zip(test_lines, test_lines)
]
test_df["word_match"] = test_match

# 文章のtfidf値のcos類似度

In [8]:
import pickle
import numpy as np

def pickle_load(file):
    """Deserialize and return the object stored in the pickle file at ``file``.

    NOTE(review): unpickling can execute arbitrary code; only use this on
    files you produced yourself.
    """
    with open(file, 'rb') as handle:
        return pickle.load(handle)

def cos_sim(v1, v2):
    """Return the cosine similarity of two sparse vectors.

    Args:
        v1: Object exposing parallel ``indices`` and ``data`` sequences
            (presumably a single-row scipy.sparse matrix taken from the
            TF-IDF pickle -- confirm against the code that writes it).
        v2: Same kind of object as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or 0 when either norm is zero.
    """
    data1 = np.asarray(v1.data, dtype=float)
    data2 = np.asarray(v2.data, dtype=float)

    # Take the square root once, after the squares have been summed.
    norm_product = np.sqrt(np.dot(data1, data1)) * np.sqrt(np.dot(data2, data2))
    if not norm_product:
        return 0

    # Index v2 by column so the dot product is one pass over v1 instead of
    # comparing every entry pair. Values are accumulated so repeated indices
    # contribute exactly as they would in a pairwise comparison.
    weights2 = {}
    for index, value in zip(v2.indices, data2):
        weights2[index] = weights2.get(index, 0.0) + value

    dot = sum(
        value * weights2.get(index, 0.0)
        for index, value in zip(v1.indices, data1)
    )
    return dot / norm_product

# Feature "tfidf_sim": cosine similarity of every two consecutive vectors in
# the pickled TF-IDF collections.
# NOTE(review): presumably the vectors alternate question1 / question2 --
# confirm against the script that builds the pickles.
x = pickle_load("train_tfidf.pickle")
y = pickle_load("test_tfidf.pickle")

# Pulling twice from one shared iterator pairs items (0, 1), (2, 3), ...
train_vectors = iter(x)
train_r = [
    cos_sim(first, second)
    for first, second in zip(train_vectors, train_vectors)
]
train_df["tfidf_sim"] = train_r

test_vectors = iter(y)
result = [
    cos_sim(first, second)
    for first, second in zip(test_vectors, test_vectors)
]
test_df["tfidf_sim"] = result

# 特徴語のtfidf値比較

In [13]:
import pickle
import pandas as pd

def pickle_load(file):
    """Open ``file`` in binary mode and return the object unpickled from it.

    NOTE(review): unpickling can execute arbitrary code; only use this on
    files you produced yourself.
    """
    with open(file, 'rb') as stream:
        loaded = pickle.load(stream)
        return loaded


def dis_cal(t1, t2):
    """Return ``(1 - |t1 - t2|) ** 2`` for two numeric values."""
    gap = abs(t1 - t2)
    closeness = 1 - gap
    return closeness ** 2

# Feature "word_dis": dis_cal applied to the largest stored value of each of
# two consecutive TF-IDF vectors; 0 when either vector has no stored entries.
# NOTE(review): presumably the vectors alternate question1 / question2 --
# confirm against the script that builds the pickles.
x = pickle_load("train_tfidf.pickle")
y = pickle_load("test_tfidf.pickle")

# Pulling twice from one shared iterator pairs items (0, 1), (2, 3), ...
train_vectors = iter(x)
train_r = [
    dis_cal(first.data.max(), second.data.max())
    if first.getnnz() != 0 and second.getnnz() != 0
    else 0
    for first, second in zip(train_vectors, train_vectors)
]
train_df["word_dis"] = train_r

test_vectors = iter(y)
test_r = [
    dis_cal(first.data.max(), second.data.max())
    if first.getnnz() != 0 and second.getnnz() != 0
    else 0
    for first, second in zip(test_vectors, test_vectors)
]
test_df["word_dis"] = test_r

# 前置詞のストップワードを作成

In [15]:
# Preposition-like words counted by count_pre. Entries must be lower case and
# correctly spelled, because they are compared against lower-cased tokens.
stops = {
    "above", "after", "against", "among", "anti", "at", "before", "behind",
    "below", "beside", "besides", "between", "beyond", "but", "by",
    "considering", "despite", "during", "for", "from", "in", "inside", "into",
    "less", "near", "of", "on", "onto", "opposite", "outside", "over", "past",
    "regarding", "since", "throughout", "till", "under", "underneath",
    "unless", "unlike", "until", "via", "with", "within", "without",
}

# 文章の前置詞の数を比較

In [17]:
def count_pre(t1, t2):
    """Return how much two texts differ in their number of distinct ``stops`` words.

    Each text is lower-cased and split on whitespace; every token found in the
    module-level ``stops`` set is counted once per text.

    Args:
        t1: First text (a string).
        t2: Second text (a string).

    Returns:
        The absolute difference between the two distinct-word counts.
    """
    found_first = {token for token in t1.lower().split() if token in stops}
    found_second = {token for token in t2.lower().split() if token in stops}
    return abs(len(found_first) - len(found_second))

# Feature "count_pre": count_pre of question1 vs. question2 for every row.
# Walking the two columns in lockstep covers every row, including the last
# one, and always pairs question1 with question2 of the same frame -- no
# separate last-row lookup with a hand-written index is needed.
train_pre = [
    count_pre(str(first), str(second))
    for first, second in zip(train_data['question1'], train_data['question2'])
]
train_df['count_pre'] = train_pre

test_pre = [
    count_pre(str(first), str(second))
    for first, second in zip(test_data['question1'], test_data['question2'])
]
test_df['count_pre'] = test_pre

# ニューラルネットワークによる学習

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
import pandas as pd

# Replace any NaN left in the feature frames with 0
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

# Hold-out evaluation split (disabled)
#X_train, X_test, y_train, y_test = train_test_split(train_df, train_data, test_size=0.3, random_state=0)

# MLPClassifier hyper-parameters
clf = MLPClassifier(
    hidden_layer_sizes=(50, 50, 50, 50, 50),
    activation='relu',
    solver='adam',
    alpha=0.00011,
    batch_size=50,
    learning_rate='constant',
    learning_rate_init=0.0044,
    max_iter=1000,
    shuffle=True,
    random_state=123,
    tol=0.0001,
    verbose=False,
    warm_start=False,
    early_stopping=True,
    validation_fraction=0.1,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
)

# Hold-out evaluation: fit and score (disabled)
#clf.fit(X_train, y_train['is_duplicate'])
#print(clf.score(X_test, y_test['is_duplicate']))
#result = clf.predict_proba(X_test)
#y_test['result'] = result[:,1]

# Fit on all training rows, then persist the fitted model
clf.fit(train_df, train_data['is_duplicate'])
joblib.dump(clf, "5feature_model.sav")

# Reload a previously saved model (disabled)
#model = joblib.load("5feature_model.sav")

# Class probabilities for the test rows
y_test = clf.predict_proba(test_df)

# Build the submission file; column 1 of predict_proba is written as is_duplicate
sub = pd.DataFrame(
    {
        'test_id': test_data['test_id'],
        'is_duplicate': y_test[:, 1],
    },
    columns=['test_id', 'is_duplicate'],
)
sub.to_csv('submit1.csv', index=False)