# 文章の基幹化

In [None]:
from nltk import stem
from nltk.corpus import stopwords
import re
import pandas as pd


def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    if pd.isnull(text):
        return ''
    text = text.lower()
    stops = set(stopwords.words("english"))

    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"what's", " ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0rs", "rs", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"j k", "jk", text)

    if remove_stopwords:
        text = text.split()
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    if stem_words:
        text = text.split()
        stemmer = stem.SnowballStemmer("english")
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    return text


train_data = pd.read_csv("/Users/user/documents/quora/train.csv")
test_data = pd.read_csv("/Users/user/documents/quora/test.csv")

train_stem = list()
test_stem = list()

for i in range(len(train_data)):
    train_stem.append(text_to_wordlist(train_data["question1"][i]))
    train_stem.append(text_to_wordlist(train_data["question2"][i]))

for i in range(len(test_data)):
    test_stem.append(text_to_wordlist(test_data["question1"][i]))
    test_stem.append(text_to_wordlist(test_data["question2"][i]))

# 文章間の単語適合率

In [51]:
from nltk.corpus import stopwords

def read_txt(file):
    tmp = list()
    with open(file, 'r') as f:
        for line in f:
            line.replace('\n', "")
            tmp.append(line)
    return tmp

def match_ratio(t1, t2):
    stops = set(stopwords.words("english"))
    q1 = dict()
    q2 = dict()
    for word in str(t1.lower().split()):
        if word not in stops:
            q1[word] = 1
    for word in str(t2.lower().split()):
        if word not in stops:
            q2[word] = 1
    if len(q1) == 0 or len(q2) == 0:
        return 0
    match_words = [w for w in q1.keys() if w in q2]
    return (len(match_words) / (len(q1) + len(q2)))

train_match = list()
test_match = list()
train_df = pd.DataFrame()
test_df = pd.DataFrame()

for train in zip(*[iter(train_stem)] * 2):
    train_match.append(match_ratio(train[0], train[1]))
train_df["word_match"] = train_match

for test in zip(*[iter(test_stem)] * 2):
    test_match.append(match_ratio(test[0], test[1]))
test_df["word_match"] = test_match

# tfidfの計算

In [14]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=2, max_df=0.5)
train_tfidf = vectorizer.fit_transform(train_stem)
test_tfidf = vectorizer.transform(test_stem)

with open("train_tfidf.pickle", 'wb') as f:
    pickle.dump(train_tfidf, f)

with open("test_tfidf.pickle", 'wb') as f:
    pickle.dump(test_tfidf, f)

# tfidfのcos類似度の計算

In [None]:
import pickle
import numpy as np

def pickle_load(file):
    with open(file, 'rb') as f:
        obj = pickle.load(f)
    return obj

def cos_sim(v1, v2):
    a = 0
    b = 0
    dot = 0
    for value in v1.data:
        a += value * value
        a = np.sqrt(a)

    for value in v2.data:
        b += value * value
    b = np.sqrt(b)

    for word_a, value_a in zip(v1.indices, v1.data):
        for word_b, value_b in zip(v2.indices, v2.data):
            if word_a == word_b:
                dot += value_a * value_b
    return (dot / (a * b)) if (a * b) else 0

result = list()
train_r = list()

x = pickle_load("train_tfidf.pickle")
y = pickle_load("test_tfidf.pickle")

for vec in zip(*[iter(x)] * 2):
    train_r.append(cos_sim(vec[0], vec[1]))
train_df["tfidf_sim"] = train_r

for vec in zip(*[iter(y)] * 2):
    result.append(cos_sim(vec[0], vec[1]))
test_df["tfidf_sim"] = result

# 重要単語のtfidf値の差

In [None]:
def dis_cal(t1, t2):
    return (1 - abs(t1-t2))**2

train_r = list()
test_r = list()
for vec in zip(*[iter(x)] * 2):
    if vec[0].getnnz() == 0 or vec[1].getnnz() == 0:
        train_r.append(0)
    else:
        train_r.append(dis_cal(vec[0].data.max(), vec[1].data.max()))
train_df["word_dis"] = train_r

for vec in zip(*[iter(y)] * 2):
    if vec[0].getnnz() == 0 or vec[1].getnnz() == 0:
        train_r.append(0)
    else:
        test_r.append((dis_cal(vec[0].data.max(), vec[1].data.max())))
test_df["word_dis"] = test_r

# 分類器の作成

In [6]:
def random_forest(x_train, y_train, x_test, y_test):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV
    forest = RandomForestClassifier(n_estimators=500, 
                                    criterion='gini', 
                                    max_depth=None, 
                                    min_samples_split=2, 
                                    min_samples_leaf=100, 
                                    max_features='sqrt', 
                                    max_leaf_nodes=None, 
                                    bootstrap=True, 
                                    oob_score=False, 
                                    n_jobs=1, 
                                    random_state=None, 
                                    verbose=0)
    forest.fit(x_train, y_train)
    print(forest.score(x_test, y_test))
    y_test_pred = forest.predict_proba(x_test)
    return y_test_pred

train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

y_test = random_forest(train_df, train_data['is_duplicate'], test_df)

sub = pd.DataFrame()
sub['test_id'] = test_data['test_id']
sub['is_duplicate'] = y_test[:,1]
sub.to_csv('submit.csv', index=False)

0.69524351332
