In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

In [3]:
data = pd.read_csv("data.csv", sep=';').set_index('ID')
data.head()

Unnamed: 0_level_0,Question
ID,Unnamed: 1_level_1
1,Как зовут лодочника на реке Стикс в древнегреч...
2,Как в химии обозначается свинец?
3,Какой химический элемент преобладает в составе...
4,Кто из перечисленных был пажом во времена Екат...
5,Когда началась 2 мировая война?


In [5]:
train = pd.merge(data, pd.read_csv("train.csv", sep=';').set_index('ID'), on='ID').drop_duplicates("Question")
test = pd.merge(data, pd.read_csv("test.csv", sep=';').set_index('ID'), on='ID')

In [55]:
import string

train_df = pd.merge(data, pd.read_csv("train.csv", sep=';').set_index('ID'), on='ID').drop_duplicates("Question")
test_df = pd.merge(data, pd.read_csv("test.csv", sep=';').set_index('ID'), on='ID')

## Number of words in the text ##
train_df["num_words"] = train_df["Question"].apply(lambda x: len(str(x).split()))
test_df["num_words"] = test_df["Question"].apply(lambda x: len(str(x).split()))

## Number of unique words in the text ##
train_df["num_unique_words"] = train_df["Question"].apply(lambda x: len(set(str(x).split())))
test_df["num_unique_words"] = test_df["Question"].apply(lambda x: len(set(str(x).split())))

## Number of characters in the text ##
train_df["num_chars"] = train_df["Question"].apply(lambda x: len(str(x)))
test_df["num_chars"] = test_df["Question"].apply(lambda x: len(str(x)))

## Number of punctuations in the text ##
train_df["num_punctuations"] =train_df['Question'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )
test_df["num_punctuations"] =test_df['Question'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )

## Number of title case words in the text ##
train_df["num_words_upper"] = train_df["Question"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
test_df["num_words_upper"] = test_df["Question"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

## Number of letters upper
train_df["num_letters_upper"] = train_df["Question"].apply(lambda x: len([w for w in str(x) if w.isupper()]))
test_df["num_letters_upper"] = test_df["Question"].apply(lambda x: len([w for w in str(x) if w.isupper()]))

# ## Number of letters upper
# train_df["num_letters_eng"] = train_df["Question"].apply(lambda x: len([w for w in str(x) if w.lower() in 'abcdefghijklmnopqrstuvwxyz']))
# test_df["num_letters_eng"] = test_df["Question"].apply(lambda x: len([w for w in str(x) if w.lower() in 'abcdefghijklmnopqrstuvwxyz']))

## Number of title case words in the text ##
train_df["num_words_title"] = train_df["Question"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
test_df["num_words_title"] = test_df["Question"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

## Average length of the words in the text ##
train_df["mean_word_len"] = train_df["Question"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test_df["mean_word_len"] = test_df["Question"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

## Number of digits
train_df["num_digits"] =train_df['Question'].apply(lambda x: len([c for c in str(x) if c in string.digits]))
test_df["num_digits"] =test_df['Question'].apply(lambda x: len([c for c in str(x) if c in string.digits]))

## if ends with ?
train_df["end?"] = train_df['Question'].apply(lambda x: int(x[-1] == '?'))
test_df["end?"] = test_df['Question'].apply(lambda x: int(x[-1] == '?'))

# question word
def find_question_word(s):
    question_words = ['как', 'какой', 'какая', 'какие', 'какое', 'когда', 'где', 'кто', 'что', 'сколько', 'чего', 'кого', 'кем', 'чем', 'кому' ]
    k = -1
    s = s.lower()
    for i in range(len(question_words)):
        if question_words[i] in s:
            k = i
    return k
train_df["question_word"] = train_df['Question'].apply(lambda x: find_question_word(x))
test_df["question_word"] = test_df['Question'].apply(lambda x: find_question_word(x))


train_df.drop_duplicates(subset=['Question', 'Answer'], inplace=True)

In [56]:
train_df

Unnamed: 0_level_0,Question,Answer,num_words,num_unique_words,num_chars,num_punctuations,num_words_upper,num_letters_upper,num_letters_eng,num_words_title,mean_word_len,num_digits,end?,question_word
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,Как зовут лодочника на реке Стикс в древнегреч...,0,9,9,62,1,0,2,0,2,6.000000,0,1,0
2,Как в химии обозначается свинец?,1,5,5,32,1,0,1,0,1,5.600000,0,1,0
3,Какой химический элемент преобладает в составе...,0,8,8,62,1,0,1,0,1,6.875000,0,1,1
4,Кто из перечисленных был пажом во времена Екат...,0,9,9,55,1,1,4,2,2,5.222222,0,1,7
5,Когда началась 2 мировая война?,0,5,5,31,1,0,1,0,1,5.400000,1,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,Как называют жителей города Гусь Хрустальный?,0,6,6,45,1,0,3,0,3,6.666667,0,1,0
29996,"Как называется игра, местом действия которой я...",0,9,9,69,2,0,2,0,2,6.777778,0,1,0
29997,Как в опере называют ведущую певицу?,0,6,6,36,1,0,1,0,1,5.166667,0,1,0
29998,На какой планете сутки длиннее года?,0,6,6,36,1,0,1,0,1,5.166667,0,1,1


In [58]:
train_df[train_df['num_letters_eng'] > 0]

Unnamed: 0_level_0,Question,Answer,num_words,num_unique_words,num_chars,num_punctuations,num_words_upper,num_letters_upper,num_letters_eng,num_words_title,mean_word_len,num_digits,end?,question_word
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
4,Кто из перечисленных был пажом во времена Екат...,0,9,9,55,1,1,4,2,2,5.222222,0,1,7
41,Что означает h - в физике,0,6,6,25,1,0,1,1,1,3.333333,0,0,8
89,Сколько клубов играет в Bundesliga?,0,5,5,35,1,0,2,10,2,6.200000,0,1,9
96,В каком году родился Willuam Genry Gates третий,0,8,8,47,0,1,4,17,4,5.000000,0,0,0
106,Каким было первое видео на YouTube?,0,6,6,35,1,0,3,7,1,5.000000,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29969,В каком году российский футбольный клуб ЦСКА в...,0,10,10,64,1,3,9,4,1,5.500000,0,1,0
29973,В каком учебном заведении училась главная геро...,0,12,12,88,1,1,4,17,4,6.416667,0,1,0
29977,Какая страна является родиной музыкальной рок-...,1,7,7,60,4,1,5,4,1,7.714286,0,1,2
29981,Сколько частей франшизы Grand Theft Auto вышл...,0,10,10,65,1,0,4,14,4,5.500000,0,1,9


True

In [61]:
def write_answer(clf):
    answ = pd.read_csv('baseline_solution.csv').set_index('v')
    answ['a'] = clf.predict_proba(test_df.drop(columns="Question"))[:,1]
    answ.to_csv('result.csv', header=False)

In [59]:
gb = GradientBoostingClassifier()
param = {'learning_rate' : [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
        'n_estimators' : [5, 10, 25, 50, 100, 150, 250, 500]
        }
gs = GridSearchCV(gb, param_grid=param, cv=5, scoring='roc_auc', verbose=True, n_jobs=-1)
gs.fit(train_df.drop(columns=['Answer', 'Question']), train_df['Answer'])
gs.best_score_

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  3.0min finished


0.7632190440942228

Best score by now: 0.7613742211275701

In [None]:
gs.best_estimator_.feature_importances_

In [62]:
write_answer(gs.best_estimator_)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(train_df.drop(columns=['Question', 'Answer']), train_df['Answer'], test_size=0.3)

In [75]:
from catboost import CatBoostClassifier
cb = CatBoostClassifier(silent=True)
cb.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x25551e33d88>

In [76]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, cb.predict_proba(X_test)[:,1])

0.7153550981125738

In [16]:
cb.fit(train_df.drop(columns=['Answer', 'Question']), train_df['Answer'])

<catboost.core.CatBoostClassifier at 0x2555186b4c8>

In [17]:
answ = pd.read_csv('baseline_solution.csv').set_index('v')
answ['a'] = cb.predict_proba(test.drop(columns="Question"))[:,1]
answ.to_csv('result.csv', header=False)