In [1]:
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample

In [2]:
data = pd.read_csv("data.csv", sep=';').set_index('ID')
train = pd.merge(data, pd.read_csv("train.csv", sep=';').set_index('ID'), on='ID').drop_duplicates("Question")
test = pd.merge(data, pd.read_csv("test.csv", sep=';').set_index('ID'), on='ID')

Creating new features (final features)

In [3]:
train_df = pd.merge(data, pd.read_csv("train.csv", sep=';').set_index('ID'), on='ID').drop_duplicates("Question")
test_df = pd.merge(data, pd.read_csv("test.csv", sep=';').set_index('ID'), on='ID')

## Number of words in the text ##
train_df["num_words"] = train_df["Question"].apply(lambda x: len(str(x).split()))
test_df["num_words"] = test_df["Question"].apply(lambda x: len(str(x).split()))

## Number of unique words in the text ##
train_df["num_unique_words"] = train_df["Question"].apply(lambda x: len(set(str(x).split())))
test_df["num_unique_words"] = test_df["Question"].apply(lambda x: len(set(str(x).split())))

## Number of characters in the text ##
train_df["num_chars"] = train_df["Question"].apply(lambda x: len(str(x)))
test_df["num_chars"] = test_df["Question"].apply(lambda x: len(str(x)))

## Number of punctuations in the text ##
train_df["num_punctuations"] =train_df['Question'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )
test_df["num_punctuations"] =test_df['Question'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )

## Number of title case words in the text ##
train_df["num_words_upper"] = train_df["Question"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
test_df["num_words_upper"] = test_df["Question"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

## Number of letters upper
train_df["num_letters_upper"] = train_df["Question"].apply(lambda x: len([w for w in str(x) if w.isupper()]))
test_df["num_letters_upper"] = test_df["Question"].apply(lambda x: len([w for w in str(x) if w.isupper()]))

# ## Number of letters lower
# train_df["num_letters"] = train_df["Question"].apply(lambda x: len([w for w in str(x) if w.isalpha()]))
# test_df["num_letters"] = test_df["Question"].apply(lambda x: len([w for w in str(x) if w.isalpha()]))

## Number of title case words in the text ##
train_df["num_words_title"] = train_df["Question"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
test_df["num_words_title"] = test_df["Question"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

## Average length of the words in the text ##
train_df["mean_word_len"] = train_df["Question"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test_df["mean_word_len"] = test_df["Question"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

## Number of digits ##
train_df["num_digits"] =train_df['Question'].apply(lambda x: len([c for c in str(x) if c in string.digits]))
test_df["num_digits"] =test_df['Question'].apply(lambda x: len([c for c in str(x) if c in string.digits]))

## If ends with ? ##
train_df["end?"] = train_df['Question'].apply(lambda x: int(x[-1] == '?'))
test_df["end?"] = test_df['Question'].apply(lambda x: int(x[-1] == '?'))

## Getting a question word ##
def find_question_word(s):
    question_words = ['как', 'какой', 'какая', 'какие', 'какое', 'когда', 'где', 'кто', 'что', 'сколько', 'чего', 'кого', 'кем', 'чем', 'кому' ]
    k = -1
    s = s.lower()
    for i in range(len(question_words)):
        if question_words[i] in s:
            k = i
    return k
train_df["question_word"] = train_df['Question'].apply(lambda x: find_question_word(x))
test_df["question_word"] = test_df['Question'].apply(lambda x: find_question_word(x))

In [4]:
train_df.head()

Unnamed: 0_level_0,Question,Answer,num_words,num_unique_words,num_chars,num_punctuations,num_words_upper,num_letters_upper,num_words_title,mean_word_len,num_digits,end?,question_word
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,Как зовут лодочника на реке Стикс в древнегреч...,0,9,9,62,1,0,2,2,6.0,0,1,0
2,Как в химии обозначается свинец?,1,5,5,32,1,0,1,1,5.6,0,1,0
3,Какой химический элемент преобладает в составе...,0,8,8,62,1,0,1,1,6.875,0,1,1
4,Кто из перечисленных был пажом во времена Екат...,0,9,9,55,1,1,4,2,5.222222,0,1,7
5,Когда началась 2 мировая война?,0,5,5,31,1,0,1,1,5.4,1,1,5


In [5]:
X = train_df.drop(columns=['Question', 'Answer'])
y = train_df['Answer']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

In [6]:
linreg = LinearRegression(normalize=True, n_jobs=-1, copy_X=True)
linreg.fit(X_train, y_train)
linreg_score = roc_auc_score(y_test, linreg.predict(X_test))

In [7]:
param = {'criterion' : ['gini', 'entropy'],
        'max_depth' : [1, 5, 10, 25, 50],
        'min_samples_split' : [2, 5 , 10, 20]
}
dt_gs = GridSearchCV(DecisionTreeClassifier(random_state=123), param_grid=param, cv=5, scoring='roc_auc', verbose=True, n_jobs=-1)
dt_gs.fit(X_train, y_train)
dt_score = roc_auc_score(y_test, dt_gs.best_estimator_.predict_proba(X_test)[:, 1])

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    4.8s finished


In [8]:
param = {'criterion' : ['gini', 'entropy'],
        'n_estimators' : [5, 10, 25, 50, 100, 150, 250, 500],
        'max_depth' : [1, 5, 10, 25, 50],
        'min_samples_split' : [2, 5 , 10, 20]
}
rf_gs = GridSearchCV(RandomForestClassifier(random_state=123), param_grid=param, cv=5, scoring='roc_auc', verbose=True, n_jobs=-1)
rf_gs.fit(X_train, y_train)
rf_score = roc_auc_score(y_test, rf_gs.best_estimator_.predict_proba(X_test)[:, 1])

Fitting 5 folds for each of 320 candidates, totalling 1600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   37.0s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed: 12.5min finished


In [9]:
param = {'learning_rate' : [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
        'n_estimators' : [5, 10, 25, 50, 100, 150, 250, 500]
        }
gb_gs = GridSearchCV(GradientBoostingClassifier(random_state=123), param_grid=param, cv=5, scoring='roc_auc', verbose=True, n_jobs=-1)
gb_gs.fit(X_train, y_train)
gb_score = roc_auc_score(y_test, gb_gs.best_estimator_.predict_proba(X_test)[:, 1])

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  2.0min finished


In [10]:
cb = CatBoostClassifier(n_estimators=20, silent=True)
cb.fit(X_train, y_train)
cb_score = roc_auc_score(y_test, cb.predict_proba(X_test)[:,1])

In [11]:
pd.DataFrame({"model" : ["LinearRegression", "DecisionTreeClassifier", "RandomForestClassifier",  "GradientBoostingClassifier",  "CatBoostClassifier"], 
              "score" : [linreg_score, dt_score, rf_score, gb_score, cb_score]})

Unnamed: 0,model,score
0,LinearRegression,0.724296
1,DecisionTreeClassifier,0.733775
2,RandomForestClassifier,0.761004
3,GradientBoostingClassifier,0.76189
4,CatBoostClassifier,0.754539


Writing answers

In [12]:
cb.fit(train_df.drop(columns=['Answer', 'Question']), train_df['Answer'])
answ = pd.read_csv('baseline_solution.csv').set_index('v')
answ['a'] = cb.predict_proba(test_df.drop(columns="Question"))[:,1]
answ.to_csv('result_catboost.csv', header=False)

In [13]:
gb_gs.best_estimator_.fit(train_df.drop(columns=['Answer', 'Question']), train_df['Answer'])
answ = pd.read_csv('baseline_solution.csv').set_index('v')
answ['a'] = gb_gs.best_estimator_.predict_proba(test_df.drop(columns="Question"))[:,1]
answ.to_csv('result_sklearn.csv', header=False)