In [4]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split

In [5]:
# Load the labelled words: one word per row plus a `target` column
# (later cells treat target == 1 as "surname").
# reset_index(drop=True) restores a gap-free 0..n-1 index after dropna(), so
# cells that address rows with range(len(train)) never hit a dropped label.
train = (
    pd.read_csv('linear_train.txt', header=None, names=['word', 'target'])
    .dropna()
    .reset_index(drop=True)
)
print(len(train))

101408


In [6]:
# Read the unlabelled words that have to be scored; rows with a missing word
# are discarded.
# NOTE(review): the submission cells write the row position as `Id`, so any
# row dropped here would shift the ids -- confirm nothing is actually dropped.
test = (
    pd.read_csv('linear_test.txt', header=None, names=['word'])
    .dropna()
)
print(len(test))
test.head()

188920


Unnamed: 0,word
0,Аалто
1,ААР
2,Аара
3,Ааре
4,Аарон


In [7]:
# Short handles for the two original columns: the words and their labels.
X = train['word']
Y = train['target']

Нам нужно извлечь признаки из наших данных

In [8]:
# Letter tables used by the spelling features below.
# capitals   -- the 33 upper-case Cyrillic letters
# vowels     -- vowels, upper-case followed by lower-case
# consonants -- consonants plus the hard/soft signs, upper-case then lower-case
capitals = list('АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ')
vowels = list('АЕЁИОУЫЭЮЯ' 'аеёиоуыэюя')
consonants = list('БВГДЖЗЙКЛМНПРСТФХЦЧШЩЪЬ' 'бвгджзйклмнпрстфхцчшщъь')

Посмотрим, сколько фамилий начинается с большой буквы

In [9]:
# Count the surnames (target == 1) and how many of them are spelt as a
# capitalised word: first letter in `capitals`, second letter not.
# A boolean mask replaces the positional X[i] / Y[i] look-ups: on a Series
# those are label-based and break as soon as dropna() leaves gaps in the index.
surnames = X[Y == 1]
surnames_num = len(surnames)
# w[:1] and w[1:2] yield '' for too-short words instead of raising IndexError.
count = sum(1 for w in surnames if w[:1] in capitals and w[1:2] not in capitals)
print(count, surnames_num)

8742 10638


Делаем вывод, что это неплохой признак, он коррелирует с ответами, добавим его к нашим данным

In [10]:
# Preview the training table.
# NOTE(review): the recorded output already contains `is_capitalize`, a column
# that is only created by featurize() further down -- this cell appears to have
# been executed out of order; confirm with a fresh Restart & Run All.
train.head()

Unnamed: 0,word,target,is_capitalize
0,Аалтонен,1,1
1,Аар,0,1
2,Аарон,0,1
3,ААРОН,0,0
4,Аарона,0,1


Добавим количество букв, отдельно количество гласных и согласных

In [20]:
def featurize(train):
    """Add hand-crafted spelling features to a word table, in place.

    Parameters
    ----------
    train : pandas.DataFrame
        Table with a ``'word'`` column whose values are strings (they are
        sliced and iterated character by character). Despite the parameter
        name, the notebook applies this to the test table as well.

    Returns
    -------
    None
        The frame is modified in place; these columns are added or overwritten:

        * ``is_capitalize`` -- 1 if the first character is in ``capitals`` and
          the second one is not, else 0 (an empty word gives 0)
        * ``length``        -- number of characters
        * ``vowels``        -- number of characters found in ``vowels``
        * ``consonants``    -- number of characters found in ``consonants``
    """
    # Sets give constant-time membership tests instead of scanning the lists
    # once per character.
    capital_set, vowel_set, consonant_set = set(capitals), set(vowels), set(consonants)
    words = train['word']
    # w[:1] (rather than w[0]) is '' for an empty word, so no IndexError.
    train['is_capitalize'] = words.apply(
        lambda w: 1 if w[:1] in capital_set and w[1:2] not in capital_set else 0)
    train['length'] = words.apply(len)
    train['vowels'] = words.apply(lambda w: sum(1 for ch in w if ch in vowel_set))
    train['consonants'] = words.apply(lambda w: sum(1 for ch in w if ch in consonant_set))
# Add the four spelling features to the training table in place, then preview it.
featurize(train)
train.head()

Unnamed: 0,word,target,is_capitalize,length,vowels,consonants
0,Аалтонен,1,1,8,4,4
1,Аар,0,1,3,2,1
2,Аарон,0,1,5,3,2
3,ААРОН,0,0,5,3,2
4,Аарона,0,1,6,4,2


Пробуем запустить логистическую регрессию на наших данных

In [12]:
# Hold out part of the labelled data to estimate ROC AUC of the hand-made
# features. random_state pins the shuffle so the reported score is reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    train.drop(['word', 'target'], axis=1), train['target'], random_state=45)
# The L1 penalty needs a solver that supports it; naming liblinear keeps the
# model working on scikit-learn releases whose default solver is lbfgs.
predictor = LogisticRegression(penalty='l1', solver='liblinear')

In [13]:
# Fit on the training split, score the hold-out split, and show the ROC AUC
# computed from the second probability column (class 1 for 0/1 labels).
predictor.fit(X=xtrain, y=ytrain)
predictions = predictor.predict_proba(X=xtest)
roc_auc_score(y_true=ytest, y_score=predictions[:, 1])

0.81236258535671646

In [14]:
# Cross-validated ROC AUC of the same model on all labelled rows.
print(cross_val_score(estimator=predictor,
                      X=train.drop(['word', 'target'], axis=1),
                      y=train['target'],
                      scoring='roc_auc'))

[ 0.81142882  0.80462321  0.79489511]


Работает очень хорошо...
Попробуем добавить окончания слов(not)

In [15]:
# Preview the training table with the spelling features in place.
train.head()

Unnamed: 0,word,target,is_capitalize,length,vowels,consonants
0,Аалтонен,1,1,8,4,4
1,Аар,0,1,3,2,1
2,Аарон,0,1,5,3,2
3,ААРОН,0,0,5,3,2
4,Аарона,0,1,6,4,2


In [16]:
# Feature columns: every column of `train` except the raw word and the label.
cols = train.columns.drop(['word', 'target'])

In [17]:
# Refit a default logistic regression on all labelled rows for the submission.
predictor = LogisticRegression()
predictor.fit(train[cols], train['target'])

In [23]:
# Give the test table the same spelling features as the training table (in place).
featurize(test)

In [24]:
# Score the test words; keep the second probability column (class 1 for 0/1
# labels). The variable name is misspelt but the submission cell reads it as is.
predictons = predictor.predict_proba(X=test[cols])[:, 1]

In [29]:
# Write the scores as an `Id,Answer` CSV; Id is the 0-based row position.
pd.DataFrame({'Answer': predictons}).to_csv(
    'my_super_submission.csv',
    index=True,
    index_label='Id',
)

In [11]:
# Collect the distinct last-three-character endings of the surnames
# (target == 1); words shorter than three characters contribute themselves.
# A boolean mask replaces the positional X[i] / Y[i] look-ups, which are
# label-based on a Series and fail once dropna() leaves gaps in the index.
# .tolist() keeps the result a plain unicode array, as np.unique returned before.
suffix3 = np.unique(X[Y == 1].str[-3:].tolist())
print(len(suffix3))

2425


In [12]:
%%time
for suf in suffix3:
    train[suf] = train['word'].apply(lambda x: 1 if x[-3:]==suf else 0)

In [44]:
# Preview the training table after the per-ending indicator columns were added.
train.head()

Unnamed: 0,word,target,is_capitalize,length,vowels,consonants,-то,АДО,АЙН,АЙС,...,яны,янь,яню,яра,ясо,яцу,ёва,ёве,ёма,ёша
0,Аалтонен,1,1,8,4,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Аар,0,1,3,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Аарон,0,1,5,3,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ААРОН,0,0,5,3,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Аарона,0,1,6,4,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Попробуем обучить модель на этих данных

In [45]:
# Feature columns now include the per-ending indicators.
cols = train.columns.drop(['word', 'target'])
# Here the split halves are whole frames (features + target), unlike the
# earlier split that returned features and labels separately.
# random_state pins the shuffle so the hold-out score is reproducible.
xtrain, xtest = train_test_split(train, random_state=45)

In [46]:
# L1-regularised logistic regression on spelling + ending features.
# solver='liblinear' is named explicitly because it supports the L1 penalty;
# the default solver of newer scikit-learn releases (lbfgs) rejects it.
predictor = LogisticRegression(penalty='l1', solver='liblinear')
predictor.fit(xtrain[cols], xtrain['target'])
predictions = predictor.predict_proba(xtest[cols])

In [47]:
# Hold-out ROC AUC from the second probability column (class 1 for 0/1 labels).
roc_auc_score(y_true=xtest['target'], y_score=predictions[:, 1])

0.86384611231819064

Получается неплохой score. Пробуем сделать submission: для этого нужно применить такие же преобразования к тестовым данным.

In [13]:
# Feature columns the model is trained on (everything except word and label).
cols = train.columns.drop(['word', 'target'])
# Reuse featurize() instead of repeating its four column definitions by hand,
# so the train and test features cannot drift apart.
featurize(test)

Работает очень медленно: у меня этот цикл так и не досчитался.

In [None]:
%%time
for suf in suffix3:
    test[suf] = test['word'].apply(lambda x: 1 if x[-3:]==suf else 0)

In [None]:
# Fit the final model on all labelled rows. liblinear is named explicitly
# because the L1 penalty is not supported by the default solver of newer
# scikit-learn releases.
model = LogisticRegression(penalty='l1', solver='liblinear').fit(train[cols], train['target'])
# Score with the model fitted above (not the earlier hold-out `predictor`) and
# select the same feature columns, in the same order, as used for training.
# `test.drop['word']` subscripted the method object and raised TypeError.
predictions = model.predict_proba(test[cols])

In [1]:
# Keep only the second probability column (class 1 for 0/1 labels) and write it.
# A NumPy array has no .to_csv, so the scores are wrapped in a DataFrame first;
# the Id/Answer layout matches the other submission files in this notebook.
# `predictions` itself is left 2-D, so the cell can be re-run safely.
pd.DataFrame(data=predictions[:, 1], columns=['Answer']).to_csv(
    "submission.tsv", sep=',', index=True, index_label='Id')

NameError: name 'predictions' is not defined

Попробуем воспользоваться стандартными библиотеками: признаками будут символьные n-граммы — от биграмм и триграмм до 8-грамм.

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

In [34]:
# Character n-gram features, 2 to 8 characters long, built within word
# boundaries; case is preserved, each n-gram is recorded as present/absent,
# and n-grams occurring in more than 84% of the words are ignored.
vectorizer = CountVectorizer(
    analyzer='char_wb',
    ngram_range=(2, 8),
    lowercase=False,
    binary=True,
    max_df=0.84,
)

In [36]:
# Preview the training table before vectorising the `word` column.
train.head()

Unnamed: 0,word,target,is_capitalize,length,vowels,consonants
0,Аалтонен,1,1,8,4,4
1,Аар,0,1,3,2,1
2,Аарон,0,1,5,3,2
3,ААРОН,0,0,5,3,2
4,Аарона,0,1,6,4,2


In [39]:
%%time
train_matrix = vectorizer.fit_transform(train['word'])

CPU times: user 13.1 s, sys: 255 ms, total: 13.3 s
Wall time: 13.3 s


In [42]:
# Show the sparse matrix summary (shape, dtype, number of stored entries).
train_matrix

<101408x913714 sparse matrix of type '<class 'numpy.int64'>'
	with 4577175 stored elements in Compressed Sparse Row format>

In [45]:
# L1-regularised logistic regression on the sparse n-gram indicators.
# solver='liblinear' is named explicitly because it supports the L1 penalty
# (the default solver of newer scikit-learn releases does not); n_jobs is
# dropped because liblinear does not use it.
model = LogisticRegression(penalty='l1', solver='liblinear', random_state=45)
model.fit(train_matrix, train['target'])
# predict_proba is a method of the fitted model, not a free function; the test
# words are encoded with the vocabulary learnt on the training words.
predictions = model.predict_proba(vectorizer.transform(test['word']))

In [46]:
# Write the second probability column as an `Id,Answer` CSV; Id is the 0-based
# row position.
pd.DataFrame({'Answer': predictions[:, 1]}).to_csv(
    'my_super_second_submission.csv',
    index=True,
    index_label='Id',
)