Будем решать поставленную задачу следующим образом: решим отдельно задачу прогнозирования части речи и задачу определения начальной формы.


Определение начальной формы: решим эту задачу, не прибегая к машинному обучению. Для каждого слова из тестовой выборки вычислим расстояние Левенштейна до каждой из начальных форм в тренировочной выборке. Начальную форму, расстояние Левенштейна до которой минимально, будем считать начальной формой данного слова. Оптимизируем вычисления: из тренировочной выборки можно заключить, что формы той или иной части речи образуются из начальной формы добавлением суффиксов, а значит, можно проверять только те начальные формы, которые начинаются с первой буквы исследуемого слова.

Прогнозирование части речи: применим алгоритм контеста 1 для данной задачи, заменив [A, N, V] $\to$ [0, 1, 2].

In [6]:
import sklearn.linear_model as lm
import sklearn.metrics as mt
import numpy as np
import pandas as pd
import locale 
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split, cross_val_predict


In [7]:
# Read the training table. Explicit column names are supplied, so every line
# of the file (including its first one) is parsed as a data row.
train = pd.read_csv(
    'task2_lemmas_train.txt',
    sep=',',
    names=['id', 'word', 'zero_form', 'type1', 'y2', 'type2'],
)
st = train['zero_form']
train = train.drop(['type1', 'type2', 'y2', 'id'], axis=1)

# Split each raw 'zero_form' value positionally: everything except the last
# two characters is kept as the lemma, the last character as the tag
# (the sample rows look like "<lemma>+<tag>").
y = [raw[:-2] for raw in st]
res = [raw[-1] for raw in st]

train['type'] = res
train['zero_form'] = y
# Row 0 is discarded -- presumably the file's header line that `names=` turned
# into a data row; confirm against the input file.
train = train.drop([0], axis=0)
train.head(5)

Unnamed: 0,word,zero_form,type
1,vergognerete,vergognare,V
2,amnistiavate,amnistiare,V
3,menomazione,menomazione,N
4,sfaldavamo,sfaldare,V
5,sfodererei,sfoderare,V


In [5]:
def lev(s, t):
    """Return the Levenshtein (edit) distance between ``s`` and ``t``.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions that turn ``s`` into ``t``. Only two rows of
    the dynamic-programming table are kept at any time.

    Args:
        s: First sequence (e.g. a string).
        t: Second sequence (e.g. a string).

    Returns:
        The edit distance as a non-negative integer.
    """
    if s == t:
        return 0
    if len(s) == 0:
        return len(t)
    if len(t) == 0:
        return len(s)

    width = len(t) + 1
    # previous[k]: distance between the already-processed prefix of s and t[:k].
    previous = list(range(width))
    current = [None] * width

    for row, s_item in enumerate(s, 1):
        # Turning a prefix of s of length `row` into "" takes `row` deletions.
        current[0] = row
        for col, t_item in enumerate(t):
            mismatch = 0 if s_item == t_item else 1
            current[col + 1] = min(
                current[col] + 1,         # insertion
                previous[col + 1] + 1,    # deletion
                previous[col] + mismatch  # match / substitution
            )
        # The freshly filled row becomes the reference row for the next pass.
        previous, current = current, previous

    return previous[len(t)]

In [14]:
# Sample submission file, loaded for reference.
example = pd.read_csv('task2_lemmas_sample_submission.txt')
# Test words: read the file and discard its 'Id' column in one step.
test = pd.read_csv('task2_lemmas_test.txt').drop(['Id'], axis=1)
test.head(5)

Unnamed: 0,X
0,gettonan
1,incidentali
2,involtino
3,lievi
4,comunistizzasse


In [6]:
# Scratch state for the nearest-lemma search: best distance so far (starting
# from a large sentinel), a counter, and the best lemma so far.
m, t, source = 999999, 0, ""
# Result accumulators: chosen lemmas, and lemmas with a tag appended.
ans, typ = [], []

In [140]:
# Nearest-lemma search: for every test word, append to `ans` the training
# lemma with the smallest Levenshtein distance among the lemmas that start
# with the same letter as the word ("" when no lemma shares that letter).
#
# The lemma column is indexed once up front instead of being rescanned in
# full for every test word: each distinct lemma is stored a single time in a
# bucket keyed by its first letter. First-occurrence order is preserved and
# only a strictly smaller distance replaces the current best, so ties are
# resolved exactly as a plain scan of train['zero_form'] would resolve them.
lemmas_by_initial = {}
seen_lemmas = set()
for lemma in train['zero_form']:
    # Empty lemmas have no first letter to index by; repeated lemmas can
    # never beat their first occurrence, so both are skipped.
    if not lemma or lemma in seen_lemmas:
        continue
    seen_lemmas.add(lemma)
    lemmas_by_initial.setdefault(lemma[0], []).append(lemma)

for word in test['X']:
    m = 999999
    source = ""
    # word[:1] rather than word[0]: an empty word simply finds no candidates.
    for lemma in lemmas_by_initial.get(word[:1], ()):
        r = lev(word, lemma)
        if r < m:
            m = r
            source = lemma
            if r == 0:
                # Exact match: nothing can be strictly closer.
                break
    ans.append(source)

# Leave the scratch variables at their initial values.
m = 999999
source = ""

In [141]:
# Show how many lemmas have been collected in `ans`.
len(ans)

29661

In [154]:
# Tag every collected lemma with '+V' and accumulate the results in `typ`.
typ.extend(lemma + '+V' for lemma in ans)
# Put a [0, 0] placeholder in front of the lemma list.
ans.insert(0, [0, 0])


In [157]:
# Prepend an integer 0 placeholder so every tagged lemma shifts down by one
# position. NOTE(review): presumably this mirrors the extra leading row that
# is dropped again after the CSV round-trip -- confirm.
typ.insert(0, 0)
typ

[0,
 'gettonare+V',
 'incidentale+V',
 'involutivo+V',
 'lieve+V',
 'comunistizzare+V',
 'vidimare+V',
 'imbrodare+V',
 'strillare+V',
 'cisti+V',
 'compassato+V',
 'curia+V',
 'snobbare+V',
 'tessere+V',
 'congelare+V',
 'somatizzare+V',
 'impoverire+V',
 'smungere+V',
 'abbuffare+V',
 'meravigliare+V',
 'risucchiare+V',
 'sesquipedale+V',
 'timido+V',
 'nauseabondo+V',
 'ingozzare+V',
 'settimino+V',
 'relazionare+V',
 'sorridere+V',
 'illuminare+V',
 'concitare+V',
 'dissecare+V',
 'deregolamentare+V',
 'elettrizzare+V',
 'ripiovere+V',
 'ottimizzare+V',
 'accordare+V',
 'barbino+V',
 'arcaico+V',
 'sfuggire+V',
 'ritorcere+V',
 'rifiatare+V',
 'sgobbare+V',
 'adoprare+V',
 'abbigliare+V',
 'complimentare+V',
 'plurale+V',
 'soprassedere+V',
 'affermare+V',
 'frusciare+V',
 'amareggiare+V',
 'autoalimentare+V',
 'escutere+V',
 'secco+V',
 'talentare+V',
 'derapare+V',
 'distinguere+V',
 'frignare+V',
 'sincopare+V',
 'tentacolare+V',
 'scomponibile+V',
 'microfilmare+V',
 'disinnest

In [160]:
# Rebind `ans` to the tagged list; both names now refer to the same list object.
ans = typ
ans

[0,
 'gettonare+V',
 'incidentale+V',
 'involutivo+V',
 'lieve+V',
 'comunistizzare+V',
 'vidimare+V',
 'imbrodare+V',
 'strillare+V',
 'cisti+V',
 'compassato+V',
 'curia+V',
 'snobbare+V',
 'tessere+V',
 'congelare+V',
 'somatizzare+V',
 'impoverire+V',
 'smungere+V',
 'abbuffare+V',
 'meravigliare+V',
 'risucchiare+V',
 'sesquipedale+V',
 'timido+V',
 'nauseabondo+V',
 'ingozzare+V',
 'settimino+V',
 'relazionare+V',
 'sorridere+V',
 'illuminare+V',
 'concitare+V',
 'dissecare+V',
 'deregolamentare+V',
 'elettrizzare+V',
 'ripiovere+V',
 'ottimizzare+V',
 'accordare+V',
 'barbino+V',
 'arcaico+V',
 'sfuggire+V',
 'ritorcere+V',
 'rifiatare+V',
 'sgobbare+V',
 'adoprare+V',
 'abbigliare+V',
 'complimentare+V',
 'plurale+V',
 'soprassedere+V',
 'affermare+V',
 'frusciare+V',
 'amareggiare+V',
 'autoalimentare+V',
 'escutere+V',
 'secco+V',
 'talentare+V',
 'derapare+V',
 'distinguere+V',
 'frignare+V',
 'sincopare+V',
 'tentacolare+V',
 'scomponibile+V',
 'microfilmare+V',
 'disinnest

In [7]:
# The lemma search takes a VERY long time to run, so play it safe and
# persist the result to disk right away.
a = pd.DataFrame({'ans': ans})
a.to_csv("zero_forms2.txt", sep=',')

NameError: name 'ans' is not defined

In [17]:
# Reload the lemmas saved earlier instead of recomputing them. The index that
# to_csv wrote out comes back as an ordinary 'Unnamed: 0' column.
a = pd.read_csv("zero_forms2.txt", sep=',')
a.head(5)

Unnamed: 0.1,Unnamed: 0,ans
0,0,0
1,0,gettonare
2,1,incidentale
3,2,involutivo
4,3,lieve


In [18]:
# Preview the first five rows of the training table.
train.head(5)

Unnamed: 0,word,zero_form,type
1,vergognerete,vergognare,V
2,amnistiavate,amnistiare,V
3,menomazione,menomazione,N
4,sfaldavamo,sfaldare,V
5,sfodererei,sfoderare,V


In [19]:
import sklearn.linear_model as lm
import sklearn.metrics as mt
import numpy as np
import pandas as pd
import locale 
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split, cross_val_predict


In [20]:
# Training frame for the part-of-speech classifier: the word and its tag only.
linear_train = train.drop(['zero_form'], axis=1)
# Encode the tags A / N / V as 0 / 1 / 2. The replacement is applied to the
# 'type' column only: a frame-wide replace would also rewrite any *word* that
# is literally 'A', 'N' or 'V' into an integer and break text vectorisation.
linear_train['type'] = linear_train['type'].replace(['A', 'N', 'V'], [0, 1, 2])
linear_train.head(5)

Unnamed: 0,word,type
1,vergognerete,2
2,amnistiavate,2
3,menomazione,1
4,sfaldavamo,2
5,sfodererei,2


In [21]:
# Bag of character n-grams: the 'char_wb' analyzer with n-gram lengths from 2
# to 8, and case left untouched (lowercase=False).
count = CountVectorizer(analyzer='char_wb', ngram_range=(2, 8),lowercase = False)
# Learn the n-gram vocabulary from the training words and encode them as a
# sparse count matrix (one row per word).
sparse_feature_matrix = count.fit_transform(linear_train['word'])
sparse_feature_matrix

<118640x491873 sparse matrix of type '<type 'numpy.int64'>'
	with 6978275 stored elements in Compressed Sparse Row format>

In [11]:
# L2-regularised logistic regression with a fixed seed, a raised iteration cap
# and a tight convergence tolerance; n_jobs=-1 requests all available cores.
algo = LogisticRegression(penalty='l2', random_state=42, max_iter=1000, n_jobs=-1, tol=1e-6)
# Fit on the n-gram counts against the integer-encoded 'type' column.
algo.fit(sparse_feature_matrix, linear_train['type'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=42, solver='liblinear', tol=1e-06,
          verbose=0, warm_start=False)

In [22]:
# Encode the test words with the vocabulary learned on the training set.
mat = count.transform(test['X'])
# Predict the integer class per word and put a 0 placeholder in front, so the
# array has one more entry than there are test words.
y = np.insert(algo.predict(mat), 0, 0)
y

array([0, 2, 0, ..., 2, 2, 2])

In [23]:
# Store the predicted class codes next to the lemmas as a 'Category' column.
a['Category'] = y
a.head(5)

Unnamed: 0.1,Unnamed: 0,ans,Category
0,0,0,0
1,0,gettonare,2
2,1,incidentale,0
3,2,involutivo,2
4,3,lieve,1


In [24]:
# Map the integer class codes 0 / 1 / 2 back to the tags A / N / V.
a['Category'] = a['Category'].replace([0, 1, 2], ['A', 'N', 'V'])
# Discard the 'Unnamed: 0' column that came back from the CSV file.
a = a.drop(['Unnamed: 0'], axis = 1)
# Discard the row labelled 0 (the placeholder row).
a = a.drop([0], axis = 0)
# Lemma column as a NumPy array; the next cell rewrites its entries in place.
Category = a['ans'].values
Category

array(['gettonare', 'incidentale', 'involutivo', ..., 'spazzolare',
       'stuzzicante', 'impiagare'], dtype=object)

In [25]:
# Turn every lemma into "<lemma>+<tag>", writing the result back into the
# `Category` array entry by entry.
cat = a['Category'].values
# Recognised tags map to themselves; any other label contributes an empty tag.
tag_suffix = {'V': 'V', 'N': 'N', 'A': 'A'}
for row, label in enumerate(cat):
    Category[row] = Category[row] + '+' + tag_suffix.get(label, '')
# Put a 0 placeholder in front of the finished array.
Category = np.insert(Category, 0, 0)
Category


array([0, 'gettonare+V', 'incidentale+A', ..., 'spazzolare+V',
       'stuzzicante+V', 'impiagare+V'], dtype=object)

In [26]:
# Wrap the "<lemma>+<tag>" strings in a one-column frame and write it out.
RES = pd.DataFrame({'Category': Category})
RES.to_csv("zero_forms4.txt", sep=',')