In [None]:
import numpy as np
import pandas as pd
import  matplotlib.pyplot as plt
from sklearn import metrics 
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec, KeyedVectors
import time
import warnings
warnings.filterwarnings('ignore')

In [None]:
def model_lr_gcv(X, y, c, random_state=None):
    """Fit a logistic regression with 5-fold cross-validation for one value of C.

    Parameters
    ----------
    X : feature matrix passed straight to ``GridSearchCV.fit``.
    y : binary target for a single label.
    c : inverse regularisation strength; the grid contains only this value.
    random_state : optional seed for the shuffled ``KFold`` split. The default
        ``None`` keeps the original (non-reproducible) shuffling.

    Returns
    -------
    The fitted ``GridSearchCV`` object, refitted on the ``roc_auc`` metric.
    ``cv_results_`` exposes ``mean_test_log_loss`` and ``mean_test_roc_auc``.
    """
    start = time.time()
    scorer = {
        # Positive log-loss computed from predicted probabilities.
        'log_loss': metrics.make_scorer(log_loss, needs_proba=True),
        # Built-in scorer: ranks by continuous scores. The previous
        # make_scorer(roc_auc_score) fed hard 0/1 predictions to ROC-AUC,
        # which is not comparable to the probability-based test ROC-AUC.
        'roc_auc': 'roc_auc',
    }
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    parameters = {'C': [c], 'solver': ['sag'], 'max_iter': [300]}
    gcv = GridSearchCV(estimator=LogisticRegression(), param_grid=parameters,
                       scoring=scorer, refit='roc_auc', cv=kf, n_jobs=2)

    gcv.fit(X, y)
    end = time.time()

    # Split the elapsed time into hours / minutes-within-hour / seconds.
    hours, remainder = divmod(end - start, 3600)
    minutes, seconds = divmod(remainder, 60)
    print('Time %d:%d:%f' % (hours, minutes, seconds))
    return gcv

def tfidf(X_train, X_test, max_features):
    """Build word-level TF-IDF features for the train and test comments.

    The vectorizer is fitted on ``X_train['comment_text']`` only and then
    applied to ``X_test['comment_text']``.

    Parameters
    ----------
    X_train, X_test : objects indexable by ``'comment_text'`` (e.g. DataFrames).
    max_features : vocabulary size cap passed to ``TfidfVectorizer``.

    Returns
    -------
    (train_features, test_features, fitted_vectorizer)
    """
    tfidf_word = TfidfVectorizer(
        analyzer='word',
        max_df=0.1,
        ngram_range=(1, 2),
        max_features=max_features)
    # Use the arguments, not the notebook-level globals X / test_X.
    tfidf_X = tfidf_word.fit_transform(X_train['comment_text'])
    tfidf_test_X = tfidf_word.transform(X_test['comment_text'])
    return tfidf_X, tfidf_test_X, tfidf_word
    
    
def save_result(out, result, types):
    """Write per-C metric summaries to ``<out>.csv``.

    Parameters
    ----------
    out : output path without the ``.csv`` extension.
    result : mapping ``C -> scores`` where ``scores[(0, t)]`` / ``scores[(1, t)]``
        hold the log-loss / ROC-AUC for label ``t`` and ``scores[0]`` /
        ``scores[1]`` hold the corresponding averages.
    types : label names to report, in output order.

    Each CSV cell is a multi-line string with one ``label, value`` line per
    label followed by an ``average, value`` line.
    """
    # Index by the keys actually present in `result` rather than the global C.
    data = pd.DataFrame(index=list(result), columns=['ROC-AUC', 'Log loss'])
    for c, d in result.items():
        ra = ''.join('{}, {}\n'.format(t, d[(1, t)]) for t in types)
        ll = ''.join('{}, {}\n'.format(t, d[(0, t)]) for t in types)
        ra += 'average, {}\n'.format(d[1])
        ll += 'average, {}\n'.format(d[0])
        data.loc[c, 'ROC-AUC'] = ra
        data.loc[c, 'Log loss'] = ll
    data.to_csv('%s.csv' % out)
    
def save_plt(res_cv, res_test, name_metric, n, m, t):
    """Plot CV and test values of one metric against log10(C) and save as PNG.

    Parameters
    ----------
    res_cv, res_test : sequences of metric values, one per exponent in
        ``range(n, m)``.
    name_metric : metric name; used as the title and in the file name.
    n, m : bounds of the exponent range (``C = 10**x`` for ``x`` in ``range(n, m)``).
    t : label name, appended to the file name.

    The figure is written to ``<name_metric>-<t>.png`` in the working directory.
    """
    exponents = list(range(n, m))
    plt.clf()
    # 'o-' sets marker and line style only; colour comes from the keyword, so
    # the format string and `color=` no longer conflict.
    plt.plot(exponents, res_cv, 'o-',
             label='CV', linewidth=2, color='blue')
    plt.plot(exponents, res_test, 'o-',
             label='test', linewidth=2, color='orange')
    plt.ylim(0, 1)
    plt.ylabel('Оценка качества')
    plt.xlabel('C=10^x')
    plt.title(name_metric)
    # Without a legend the two curves cannot be told apart.
    plt.legend()
    plt.savefig('%s.png' % (name_metric+'-'+t))
    
def get_res(id_metric, t, scores):
    """Collect one metric for one label across every entry of ``scores``.

    Looks up the key ``(id_metric, t)`` in each value of ``scores`` (in the
    mapping's iteration order) and returns the results as a list; entries
    without that key contribute ``None``.
    """
    key = (id_metric, t)
    collected = []
    for per_c_scores in scores.values():
        collected.append(per_c_scores.get(key))
    return collected

In [None]:
# Label columns; one binary classifier is trained per label.
types = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Training set: text column as features, label columns as targets.
train = pd.read_csv('processed_train.csv', index_col='id')
X = train[['comment_text']]
y = train[types]

# Test set: text and labels come from separate files, both indexed by 'id'.
test_X = pd.read_csv('processed_test.csv', index_col='id')[['comment_text']]
test_y = pd.read_csv('test_labels.csv', index_col='id')

In [None]:
# Regularisation grid: C = 10**x for every integer exponent x in [n, m).
n = -1
m = 3
C = [10 ** exponent for exponent in range(n, m)]

In [None]:
# Vocabulary size cap for the TF-IDF vectorizer.
max_features = 40000
# Fit the vectorizer on the training comments and transform both sets.
train_data, test_data, model_tfidf = tfidf(
    X_train=X,
    X_test=test_X,
    max_features=max_features,
)

In [None]:
#0 - log_loss, 1 - roc_auc
cv_score = {}
test_score = {}
lr = None
for i in C:
    print('C = %f:' % i)
    cv_score[i] = {}
    test_score[i] = {}
    for t in types:
        print("Type %s: " %t)
        testY = np.array(test_y[t])
        lr = model_lr_gcv(train_data, y[t], i)
        pred_y = np.array(lr.predict(test_data))
        pred_proba = lr.predict_proba(test_data)
        pred_proba_y = []
        for x1, x2 in pred_proba:
             pred_proba_y.append(x2)
        cv_score[i][(0, t)] = lr.cv_results_['mean_test_log_loss'][0]
        cv_score[i][(1, t)] = lr.cv_results_['mean_test_roc_auc'][0]
        test_score[i][(0, t)] = log_loss(testY, pred_proba_y),
        test_score[i][(1, t)] = roc_auc_score(y_true=testY, y_score=pred_proba_y)

In [None]:
for t in types:
    save_plt(get_res(0, t, cv_score),  get_res(0, t, test_score),"log_loss", n, m, t)
    save_plt(get_res(1, t, cv_score),  get_res(1, t, test_score),"ROC_AUC", n, m, t)

In [None]:
for i in C:
    cv_score[i][0] = 0
    cv_score[i][1] = 0
    test_score[i][0] = 0
    test_score[i][1] = 0
    for t in types:
        cv_score[i][0] += cv_score[i][(0, t)]
        cv_score[i][1] += cv_score[i][(1, t)]
        test_score[i][0] += test_score[i][(0, t)][0]
        test_score[i][1] += test_score[i][(1, t)]
    cv_score[i][0] /= len(types)
    cv_score[i][1] /= len(types)
    test_score[i][0] /= len(types)
    test_score[i][1] /= len(types)
    

In [None]:
# Write the cross-validation and test summaries to <name>.csv.
for out_name, score_table in (("cv_score3", cv_score), ("test_score3", test_score)):
    save_result(out_name, score_table, types)

In [None]:
# Persist the TF-IDF feature matrices next to the notebook.
# NOTE(review): these presumably are scipy sparse matrices (TfidfVectorizer
# output); np.save would then pickle them as object arrays, so loading needs
# allow_pickle=True. Consider scipy.sparse.save_npz instead - confirm.
for file_name, matrix in (('train_data', train_data), ('test_data', test_data)):
    np.save(file_name, matrix)