In [1]:
import pandas as pd
import ruts

from ruts import BasicStats
from ruts import ReadabilityStats
from ruts import DiversityStats
from ruts import MorphStats

import re
import tqdm

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import f1_score



In [2]:
def get_metrics_dataset(df):

    from ruts import BasicStats
    from ruts import ReadabilityStats
    from ruts import DiversityStats
    from ruts import MorphStats
    from tqdm import tqdm
    import pandas as pd

    books = []
    authors = []
    foreign = []

    df_new = pd.DataFrame()
    d = {}

    for part_id in tqdm(range(len(df['text']))):
        chapter_text = df['text'][part_id].strip().replace('\n', '')

        try:
            
            # добавляем базовые статистики
            bs = BasicStats(chapter_text)
            bs_data = dict(bs.get_stats())
            del bs_data['c_letters']
            del bs_data['c_syllables']


            # метрики читаемости
            rs = ReadabilityStats(chapter_text)
            rs_data = dict(rs.get_stats())


            # морфологические метрики
            ms = MorphStats(chapter_text)
            ms_data = dict(ms.get_stats())


            # метрики лексического разнообразия
            ds = DiversityStats(chapter_text)
            ds_data = dict(ds.get_stats())
            

            d[str(part_id) + ' ' + df['author'][part_id] + ' ' + df['work_title'][part_id]] = {**bs_data, **rs_data, 
                                                                                               **ms_data['gender'], **ms_data['number'], 
                                                                                               **ms_data['tense'], **ms_data['voice'], 
                                                                                               **ms_data['person'], **ds_data}
            df_new = df_new.append(d[str(part_id) + ' ' + df['author'][part_id] + ' ' + df['work_title'][part_id]], ignore_index=True)

            books.append(str(part_id) + ' ' + df['author'][part_id] + ' ' + df['work_title'][part_id])
            authors.append(df['author'][part_id])
            
            if re.sub("[^a-zA-Z]+", "", chapter_text) != '':
                foreign.append(1)
            else :
                foreign.append(0)
                
        except:
            pass
        
    df_new.index = books
    df_new['author'] = authors
    df_new['foreign'] = foreign
        
    return df_new

In [3]:
def prepare_metric_dataset(data, final_features):
    df = data.copy()
    for column in ['neut', 'masc', 'femn', 'plur','sing', 'pres', 'past', 
                 'futr', 'pssv', 'actv', '2per','3per', '1per']:
        df[column] = df[column] / df['n_words'] # делаем проценты вместо чисел для некоторых показателей
    df.index = df['Unnamed: 0'] # переопределяем индекс
    return df[final_features]

In [4]:
def get_variables(df_train, df_test):
    df_train1 = df_train.copy()
    df_test1 = df_test.copy()

    X_train = df_train1.drop(['author'], axis=1)
    y_train = df_train1['author']
    X_test = df_test1.drop(['author'], axis=1)
    y_test = df_test1['author']
    X_train.fillna(value=0, inplace=True)
    X_test.fillna(value=0, inplace=True)
#    X_train = X_train.set_index('Unnamed: 0')
#    X_test = X_test.set_index('Unnamed: 0')
    return X_train, y_train, X_test, y_test


In [5]:
metrics_train = pd.read_csv('metrics_raw_train_foreign250.csv')
metrics_test = pd.read_csv('metrics_raw_test_foreign250.csv')

final_features = ['smog_index', 'masc', 'neut', 'femn', 'sing', 'plur', 'past', 'futr',
       'pres', 'actv', '1per', '2per', '3per', 'mttr', 'hdd', 'pssv',
       'foreign', 'author']

cm_train = prepare_metric_dataset(metrics_train, final_features)
cm_test = prepare_metric_dataset(metrics_test, final_features)

In [6]:
#metrics_train.to_csv('metrics_raw_train_foreign250.csv')
#metrics_test.to_csv('metrics_raw_test_foreign250.csv')

In [7]:
cm_train.columns == cm_test.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

In [13]:
X_train.columns

Index(['smog_index', 'masc', 'neut', 'femn', 'sing', 'plur', 'past', 'futr',
       'pres', 'actv', '1per', '2per', '3per', 'mttr', 'hdd', 'pssv',
       'foreign'],
      dtype='object')

In [8]:
X_train, y_train, X_test, y_test =  get_variables(cm_train, cm_test)

X_train2 = X_train.copy()
X_test2 = X_test.copy()
X_train2.drop(columns='foreign', inplace = True)
X_test2.drop(columns='foreign', inplace = True)

In [9]:
model = CatBoostClassifier()
model.fit(X_train, y_train, silent=True)
preds = model.predict(X_test)
print('F1-score: ', f1_score(y_test, preds, average='weighted'))

F1-score:  0.3617921492444837


In [10]:
model = CatBoostClassifier()
model.fit(X_train2, y_train, silent=True)
preds = model.predict(X_test2)
print('F1-score: ', f1_score(y_test, preds, average='weighted'))

F1-score:  0.34887386021923117


In [11]:
#multi_class='multinomial', max_iter=10000, C=121
logreg_model = LogisticRegression(multi_class='multinomial', max_iter=10000, C=121) # уже подобранные гиперпараметры
logreg_model.fit(X_train, y_train)
pred_logreg = logreg_model.predict(X_test)
print('F1-score: ', f1_score(y_test, pred_logreg, average='weighted'))

F1-score:  0.35552174283740146


In [12]:
logreg_model = LogisticRegression() # уже подобранные гиперпараметры
logreg_model.fit(X_train2, y_train)
pred_logreg = logreg_model.predict(X_test2)
print('F1-score: ', f1_score(y_test, pred_logreg, average='weighted'))

F1-score:  0.2561324008402221
