In [1771]:
# import libraries
import pandas as pd
import numpy as np

from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_string_dtype

from functools import reduce

from sklearn.svm import SVC, LinearSVC

from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# widen the pandas display limits so wide/long frames are shown in full
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [2015]:
# Load the train and test datasets, tagging each row with its origin
# (train == 1 for training rows, train == 0 for test rows) at load time.
train = pd.read_csv('data/train_dataset_train.csv').assign(train=1)
test = pd.read_csv('data/test_dataset_test.csv').assign(train=0)

# Stack both parts into one frame so missing values are handled the same
# way for train and test rows.
data = pd.concat([train, test])

In [2016]:
# sanity check: shape of the rows that came from the test file
data.loc[data['train'].eq(0)].shape

(262, 3)

In [2017]:
# Prepared per-id side tables, listed in the order they are read:
# calls, connection, education, network, skud, tasks, work.
prepared_paths = (
    'data/prepared/grouped_calls.csv',
    'data/prepared/grouped_connection.csv',
    'data/prepared/education.csv',
    'data/prepared/grouped_network.csv',
    'data/prepared/grouped_skud.csv',
    'data/prepared/grouped_tasks.csv',
    'data/prepared/grouped_work.csv',
)
# read every file and bind each frame to its own name
calls, connection, education, network, skud, tasks, work = map(pd.read_csv, prepared_paths)

In [2018]:
# frames to combine: the labelled base frame first, then every side table
dataframes = [data, calls, connection, education, network, skud, tasks, work]
# Left-join each side table onto the base frame by 'id', one after another,
# so every base row is kept even when a side table has no match for it.
for side_table in dataframes[1:]:
    data = data.merge(side_table, on=['id'], how='left')

In [2019]:
# organization feature: the character at position 3 of the id, as an integer
# (the original note lists the values 1 and 2)
data['org'] = data['id'].map(lambda raw_id: int(raw_id[3]))
# the raw id is no longer needed once the organization has been extracted
data = data.drop(columns='id')

In [2020]:
# drop the categorical columns
data = data.drop(columns=['Вид образования', 'Специальность'])

In [2021]:
# Presence flags: a missing value is replaced with 0 in every exists_* column.
exists_cols = [
    'exists_in_calls',
    'exists_in_connection',
    'exists_in_education',
    'exists_in_network',
    'exists_in_skud',
    'exists_in_tasks',
    'exists_in_work',
]
data[exists_cols] = data[exists_cols].fillna(0)

In [2022]:
# Mean-impute the remaining gaps in the numeric feature columns.
# The target ('type') and the train/test marker ('train') are excluded on
# purpose: a missing label must never be replaced by a fractional "average
# class", and the marker is bookkeeping, not a feature. Non-numeric columns
# are skipped instead of raising inside .mean().
protected_cols = ['type', 'train']
feature_means = (
    data.drop(columns=protected_cols, errors='ignore')
        .mean(numeric_only=True)
)
# fillna with a Series fills each column with its own value and leaves
# columns that are absent from the Series untouched
data = data.fillna(feature_means)

In [2023]:
# independent (deep) copy used for modelling, so `data` itself stays untouched
data_model = data.copy(deep=True)

In [2024]:
# rows that belong to the training part; computed once and reused for X and y
is_train_row = data_model.train == 1
# X for train part: every column except the target
# NOTE(review): the constant `train` marker column stays in X as a feature - confirm this is intended
X = data_model.loc[is_train_row].drop(columns='type')
# y for train part: the target as a flat 1-D array
y = data_model.loc[is_train_row, 'type'].to_numpy()

In [2025]:
# Pipeline: scale -> reduce dimensionality -> oversample -> classify.
# Every stochastic step gets the same seed so that a fresh kernel
# reproduces the same fit; 24 is the seed already used for the
# train/validation split in this notebook.
RANDOM_STATE = 24

# StandardScaler
scaler = StandardScaler()
# principal component analysis, keeping 80 components
# (random_state only matters when a randomized/arpack solver is selected)
pca = PCA(n_components=80, random_state=RANDOM_STATE)
# oversampler: SMOTE generates synthetic samples at random, so it is seeded
smote = SMOTE(k_neighbors=6, random_state=RANDOM_STATE)
# C-Support Vector Classification with a linear kernel
clf = SVC(C=0.01, kernel='linear')

# pipeline (Pipeline is the one imported from imblearn.pipeline, which
# accepts the SMOTE sampler as an intermediate step)
pipe = Pipeline([
    ('scaler', scaler),
    ('pca', pca),
    ('smote', smote),
    ('clf', clf)
])

In [2026]:
# GridSearchCV hyperparameter tuning (currently disabled - the search below is commented out)

#param_grid = {'smote__k_neighbors': [4,5,6,7],
#              'clf__C': [0.001, 0.01, 0.1, 1, 10],
#            }
#
#cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=5, random_state=24)
#
#gridsearch = GridSearchCV(pipe, param_grid=param_grid, cv=cv, scoring='recall_macro', n_jobs=-1, verbose=2)

In [2027]:
#gridsearch.fit(X, y)
#print(gridsearch.best_params_)

In [2029]:
# hold out a stratified 25% of the training rows for validation
(
    X_train,
    X_valid,
    y_train,
    y_valid,
) = train_test_split(X, y, stratify=y, test_size=0.25, random_state=24)
# fit the pipeline on the training part, then predict the held-out part
pred = pipe.fit(X_train, y_train).predict(X_valid)
# per-class precision / recall / f1 on the validation part
print(classification_report(y_true=y_valid, y_pred=pred))

              precision    recall  f1-score   support

           0       0.51      0.55      0.53        38
           1       0.33      0.68      0.45        19
           2       0.64      0.39      0.48        75
           3       0.23      0.50      0.32         6

    accuracy                           0.48       138
   macro avg       0.43      0.53      0.44       138
weighted avg       0.55      0.48      0.48       138



In [1995]:
# Build the submission feature matrix from the test rows of data_model.
# The row mask is taken from data_model itself (not from `data`), so the
# selection cannot drift if the two frames ever stop sharing an index, and
# the columns are taken from X so the test features always match the
# columns the pipeline was fitted on.
X_sub = data_model.loc[data_model.train == 0, X.columns]
# predict test for submit
predicted = pipe.predict(X_sub)
# re-import the untouched test file so the submission keeps its original
# columns (this fresh copy has no 'train' marker column to remove)
test = pd.read_csv('data/test_dataset_test.csv')
# Predictions are attached by position, so there must be exactly one
# prediction per row of the file; fail loudly if the joins changed the
# number of test rows.
if len(predicted) != len(test):
    raise ValueError(
        f'got {len(predicted)} predictions for {len(test)} test rows; '
        'the merged test rows no longer match the test file'
    )
# set prediction to df
test['type'] = predicted

In [1996]:
# save the predictions as the submission file, without the index column
test.to_csv(path_or_buf='exp111.csv', index=False)

In [1999]:
# read the written submission back and show how many rows fall in each predicted class
submission_check = pd.read_csv('exp111.csv')
submission_check['type'].value_counts()

2    92
0    77
1    73
3    20
Name: type, dtype: int64