In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from catboost import CatBoostClassifier, Pool
from sklearn import tree
import numpy as np
from sklearn.metrics import recall_score, precision_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import random
import sklearn

# Raise pandas' display limits so wide / long frames are not truncated in cell output.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 200

# Seed the stdlib and NumPy global RNGs. `seed` is a function and must be
# *called*; assigning an integer to it (as was done before) replaces the
# function object and leaves both generators unseeded.
random.seed(6342)
np.random.seed(4342)

%matplotlib inline

In [2]:
# Columns to discard right after loading; the list is empty, so nothing is dropped yet.
bad_cols = []

# Both frames go through the same column filter.
train, test = (
    pd.read_csv(csv_path).drop(columns=bad_cols)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

# Submission template: its 'target' column is overwritten with predictions
# before the frame is written back out.
sample = pd.read_csv('data/sample_solution.csv')

In [3]:
# Derive per-row features on both frames. Each frame is mutated in place, and
# the two date columns are dropped, so this cell needs freshly loaded frames
# to be re-run.
for df in train, test:
    # Whole days between `carts_created_at` and `month_id`.
    # `.dt.days` replaces `.astype("timedelta64[D]")`, which pandas >= 2.0
    # rejects (only s/ms/us/ns resolutions are supported). Both floor to whole
    # days; the float64 cast keeps the dtype the old conversion produced and
    # keeps train/test dtypes identical whether or not a frame contains NaT.
    month_start = pd.to_datetime(df['month_id'])
    cart_created = pd.to_datetime(df['carts_created_at'])
    df['time_diff'] = (month_start - cart_created).dt.days.astype('float64')
    df.drop(['month_id', 'carts_created_at'], axis=1, inplace=True)

    # Missing gender becomes its own value (-1); int8 keeps the column compact.
    df['gender'] = df['gender'].fillna(-1).astype(np.int8)

    # completed_hw / failed_hw as a fraction of interacted_hw. A zero
    # denominator is mapped to NaN first so that x/0 yields NaN rather than
    # +/-inf: NaN is cleaned by the later fillna(0), inf is not and would be
    # rejected by scikit-learn estimators. (0/0 was already NaN.)
    interacted = df['interacted_hw'].replace(0, np.nan)
    df['procentage_completed_hw'] = df['completed_hw'] / interacted
    df['procentage_failed_hw'] = df['failed_hw'] / interacted

In [4]:
# Fitted encoders, keyed by column name.
all_dicts = {}

# Columns that are replaced by integer codes.
cat_cols = ['communication_type', 'payment_type', 'promo', 'gender', 'ABC']

for x in cat_cols:
    # Fit on the labels of BOTH frames. LabelEncoder.transform raises
    # ValueError for a label it did not see during fit, so fitting on train
    # alone breaks as soon as test contains a category absent from train.
    # When test introduces no new labels, the sorted class list - and hence
    # every code - is the same as with a train-only fit.
    all_labels = pd.concat([train[x], test[x]], ignore_index=True)
    all_dicts[x] = LabelEncoder().fit(all_labels)
    train[x] = all_dicts[x].transform(train[x])
    test[x] = all_dicts[x].transform(test[x])

In [5]:
# Names of the columns that train still stores with dtype "object".
is_object_col = (train.dtypes == "object").to_numpy()
mass_object = train.columns[is_object_col].tolist()

# Remove those columns from both frames (the list is derived from train only).
train = train.drop(columns=mass_object)
test = test.drop(columns=mass_object)

# `features` is the very same object as `train` (no copy) and therefore still
# carries the 'target' column; `target` is that column on its own.
features = train
target = features['target']

In [6]:
# For each grouping key, attach to `test` the distribution of `target` seen in
# the labelled frame: six per-class columns (count, then turned into a share)
# plus the total number of labelled rows found for that key.
for key, prefix in (('program_id', 'program_info'), ('student_id', 'student_info')):
    share_cols = [f'{prefix}_{cls}' for cls in range(6)]
    total_col = f'{prefix}_count'

    for cls, col in enumerate(share_cols):
        # Number of labelled rows of class `cls` per key value.
        per_key = (
            features.loc[features['target'] == cls]
            .groupby(key)['target']
            .count()
            .to_frame(col)
        )
        # Keys with no labelled row of this class get NaN after the left join.
        test = test.merge(per_key, how='left', left_on=key, right_index=True)

    # Row-wise sum skips NaN, so a key never seen in the labelled data gets 0.
    test[total_col] = test[share_cols].sum(axis=1)

    # Counts -> shares of the per-key total.
    for col in share_cols:
        test[col] /= test[total_col]

In [7]:
# 5-fold stratified cross-validation of a random forest. Inside every fold the
# per-program and per-student class shares of `target` are recomputed from the
# fold's training part only, then joined onto the training and validation parts.
strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=777)
scores = []
# Class probabilities for the test rows, accumulated over the folds. The sum is
# never divided by the number of folds; the argmax taken from it afterwards is
# unaffected by that constant factor.
test_preds = np.zeros((len(test), 6))

# Columns excluded from the model matrix (loop-invariant, so defined once).
very_bad = ['hw_leader', 'procentage_failed_hw', 'failed_hw', 'spent_time_to_complete_hw']

for fold, (train_index, valid_index) in enumerate(strat_kfold.split(features, target)):
    # split() yields POSITIONAL indices, so select positionally with .iloc.
    # Label-based `target[train_index]` only coincides with that while the
    # frame keeps its default RangeIndex.
    X_train, y_train = features.iloc[train_index], target.iloc[train_index]
    X_valid, y_valid = features.iloc[valid_index], target.iloc[valid_index]

    # `features` still contains 'target'; it is needed for the counts below
    # and removed before fitting.
    for key, prefix in (('program_id', 'program_info'), ('student_id', 'student_info')):
        share_cols = [f'{prefix}_{x}' for x in range(6)]
        total_col = f'{prefix}_count'

        for x, col in enumerate(share_cols):
            # Rows of class x per key value, counted on the training part only.
            new = X_train[X_train['target'] == x].groupby(key)['target'].agg(['count'])
            new.columns = [col]
            X_train = X_train.merge(new, how='left', left_on=key, right_index=True)
            X_valid = X_valid.merge(new, how='left', left_on=key, right_index=True)
            # Leave-one-out: a training row must not count its own label.
            X_train.loc[X_train['target'] == x, col] -= 1

        # Row-wise sum skips NaN, so unseen keys get a total of 0.
        X_train[total_col] = X_train[share_cols].sum(axis=1)
        X_valid[total_col] = X_valid[share_cols].sum(axis=1)

        # Counts -> shares of the per-key total.
        for col in share_cols:
            X_train[col] /= X_train[total_col]
            X_valid[col] /= X_valid[total_col]

        # Only the training shares are filled here; validation and test NaNs
        # are handled by the blanket fillna(0) applied right before predicting.
        X_train[share_cols] = X_train[share_cols].fillna(0)

    X_train, X_valid = X_train.drop('target', axis=1), X_valid.drop('target', axis=1)

    # Exact, ordered list of columns the model is fitted on. Selecting the
    # validation and test matrices by this list keeps every column in the
    # position the estimator was trained with, instead of relying on `test`
    # happening to share the training frame's column order. Index.drop raises
    # KeyError for a missing label, just like DataFrame.drop did.
    model_cols = X_train.columns.drop(very_bad)

    clf = RandomForestClassifier(
        random_state=0
    )

    clf.fit(X_train[model_cols].fillna(0), y_train)
    pred = clf.predict(X_valid[model_cols].fillna(0))

    # Fold score: 0.2 * macro recall + 0.8 * macro precision.
    scores.append(0.2* recall_score(y_valid, pred, average='macro') + 0.8* precision_score(y_valid, pred, average='macro'))
    print(scores[-1])

    test_preds += clf.predict_proba(test[model_cols].fillna(0))

f'Final_score: {np.mean(scores)}'

0.9061660956083841
0.9036268326062735
0.9085509392769997
0.8949287937557883
0.9117831561834415


'Final_score: 0.9050111634861775'

In [8]:
# Predicted class per test row = index of the column with the largest
# accumulated probability; store it in the template and write the submission.
predicted_class = np.argmax(test_preds, axis=1)
sample['target'] = predicted_class
sample.to_csv('submit.csv', index=False)

In [9]:
# Number of test rows assigned to each predicted class, most frequent first.
sample.loc[:, 'target'].value_counts()

0    74550
1     5684
4     1441
5     1389
3     1278
2      655
Name: target, dtype: int64