In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')

import os

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC

from sklearn.metrics import log_loss, accuracy_score, f1_score

from imblearn.over_sampling import SMOTE

In [2]:
path = 'C:\\Users\\sunil\\Projects\\Machine Hack\\Merchandise Popularity Prediction\\Dataset'

train = pd.read_csv(path + '\\Train.csv')
test = pd.read_csv(path + '\\Test.csv')
sample_sub = pd.read_csv(path + '\\sample_submission.csv')

In [4]:
target = 'popularity'
features = [col for col in train.columns if col not in [target]]

In [5]:
trn, val = train_test_split(train, test_size=0.2, random_state = 1, stratify = train[target])

###### Input to our model will be the features
X_trn, X_val = trn[features], val[features]

###### Output of our model will be the TARGET_COL
y_trn, y_val = trn[target], val[target]

##### Features for the test data that we will be predicting
X_test = test[features]

In [6]:
clf = ExtraTreesClassifier(random_state = 1,max_depth = 35, n_estimators = 2000)
_ = clf.fit(X_trn, y_trn)

preds_val = clf.predict_proba(X_val)

log_loss(y_val, preds_val)

0.3436662428845033

In [7]:
clf = XGBClassifier(random_state = 1)
_ = clf.fit(X_trn, y_trn)

preds_val = clf.predict_proba(X_val)

log_loss(y_val, preds_val)

0.3983739765664552

In [7]:
{'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}

{'linear', 'poly', 'precomputed', 'rbf', 'sigmoid'}

In [35]:
preds = clf.predict_proba(X_test)

In [36]:
sample = pd.DataFrame(preds)
sample.to_csv(path+'\\xgb_baseline.csv', index=False)

In [37]:
def cross_val(regressor, train, test, features):
    N_splits = 5
    
    oofs = np.zeros(shape=(len(train), 5))
    preds = np.zeros(shape=(len(test), 5))
    
    target_col = train[target]
    
    folds = StratifiedKFold(n_splits = N_splits, shuffle = True,random_state = 1999)
    stratified_target = pd.qcut( train[target], 10, labels=False, duplicates='drop')
    ""
    for index, (trn_idx, val_idx) in enumerate(folds.split(train, stratified_target)):
        print(f'\n================================Fold{index + 1}===================================')
        
        #### Train Set
        X_trn, y_trn = train[features].iloc[trn_idx], train[target].iloc[trn_idx]
        
        #### Validation Set
        X_val, y_val = train[features].iloc[val_idx], train[target].iloc[val_idx]
        
        #### Test Set
        X_test = test[features]
        
        #### Scaling Data ####
        #scaler = StandardScaler()
        #_ = scaler.fit(X_trn)
        #
        #X_trn = scaler.transform(X_trn)
        #X_val = scaler.transform(X_val)
        #X_test = scaler.transform(X_test)
        
        ############ Fitting #############
        _ = regressor.fit(X_trn, y_trn)
        
        ############ Predicting #############
        val_preds = regressor.predict_proba(X_val)
        test_preds = regressor.predict_proba(X_test)
        
        error = log_loss(y_val, val_preds)
        print(f'\n Logloss for Validation set is : {error}')
        
        oofs[val_idx] = val_preds
        preds += test_preds / N_splits
        
    total_error = log_loss(target_col, oofs)
    print(f'\n\Logloss for oofs is {total_error}')
    
    return oofs, preds

In [10]:
%%time
lr_oofs, lr_preds = cross_val(clf, trn, test, features)



 Logloss for Validation set is : 0.43849472942407924


 Logloss for Validation set is : 0.44175756392152277


 Logloss for Validation set is : 0.44294865367705233


 Logloss for Validation set is : 0.460012143727871


 Logloss for Validation set is : 0.4419423854521928

\Logloss for oofs is 0.44503064649924085
Wall time: 1min 58s


---
# Model Building

In [18]:
target = 'popularity'

In [19]:
t0 = train.copy()
t1 = train.copy()
t3 = train.copy()
t4 = train.copy()
t5 = train.copy()

In [20]:
t0[target] = t0[target].apply(lambda x: 1 if x==0 else 0)
t1[target] = t1[target].apply(lambda x: 1 if x==1 else 0)
t3[target] = t3[target].apply(lambda x: 1 if x==3 else 0)
t4[target] = t4[target].apply(lambda x: 1 if x==4 else 0)
t5[target] = t5[target].apply(lambda x: 1 if x==5 else 0)

In [21]:
t0[target].value_counts()

0    18192
1       16
Name: popularity, dtype: int64

In [22]:
df_ls = [t0, t1, t3, t4, t5]

In [23]:
df_name_ls = ['0', '1', '2','3', '4']

In [24]:
target_proba_df = pd.DataFrame()
test_proba_df = pd.DataFrame()

train_probas = pd.DataFrame()

In [25]:
total_loss = 0
for name, df in zip(df_name_ls,df_ls):
    trn, val = train_test_split(df, test_size=0.2, random_state = 1, stratify = df[target])

    ###### Input to our model will be the features
    X_trn, X_val = trn[features], val[features]

    ###### Output of our model will be the TARGET_COL
    y_trn, y_val = trn[target], val[target]

    ##### Features for the test data that we will be predicting
    X_test = test[features]
    
    ## Oversampling Using SMOTE
    #sm = SMOTE(random_state=1)
    #X_trn_os, y_trn_os = sm.fit_sample(X_trn, y_trn.ravel())
    
    ## Training and Predicting
    clf = XGBClassifier(random_state=1)
    _ = clf.fit(X_trn, y_trn)
    val_preds = clf.predict_proba(X_val)[:,1]
    loss = log_loss(y_val, val_preds)
    total_loss+=loss
    print(f'log loss for t{name} is {loss}')
    
    # Predicting on full train
    trn_preds = clf.predict_proba(df[features])[:, 1]
    print(f'loss on full train is {log_loss(df[target], trn_preds)}')
    target_proba_df[name] = val_preds
    train_probas[int(name)] = trn_preds
    
    test_preds = clf.predict_proba(X_test)[:,1]
    test_proba_df[int(name)] = test_preds
    print()
print(f'Total loss is {total_loss}')

log loss for t0 is 0.00010766464530425834
loss on full train is 0.00011535138527560551

log loss for t1 is 0.06863565437490646
loss on full train is 0.020560055336419258

log loss for t2 is 0.29073961084829725
loss on full train is 0.15189471148673692

log loss for t3 is 0.3199145406412955
loss on full train is 0.18098456372179494

log loss for t4 is 0.098201012075115
loss on full train is 0.0318966199152879

Total loss is 0.7775984825849184


In [None]:
train_probas[target] = train_probas.idxmax(axis=1)

In [None]:
features = [0, 1, 2, 3, 4]
trn, val = train_test_split(train_probas, test_size=0.2, random_state = 1, stratify = train_probas[target])

###### Input to our model will be the features
X_trn, X_val = trn[features], val[features]

###### Output of our model will be the TARGET_COL
y_trn, y_val = trn[target], val[target]

##### Features for the test data that we will be predicting
X_test = test_proba_df[features]

In [None]:
clf = XGBClassifier(random_state=1)
clf.fit(X_trn, y_trn)

preds = clf.predict_proba(X_val)
loss = log_loss(y_val, preds)
loss

In [None]:
clf.fit(trn[features], trn[target])
preds = clf.predict_proba(X_test)

sample = pd.DataFrame(preds)
sample.to_csv(path+'\\normal self build model 1st try.csv', index=False)

In [None]:
preds

tasks for getting proba sum = 1:
    1. make cross validation inside for loop.
    2. Take probabilities for whole train set.
    3. train model on train probabilities and take predictions on test probabilities.

In [None]:

for name, df in zip(df_name_ls,df_ls):
    
    oofs = np.zeros(len(df))
    preds = np.zeros(len(test))
    
    folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state=1)
    stratified_target = pd.qcut( df[target], 10, label=False, duplicates='drop')
    
    for index, (trn_idx, val_idx) in enumerate(flods.split(df, stratified_target)):
        X_trn, y_trn = df[features].iloc[trn_idx], df[target].iloc[trn_idx]
        

In [None]:
indexes = test_proba_df.idxmax(axis = 1)