In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import lightgbm as lgb
%matplotlib inline

In [2]:
# Data directories, relative to the notebook's working directory. The trailing
# slash is required: file paths are built by plain string concatenation.
TRAIN_PATH, TEST_PATH = 'train/', 'test/'

In [3]:
# --- Training data ---------------------------------------------------------
# X1 and Y are keyed by their 'id' column; X2 keeps its default positional
# index because transform_x2 reads its first two columns by position.
X1 = pd.read_csv(TRAIN_PATH + 'X1.csv').set_index('id')
X2 = pd.read_csv(TRAIN_PATH + 'X2.csv')

Y = pd.read_csv(TRAIN_PATH + 'Y.csv').set_index('id')
# Prefix every target column with 'Y' so it cannot collide with feature names
# once everything is merged ('id' is already the index, so no column is skipped).
Y.columns = ['Y' + name for name in Y.columns]

# --- Test data (same layout, no targets) -----------------------------------
X1_test = pd.read_csv(TEST_PATH + 'X1.csv').set_index('id')
X2_test = pd.read_csv(TEST_PATH + 'X2.csv')

In [4]:
def transform_x2(X2, svd=None, n_components=50, min_users_per_group=10,
                 random_state=0):
    """Turn a two-column (id, group) edge list into dense SVD features per id.

    A binary membership matrix is built with one row per distinct value of the
    first column and one column per distinct value of the second column.
    Groups with fewer than ``min_users_per_group`` members are dropped, and the
    remaining matrix is projected with a truncated SVD.

    Parameters
    ----------
    X2 : pandas.DataFrame
        Only the first two columns are read, by position: the row key ("id")
        and the group key.
    svd : fitted transformer or None
        Object exposing ``transform``. When None, a new ``TruncatedSVD`` is
        fitted on the trimmed membership matrix.
        NOTE: the kept group columns are recomputed from ``X2`` on every call,
        so a pre-fitted ``svd`` is only meaningful for data that produces the
        same set of kept groups as the data it was fitted on.
    n_components : int
        Number of SVD components when a new model is fitted.
    min_users_per_group : int
        Minimum number of distinct ids a group needs to be kept.
    random_state : int or None
        Seed for the newly fitted ``TruncatedSVD`` so that the features are
        reproducible between runs; pass None for an unseeded fit.

    Returns
    -------
    (pandas.DataFrame, transformer)
        The frame has one row per distinct id, the SVD components in
        integer-named columns and the id itself in an ``'id'`` column; the
        second element is the transformer that was used.

    Raises
    ------
    ValueError
        If ``svd`` reports (via ``n_features_in_``) a different number of
        input columns than the trimmed matrix built from ``X2``.
    """
    rows, row_pos = np.unique(X2.iloc[:, 0], return_inverse=True)
    cols, col_pos = np.unique(X2.iloc[:, 1], return_inverse=True)

    # Dense 0/1 membership matrix; repeated (id, group) pairs collapse to 1.
    membership = np.zeros((len(rows), len(cols)))
    membership[row_pos, col_pos] = 1

    users_per_group = membership.sum(axis=0)
    trimmed = membership[:, users_per_group >= min_users_per_group]

    if svd is None:
        svd = TruncatedSVD(n_components=n_components, random_state=random_state)
        svd.fit(trimmed)
    else:
        # Fail early with a readable message instead of a shape error deep
        # inside the projection when the caller's model does not match.
        expected = getattr(svd, 'n_features_in_', None)
        if expected is not None and expected != trimmed.shape[1]:
            raise ValueError(
                "svd was fitted on %d group columns but this data yields %d "
                "after trimming" % (expected, trimmed.shape[1])
            )

    components = pd.DataFrame(svd.transform(trimmed))
    components['id'] = rows
    return components, svd


# Stack train and test edge lists so both share one membership matrix and one
# SVD basis. DataFrame.append was removed in pandas 2.0; pd.concat is the
# equivalent call (row-wise, original indices kept).
X2_all = pd.concat([X2, X2_test])
X2_all, svd = transform_x2(X2_all)

In [5]:
# Split the combined SVD features back into train and test rows by id
# membership, then key each part by 'id' so it can be merged onto X1 / X1_test.
is_train_id = X2_all['id'].isin(X1.index)
is_test_id = X2_all['id'].isin(X1_test.index)

X2 = X2_all[is_train_id].set_index('id')
X2_test = X2_all[is_test_id].set_index('id')

In [6]:
# Join tabular features, SVD features and targets on the shared 'id' index.
# merge() defaults to an inner join, so ids missing from either side are dropped.
X_train = (
    X1.copy()
    .merge(X2, on='id', suffixes=('', '_y'))
    .merge(Y, on='id')
)

X_test = X1_test.copy().merge(X2_test, on='id', suffixes=('', '_y'))

id_ = np.array(X_train.index)
id_test = np.array(X_test.index)

# Pull the five target columns out of the training frame, in order, so that
# X_train is left holding features only.
y1, y2, y3, y4, y5 = (X_train.pop('Y{}'.format(k)) for k in range(1, 6))

In [7]:
def modifyX(X, qCount):
    """Return X widened with quantile-bin, one-hot and is-zero feature columns.

    Steps, in order:
      1. Columns '6' and '8' are binned into (at most) ``qCount`` quantile
         buckets. Each bin Series is merged back on 'id'; because it carries
         the same name as its source column, pandas' default suffixes rename
         the pair to '<col>_x' (original values) and '<col>_y' (bin index).
      2. The bin columns '6_y' and '8_y' are one-hot encoded and merged in.
      3. For each of columns '2', '3', '5', '7', '9' an int32 "value == 0"
         flag is merged in, again producing '<col>_x' / '<col>_y' pairs.

    Parameters
    ----------
    X : pandas.DataFrame
        Indexed by 'id' (every merge is ``on='id'``); must contain the columns
        named above.
    qCount : int
        Number of quantiles requested from ``pd.qcut`` (duplicate bin edges
        are dropped, so fewer bins may result).

    Returns
    -------
    pandas.DataFrame
        A new frame; the argument itself is not modified.
    """
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
    # in 1.4; try the new spelling first and fall back for older releases.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)
    bin_columns = ['6_y', '8_y']
    ct = ColumnTransformer([("ohe", encoder, bin_columns)])

    for col in ['6', '8']:
        X = X.merge(pd.qcut(X[col], qCount, duplicates='drop', labels=False), on='id')

    # Hand the transformer only the columns it encodes: the full frame may mix
    # integer and string column names, which newer scikit-learn refuses.
    one_hot = pd.DataFrame(ct.fit_transform(X[bin_columns]), index=X.index)
    X = X.merge(one_hot, on='id')

    for col in ['2', '3', '5', '7', '9']:
        X = X.merge((X[col] == 0).astype('int32'), on='id')

    return X

In [8]:
# Engineer features on train and test together so the quantile bins and
# one-hot categories computed inside modifyX are shared by both sets, then
# split back by id. DataFrame.append was removed in pandas 2.0; pd.concat is
# the equivalent row-wise stack.
# NOTE: this cell overwrites X_train / X_test, so it must not be re-run
# without first re-running the cell that builds them.
X_all = pd.concat([X_train, X_test])
X_all = modifyX(X_all, 5)
X_train = X_all[X_all.index.isin(X_train.index)]
X_test = X_all[X_all.index.isin(X_test.index)]
X_test.shape

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


(4058, 92)

In [9]:
# Target Y1: average the second-class probability over several LightGBM fits
# that differ only in `random_state`.
# NOTE(review): per the LightGBM docs `subsample` only takes effect when
# `subsample_freq` > 0 (default 0), so row bagging is presumably inactive
# here -- confirm before retuning.
n_seeds = 10
per_seed_probs = []
for seed in range(n_seeds):
    model1 = lgb.LGBMClassifier(
        learning_rate=0.05,
        num_leaves=2,
        n_estimators=309,
        colsample_bytree=0.75,
        subsample=0.75,
        random_state=seed,
    )
    model1.fit(X_train, y1)
    per_seed_probs.append(model1.predict_proba(X_test)[:, 1])
a1 = sum(per_seed_probs) / n_seeds

In [10]:
# Target Y2: average the second-class probability over several LightGBM fits
# that differ only in `random_state`.
# NOTE(review): per the LightGBM docs `subsample` only takes effect when
# `subsample_freq` > 0 (default 0), so row bagging is presumably inactive
# here -- confirm before retuning.
n_seeds = 10
per_seed_probs = []
for seed in range(n_seeds):
    model2 = lgb.LGBMClassifier(
        learning_rate=0.05,
        num_leaves=2,
        n_estimators=284,
        colsample_bytree=0.75,
        subsample=0.75,
        random_state=seed,
    )
    model2.fit(X_train, y2)
    per_seed_probs.append(model2.predict_proba(X_test)[:, 1])
a2 = sum(per_seed_probs) / n_seeds

In [11]:
# Target Y3: average the second-class probability over several LightGBM fits
# that differ only in `random_state`.
# NOTE(review): per the LightGBM docs `subsample` only takes effect when
# `subsample_freq` > 0 (default 0), so row bagging is presumably inactive
# here -- confirm before retuning.
n_seeds = 10
per_seed_probs = []
for seed in range(n_seeds):
    model3 = lgb.LGBMClassifier(
        learning_rate=0.05,
        num_leaves=3,
        n_estimators=6,
        colsample_bytree=0.75,
        subsample=0.75,
        random_state=seed,
    )
    model3.fit(X_train, y3)
    per_seed_probs.append(model3.predict_proba(X_test)[:, 1])
a3 = sum(per_seed_probs) / n_seeds

In [12]:
# Target Y4: average the second-class probability over several LightGBM fits
# that differ only in `random_state`.
# NOTE(review): per the LightGBM docs `subsample` only takes effect when
# `subsample_freq` > 0 (default 0), so row bagging is presumably inactive
# here -- confirm before retuning.
n_seeds = 10
per_seed_probs = []
for seed in range(n_seeds):
    model4 = lgb.LGBMClassifier(
        learning_rate=0.05,
        num_leaves=7,
        n_estimators=10,
        colsample_bytree=0.75,
        subsample=0.75,
        random_state=seed,
    )
    model4.fit(X_train, y4)
    per_seed_probs.append(model4.predict_proba(X_test)[:, 1])
a4 = sum(per_seed_probs) / n_seeds

In [13]:
# Target Y5: average the second-class probability over several LightGBM fits
# that differ only in `random_state`.
# NOTE(review): per the LightGBM docs `subsample` only takes effect when
# `subsample_freq` > 0 (default 0), so row bagging is presumably inactive
# here -- confirm before retuning.
n_seeds = 10
per_seed_probs = []
for seed in range(n_seeds):
    model5 = lgb.LGBMClassifier(
        learning_rate=0.05,
        num_leaves=9,
        n_estimators=49,
        colsample_bytree=0.75,
        subsample=0.75,
        random_state=seed,
    )
    model5.fit(X_train, y5)
    per_seed_probs.append(model5.predict_proba(X_test)[:, 1])
a5 = sum(per_seed_probs) / n_seeds

In [14]:
# Assemble the submission: one row per test id, one averaged-probability
# column per target (named '1'..'5'), written without the positional index.
prob_columns = {'1': a1, '2': a2, '3': a3, '4': a4, '5': a5}
df = pd.DataFrame({'id': X_test.index, **prob_columns})
df.to_csv('lbm_sol.csv', index=False)
df.head()

Unnamed: 0,id,1,2,3,4,5
0,0,0.372175,0.338914,0.309348,0.292788,0.448865
1,1,0.299693,0.266502,0.308948,0.293758,0.640844
2,2,0.415403,0.402549,0.309348,0.286239,0.517462
3,4,0.225311,0.252785,0.309348,0.299364,0.53778
4,7,0.267182,0.186897,0.365074,0.370087,0.336876
