In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from lightgbm import LGBMClassifier

# from cuml.svm import 

In [2]:
train_df = pd.read_csv('playground-series-s4e8/train.csv')
test_df = pd.read_csv('playground-series-s4e8/test.csv')

In [3]:
train_df['class'].unique()

array(['e', 'p'], dtype=object)

In [4]:
print(train_df.shape, test_df.shape)

(3116945, 22) (2077964, 21)


In [5]:
train_df.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


In [6]:
null_percent = train_df.isnull().mean()*100
null_percent_df = null_percent.reset_index()
null_percent_df.columns = ['Column', 'Null percentage']
null_percent_df.sort_values(by='Null percentage', ascending=False).head(10)

Unnamed: 0,Column,Null percentage
15,veil-type,94.88435
19,spore-print-color,91.425482
12,stem-root,88.452732
16,veil-color,87.93697
13,stem-surface,63.551362
8,gill-spacing,40.373988
4,cap-surface,21.528227
7,gill-attachment,16.80928
18,ring-type,4.134818
9,gill-color,0.001829


In [7]:
train_df['stem-surface'].unique()

array([nan, 'y', 's', 't', 'g', 'h', 'k', 'i', 'f', 'l', 'd', 'x',
       '12.04', 'w', 'a', 'o', 'c', 'n', 'm', 'e', 'p', 'z', '6.58',
       '4.34', 'b', '3.89', 'r', '25.83', '1.59', '0.0', '5.97', '5.81',
       'u', 'season', '10.48', '3.68', '5.56', '4.41', '5.48', '5.51',
       'class', 'has-ring', '13.1', '17.46', '5.35', '7.23', 'does None',
       '1.03', 'does s', '7.45', 'has h', 'does-bruise-or-bleed', '1.94',
       '49.46', '19.35', '2.68', '4.74', 'spore-print-color', '10.93',
       '24.12', '13.94'], dtype=object)

In [8]:
mappings = list()

encoder = LabelEncoder()

for column in range(len(train_df.columns)):
    train_df[train_df.columns[column]] = encoder.fit_transform(train_df[train_df.columns[column]])
    mappings_dict = {index: label for index, label in enumerate(encoder.classes_)}
    mappings.append(mappings_dict)

In [9]:
X = train_df.drop('class', axis =1)
y = train_df['class']

In [10]:
X

Unnamed: 0,id,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,843,53,72,72,8,44,28,59,353,...,38,60,55,22,24,5,18,32,25,0
1,1,414,71,56,64,8,44,28,46,381,...,38,58,47,22,24,18,39,32,25,3
2,2,657,53,72,49,8,75,28,59,587,...,38,51,46,22,24,5,18,32,36,3
3,3,351,53,81,57,8,70,48,37,318,...,38,60,55,22,24,5,18,32,25,2
4,4,548,71,65,74,8,47,48,59,239,...,38,60,55,22,24,5,18,32,29,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,3116940,892,53,83,63,20,78,48,59,1116,...,15,60,55,19,21,18,19,32,25,2
3116941,3116941,1051,67,83,74,20,47,28,48,567,...,38,60,55,22,24,5,18,32,25,2
3116942,3116942,745,71,53,55,8,44,48,59,853,...,38,60,57,22,21,18,39,32,25,0
3116943,3116943,908,64,59,63,20,52,48,48,815,...,38,58,55,22,24,18,27,32,25,2


In [11]:
id_column = X[['id']]
features_to_scale = X.drop(columns=['id'])
scaler = StandardScaler()
scaled_features = pd.DataFrame(scaler.fit_transform(features_to_scale), columns=features_to_scale.columns)
# X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_scaled = pd.concat([id_column, scaled_features], axis=1)

In [12]:
X_scaled

Unnamed: 0,id,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,0.628195,-1.111068,0.168251,1.039717,-0.461414,-1.131266,-0.924965,0.895706,-0.684029,...,0.326057,0.598098,0.785542,0.231359,0.286431,-0.561952,-0.366675,0.28045,-0.535774,-0.954002
1,1,-0.423783,0.879150,-1.150399,-0.162786,-0.461414,-1.131266,-0.924965,-0.562032,-0.579746,...,0.326057,0.275546,-0.540144,0.231359,0.286431,1.779484,3.136604,0.28045,-0.535774,1.762755
2,2,0.172092,-1.111068,0.168251,-2.417478,-0.461414,1.141632,-0.924965,0.895706,0.187478,...,0.326057,-0.853386,-0.705855,0.231359,0.286431,-0.561952,-0.366675,0.28045,2.117828,1.762755
3,3,-0.578270,-1.111068,0.909991,-1.214976,-0.461414,0.775036,1.202032,-1.571236,-0.814383,...,0.326057,0.598098,0.785542,0.231359,0.286431,-0.561952,-0.366675,0.28045,-0.535774,0.857169
4,4,-0.095193,0.879150,-0.408659,1.340342,-0.461414,-0.911308,1.202032,0.895706,-1.108610,...,0.326057,0.598098,0.785542,0.231359,0.286431,-0.561952,-0.366675,0.28045,0.429172,-0.954002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,3116940,0.748351,-1.111068,1.074822,-0.313099,2.167139,1.361590,1.202032,0.895706,2.157682,...,-3.688816,0.598098,0.785542,-4.284471,-1.420459,1.779484,-0.199852,0.28045,-0.535774,0.857169
3116941,3116941,1.138245,0.436879,1.074822,1.340342,2.167139,-0.911308,-0.924965,-0.337765,0.112991,...,0.326057,0.598098,0.785542,0.231359,0.286431,-0.561952,-0.366675,0.28045,-0.535774,0.857169
3116942,3116942,0.387883,0.879150,-1.397646,-1.515601,-0.461414,-1.131266,1.202032,0.895706,1.178167,...,0.326057,0.598098,1.116964,0.231359,-1.420459,1.779484,3.136604,0.28045,-0.535774,-0.954002
3116943,3116943,0.787585,0.105176,-0.903152,-0.313099,2.167139,-0.544712,1.202032,-0.337765,1.036640,...,0.326057,0.275546,0.785542,0.231359,0.286431,1.779484,1.134730,0.28045,-0.535774,0.857169


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled,y, train_size=0.8, random_state=42)

In [14]:
X_train.shape, y_train.shape

((2493556, 21), (2493556,))

In [15]:
clf = LGBMClassifier(objective='binary', metric='binary_error', num_levels=61, \
                        learning_rate=0.1, n_estimators=700, max_depth=8, random_state=42)

In [16]:
clf.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 1364404, number of negative: 1129152
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.124719 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1234
[LightGBM] [Info] Number of data points in the train set: 2493556, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547172 -> initscore=0.189251
[LightGBM] [Info] Start training from score 0.189251


In [17]:
y_pred = clf.predict(X_test)
matthews_corrcoef(y_test,y_pred)
# np.float64(0.9830327207621253)



0.9830224086235888

In [67]:
test_df = pd.read_csv('playground-series-s4e8/test.csv')
print(test_df.shape)
print(test_df.head())

(2077964, 21)
        id  cap-diameter cap-shape cap-surface cap-color does-bruise-or-bleed  \
0  3116945          8.64         x         NaN         n                    t   
1  3116946          6.90         o           t         o                    f   
2  3116947          2.00         b           g         n                    f   
3  3116948          3.47         x           t         n                    f   
4  3116949          6.17         x           h         y                    f   

  gill-attachment gill-spacing gill-color  stem-height  ...  stem-root  \
0             NaN          NaN          w        11.13  ...          b   
1             NaN            c          y         1.27  ...        NaN   
2             NaN            c          n         6.18  ...        NaN   
3               s            c          n         4.98  ...        NaN   
4               p          NaN          y         6.73  ...        NaN   

  stem-surface stem-color veil-type veil-color has-rin

In [30]:
# mappings2 = list()

# encoder2 = LabelEncoder()

# for column in range(1,len(test_df.columns)):
#     test_df[test_df.columns[column]] = encoder2.fit_transform(test_df[test_df.columns[column]])
#     mappings_dict2 = {index: label for index, label in enumerate(encoder.classes_)}
#     mappings2.append(mappings_dict2)



# scaler2 = StandardScaler()
# test_df = pd.DataFrame(scaler2.fit_transform(test_df), columns=test_df.columns)
# test_df.shape

# print(test_df.shape)
# print(test_df.head())


In [62]:
test_df.select_dtypes(include=['object']).columns


Index(['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed',
       'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root',
       'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring',
       'ring-type', 'spore-print-color', 'habitat', 'season'],
      dtype='object')

In [68]:
mappings_test_df = list()

encoder_test_df = LabelEncoder()


for column in test_df.select_dtypes(include=['object']).columns:
    # Fit and transform the column with the encoder
    test_df[column] = encoder_test_df.fit_transform(test_df[column])
    
    # Create a mapping of the original labels
    mappings_dict_test_df = {index: label for index, label in enumerate(encoder_test_df.classes_)}
    mappings_test_df.append(mappings_dict_test_df)
    

In [69]:
print(test_df.shape)
print(test_df.head())

(2077964, 21)
        id  cap-diameter  cap-shape  cap-surface  cap-color  \
0  3116945          8.64         59           59         44   
1  3116946          6.90         50           53         45   
2  3116947          2.00         36           38         44   
3  3116948          3.47         59           53         44   
4  3116949          6.17         59           39         55   

   does-bruise-or-bleed  gill-attachment  gill-spacing  gill-color  \
0                    18               66            35          52   
1                     5               66            17          54   
2                     5               66            17          41   
3                     5               57            17          41   
4                     5               55            35          54   

   stem-height  ...  stem-root  stem-surface  stem-color  veil-type  \
0        11.13  ...          9            54          51         12   
1         1.27  ...         31            54

In [70]:
# X = train_df.drop('class', axis =1)
# y = train_df['class']


test_df_id_column = test_df[['id']]
test_df_features_to_scale = test_df.drop(columns=['id'])

# scaler = StandardScaler()
test_df_scaled_features = pd.DataFrame(scaler.fit_transform(test_df_features_to_scale), columns=test_df_features_to_scale.columns)
# X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
test_df_scaled = pd.concat([test_df_id_column, test_df_scaled_features], axis=1)

print(test_df_scaled.shape)
print(test_df_scaled.head())


(2077964, 21)
        id  cap-diameter  cap-shape  cap-surface  cap-color  \
0  3116945      0.498096   0.885694     1.143638  -0.203856   
1  3116946      0.126734  -0.103978     0.457409  -0.046777   
2  3116947     -0.919054  -1.643467    -1.258165  -0.203856   
3  3116948     -0.605318   0.885694     0.457409  -0.203856   
4  3116949     -0.029067   0.885694    -1.143794   1.524017   

   does-bruise-or-bleed  gill-attachment  gill-spacing  gill-color  \
0              2.168936         1.425144      1.206983    0.853561   
1             -0.461050         1.425144     -0.880941    1.097933   
2             -0.461050         1.425144     -0.880941   -0.490484   
3             -0.461050         0.628099     -0.880941   -0.490484   
4             -0.461050         0.450978      1.206983    1.097933   

   stem-height  ...  stem-root  stem-surface  stem-color  veil-type  \
0     1.772335  ...  -3.720853      0.604418    0.860469  -4.298337   
1    -1.880901  ...   0.321996      0.604418

In [71]:
y_pred = clf.predict(test_df_scaled)



In [72]:
y_pred

array([0, 1, 1, ..., 1, 0, 0])

In [75]:
# test_df = pd.read_csv('playground-series-s4e8/test.csv')
submission = pd.DataFrame(
    {
        'id':test_df_scaled['id'],
        'class':y_pred
    }
)
submission['class']=submission['class'].replace(1,'p')
submission['class']=submission['class'].replace(0,'e')

submission.to_csv('submission.csv',index=False)


In [76]:
submission.head()

Unnamed: 0,id,class
0,3116945,e
1,3116946,p
2,3116947,p
3,3116948,p
4,3116949,e


In [77]:
submission['class'].value_counts()

# class
# p    1139934
# e     938030
# Name: count, dtype: int64

class
p    1140246
e     937718
Name: count, dtype: int64

In [41]:
test= pd.read_csv('playground-series-s4e8/test.csv')
train = pd.read_csv('playground-series-s4e8/train.csv')

In [42]:
train.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


In [43]:
test.head()

Unnamed: 0,id,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,3116945,8.64,x,,n,t,,,w,11.13,...,b,,w,u,w,t,g,,d,a
1,3116946,6.9,o,t,o,f,,c,y,1.27,...,,,n,,,f,f,,d,a
2,3116947,2.0,b,g,n,f,,c,n,6.18,...,,,n,,,f,f,,d,s
3,3116948,3.47,x,t,n,f,s,c,n,4.98,...,,,w,,n,t,z,,d,u
4,3116949,6.17,x,h,y,f,p,,y,6.73,...,,,y,,y,t,,,d,u
