In [1]:
# Essentials
import numpy as np
import pandas as pd
import datetime
import random

# Plots
import seaborn as sns
import matplotlib.pyplot as plt

# Models
from sklearn.ensemble import RandomForestClassifier


# Ignore useless warnings
import warnings
warnings.filterwarnings(action="ignore")

In [2]:
# Load the train and test CSV files into DataFrames, then print each
# frame's column / non-null count / dtype summary.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-may-2021/train.csv",
        "../input/tabular-playground-series-may-2021/test.csv",
    )
)
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 52 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id          100000 non-null  int64 
 1   feature_0   100000 non-null  int64 
 2   feature_1   100000 non-null  int64 
 3   feature_2   100000 non-null  int64 
 4   feature_3   100000 non-null  int64 
 5   feature_4   100000 non-null  int64 
 6   feature_5   100000 non-null  int64 
 7   feature_6   100000 non-null  int64 
 8   feature_7   100000 non-null  int64 
 9   feature_8   100000 non-null  int64 
 10  feature_9   100000 non-null  int64 
 11  feature_10  100000 non-null  int64 
 12  feature_11  100000 non-null  int64 
 13  feature_12  100000 non-null  int64 
 14  feature_13  100000 non-null  int64 
 15  feature_14  100000 non-null  int64 
 16  feature_15  100000 non-null  int64 
 17  feature_16  100000 non-null  int64 
 18  feature_17  100000 non-null  int64 
 19  feature_18  100000 non-n

In [3]:
# Build the model inputs: drop `id` (and, for train, `target`) to get the
# feature frames, and keep `target` on its own as the label series.
train_features = train.drop(columns=['id', 'target'])
test_features = test.drop(columns=['id'])
train_labels = train['target'].reset_index(drop=True)
train_labels.head()

0    Class_2
1    Class_1
2    Class_1
3    Class_4
4    Class_2
Name: target, dtype: object

In [4]:
# Preview the first five rows of the training features
train_features.head(n=5)

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49
0,0,0,1,0,1,0,0,0,0,0,...,3,0,0,21,0,0,0,0,0,0
1,0,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,2,...,0,0,1,0,0,0,0,13,2,0
3,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [5]:
# Preview the first five rows of the test features
test_features.head(n=5)

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49
0,0,0,0,0,0,0,4,4,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,2,0,7,7,...,3,0,1,0,0,0,1,0,2,1
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,6,0
3,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,6,9,14,3
4,0,0,0,0,0,0,1,0,4,0,...,1,0,0,0,0,0,0,0,0,0


In [6]:
# Random Forest Classifier
# n_jobs=-1 spreads tree building (and later prediction) over all available
# cores; random_state pins the seed so the fitted forest stays reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# fit() returns the estimator itself, so rf_model refers to the same object as rf.
rf_model = rf.fit(train_features, train_labels)
# One row per test sample, one probability column per class, ordered like
# rf_model.classes_.
pred = rf_model.predict_proba(test_features)

In [7]:
# Display the predicted class probabilities (NumPy abbreviates the middle rows of a large array)
pred

array([[0.08, 0.5 , 0.24, 0.18],
       [0.07, 0.6 , 0.18, 0.15],
       [0.1 , 0.5 , 0.24, 0.16],
       ...,
       [0.08, 0.48, 0.19, 0.25],
       [0.05, 0.76, 0.04, 0.15],
       [0.09, 0.48, 0.32, 0.11]])

In [8]:
# Read in sample_submission dataframe
submission = pd.read_csv("../input/tabular-playground-series-may-2021/sample_submission.csv")

# predict_proba orders its columns like rf_model.classes_. Label the
# probabilities with those class names and select each submission column by
# name, rather than assuming the positional order matches the submission
# layout. A class missing from the model raises a KeyError instead of
# silently writing probabilities into the wrong column.
class_columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
pred_by_class = pd.DataFrame(pred, columns=rf_model.classes_)
# .to_numpy() keeps the assignment positional by row, as in a plain array assignment.
submission[class_columns] = pred_by_class[class_columns].to_numpy()
submission.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4
0,100000,0.08,0.5,0.24,0.18
1,100001,0.07,0.6,0.18,0.15
2,100002,0.1,0.5,0.24,0.16
3,100003,0.1,0.52,0.23,0.15
4,100004,0.06,0.68,0.2,0.06


In [9]:
# Write the submission to disk, leaving out the DataFrame index column
submission.to_csv(path_or_buf="submission_basicrf.csv", index=False)