In [1]:
import pandas as pd
import numpy as np


In [2]:
# read the data
train_raw = pd.read_csv("./dataset/train.csv")
test_raw = pd.read_csv("./dataset/test.csv")    


In [3]:
# check the data shape
print("train data shape: ", train_raw.shape)
print("test data shape: ", test_raw.shape)

train data shape:  (25000, 5480)
test data shape:  (5000, 5479)


In [4]:
train_raw.head()

Unnamed: 0,id,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_5470,var_5471,var_5472,var_5473,var_5474,var_5475,var_5476,var_5477,var_5478,target
0,id_1,0.015625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.355422,...,,,,,,,,,,0
1,id_2,0.1875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150602,...,0.0,,0.0,,,,,,,0
2,id_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,...,,,0.008333,,,0.0,,,0.0,0
3,id_4,0.15625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018072,...,,,,,,,,,,0
4,id_5,0.046875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096386,...,,,,,,,,,,0


In [5]:
train_raw.target.value_counts(normalize=True)

0    0.85316
1    0.14684
Name: target, dtype: float64

In [6]:
# check for missing values
train_raw.isnull().mean()

id          0.00000
var_1       0.08496
var_2       0.08496
var_3       0.08496
var_4       0.08496
             ...   
var_5475    0.95896
var_5476    0.96844
var_5477    0.99972
var_5478    0.89720
target      0.00000
Length: 5480, dtype: float64

In [7]:
# columns with missing values > 50%
null50 = train_raw.isnull().mean()[train_raw.isnull().mean() > 0.5].index

print(null50)

Index(['var_11', 'var_12', 'var_13', 'var_15', 'var_16', 'var_17', 'var_19',
       'var_20', 'var_21', 'var_23',
       ...
       'var_5469', 'var_5470', 'var_5471', 'var_5472', 'var_5473', 'var_5474',
       'var_5475', 'var_5476', 'var_5477', 'var_5478'],
      dtype='object', length=2441)


In [8]:
# drop columns with missing values > 50%
train_raw.drop(null50, axis=1, inplace=True)
test_raw.drop(null50, axis=1, inplace=True)


In [9]:
# drop id columns
train_raw.drop(['id'], axis=1, inplace=True)
test_raw.drop(['id'], axis=1, inplace=True)

print("train data shape: ", train_raw.shape)
print("test data shape: ", test_raw.shape)



train data shape:  (25000, 3038)
test data shape:  (5000, 3037)


In [10]:
# check for constant columns
const_cols = train_raw.nunique()[train_raw.nunique() == 1].index

# drop constant columns
train_raw.drop(const_cols, axis=1, inplace=True)
test_raw.drop(const_cols, axis=1, inplace=True)

print("train data shape: ", train_raw.shape)
print("test data shape: ", test_raw.shape)


train data shape:  (25000, 1024)
test data shape:  (5000, 1023)


In [11]:
# check for duplicated columns
dup_cols = train_raw.T.duplicated()[train_raw.T.duplicated() == True].index

# drop duplicated columns
train_raw.drop(dup_cols, axis=1, inplace=True)
test_raw.drop(dup_cols, axis=1, inplace=True)

print("train data shape: ", train_raw.shape)
print("test data shape: ", test_raw.shape)

train data shape:  (25000, 997)
test data shape:  (5000, 996)


In [12]:
# impute missing values with the median
train_raw.fillna(train_raw.median(), inplace=True)
test_raw.fillna(test_raw.median(), inplace=True)


In [13]:
train_raw.isnull().mean()

var_1       0.0
var_2       0.0
var_3       0.0
var_4       0.0
var_5       0.0
           ... 
var_5461    0.0
var_5462    0.0
var_5463    0.0
var_5466    0.0
target      0.0
Length: 997, dtype: float64

In [14]:
# check the null values
# train_raw.isnull()

#fill null values with the median
train_raw.fillna(train_raw.median(), inplace=True)


In [15]:
train_raw.isnull().mean()

var_1       0.0
var_2       0.0
var_3       0.0
var_4       0.0
var_5       0.0
           ... 
var_5461    0.0
var_5462    0.0
var_5463    0.0
var_5466    0.0
target      0.0
Length: 997, dtype: float64

In [16]:
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Columns: 997 entries, var_1 to target
dtypes: float64(996), int64(1)
memory usage: 190.2 MB


In [23]:
# check for infinite values
print(np.isinf(train_raw).sum().any())

print(np.isinf(test_raw).sum().any())

True
True


In [24]:
#print the columns with infinite values

train_raw.columns[np.isinf(train_raw).any()]

Index(['var_5213'], dtype='object')

In [25]:
train_raw.var_5213.value_counts()

inf         12600
0.000588       27
0.001030        8
0.001133        8
0.000845        8
            ...  
0.000428        1
0.001762        1
0.000271        1
0.000710        1
0.000782        1
Name: var_5213, Length: 11935, dtype: int64

In [17]:
# create a logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

# split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    train_raw.drop(['target'], axis=1), train_raw.target, test_size=0.2, random_state=42, stratify=train_raw.target
)

# create a logistic regression model
model = LogisticRegression()

# fit the model
model.fit(X_train, y_train)

# predict on the validation set
y_pred = model.predict(X_val)

# calculate the f1 score
f1_score(y_val, y_pred)

# calculate the roc auc score
roc_auc_score(y_val, y_pred)

# calculate the accuracy score
accuracy_score(y_val, y_pred)



ValueError: Input X contains infinity or a value too large for dtype('float64').

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_7,var_8,var_9,var_10,var_14,...,var_5455,var_5456,var_5457,var_5458,var_5460,var_5461,var_5462,var_5463,var_5466,target
0,0.015625,0.0,0.0,0.0,0.0,0.0,0.0,0.355422,0.106633,0.192716,...,0.019802,0.011905,0.012270,0.008547,0.016317,0.013158,0.006711,0.000000,0.000000,0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.018072,0.106633,0.193421,...,0.049505,0.047619,0.079755,0.025641,0.065268,0.026316,0.020134,0.068627,0.036745,0
3,0.156250,0.0,0.0,0.0,0.0,0.0,0.0,0.018072,0.106633,0.192948,...,0.009901,0.005952,0.018405,0.025641,0.006993,0.013158,0.000000,0.009804,0.000000,0
4,0.046875,0.0,0.0,0.0,0.0,0.0,0.0,0.096386,0.106633,0.192716,...,0.118812,0.017857,0.042945,0.025641,0.055944,0.000000,0.006711,0.009804,0.005249,0
5,0.046875,0.0,0.0,0.0,0.0,0.0,0.0,0.042169,0.106633,0.192716,...,0.019802,0.005952,0.000000,0.025641,0.011655,0.000000,0.006711,0.000000,0.007874,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24988,0.187500,0.0,0.0,0.0,0.0,0.0,0.0,0.012048,0.107005,0.202121,...,0.118812,0.119048,0.104294,0.094017,0.146853,0.118421,0.087248,0.117647,0.115486,0
24989,0.062500,0.0,0.0,0.0,0.0,0.0,0.0,0.060241,0.106633,0.192716,...,0.019802,0.011905,0.018405,0.051282,0.020979,0.013158,0.000000,0.009804,0.005249,1
24993,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.018072,0.106633,0.193421,...,0.128713,0.059524,0.171779,0.068376,0.144522,0.052632,0.033557,0.137255,0.073491,0
24994,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.018072,0.106633,0.193421,...,0.178218,0.017857,0.018405,0.025641,0.041958,0.052632,0.006711,0.009804,0.010499,0


  df[df.isin([np.nan, np.inf, -np.inf]).any(1)]


Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_7,var_8,var_9,var_10,var_14,...,var_5455,var_5456,var_5457,var_5458,var_5460,var_5461,var_5462,var_5463,var_5466,target
0,0.015625,0.0,0.0,0.0,0.0,0.0,0.0,0.355422,0.106633,0.192716,...,0.019802,0.011905,0.012270,0.008547,0.016317,0.013158,0.006711,0.000000,0.000000,0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.018072,0.106633,0.193421,...,0.049505,0.047619,0.079755,0.025641,0.065268,0.026316,0.020134,0.068627,0.036745,0
3,0.156250,0.0,0.0,0.0,0.0,0.0,0.0,0.018072,0.106633,0.192948,...,0.009901,0.005952,0.018405,0.025641,0.006993,0.013158,0.000000,0.009804,0.000000,0
4,0.046875,0.0,0.0,0.0,0.0,0.0,0.0,0.096386,0.106633,0.192716,...,0.118812,0.017857,0.042945,0.025641,0.055944,0.000000,0.006711,0.009804,0.005249,0
5,0.046875,0.0,0.0,0.0,0.0,0.0,0.0,0.042169,0.106633,0.192716,...,0.019802,0.005952,0.000000,0.025641,0.011655,0.000000,0.006711,0.000000,0.007874,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24988,0.187500,0.0,0.0,0.0,0.0,0.0,0.0,0.012048,0.107005,0.202121,...,0.118812,0.119048,0.104294,0.094017,0.146853,0.118421,0.087248,0.117647,0.115486,0
24989,0.062500,0.0,0.0,0.0,0.0,0.0,0.0,0.060241,0.106633,0.192716,...,0.019802,0.011905,0.018405,0.051282,0.020979,0.013158,0.000000,0.009804,0.005249,1
24993,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.018072,0.106633,0.193421,...,0.128713,0.059524,0.171779,0.068376,0.144522,0.052632,0.033557,0.137255,0.073491,0
24994,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.018072,0.106633,0.193421,...,0.178218,0.017857,0.018405,0.025641,0.041958,0.052632,0.006711,0.009804,0.010499,0
