In [1]:
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler

from sklearn.impute import SimpleImputer

from sklearn.metrics import r2_score

from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score

from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression


In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.isna().sum()

id                       0
CARTYPE_07_CAT        5727
METRIC_18_BIN            0
METRIC_04                0
CONDITION_11_BIN         0
REGISTRATION_03      53709
CONDITION_05_CAT      2891
CARTYPE_15               0
METRIC_09                0
METRIC_01                0
REGISTRATION_01          0
CONDITION_14             0
METRIC_12                0
METRIC_08                0
METRIC_06                0
CONDITION_02_CAT       104
METRIC_10                0
CARTYPE_10_CAT           0
CONDITION_09_BIN         0
CARTYPE_03_CAT      205287
target                   0
CARTYPE_09_CAT         282
CARTYPE_11               2
CARTYPE_01_CAT          57
CONDITION_13_BIN         0
METRIC_11                0
METRIC_14                0
METRIC_17_BIN            0
CARTYPE_13               0
CARTYPE_06_CAT           0
METRIC_15_BIN            0
CONDITION_03             0
METRIC_20_BIN            0
METRIC_07                0
CARTYPE_02_CAT           2
CONDITION_01             0
CONDITION_18_BIN         0
C

In [4]:
df.duplicated().any()

False

In [5]:
X_fill = df

In [6]:
imputer = SimpleImputer(fill_value=np.nan, strategy='mean')

X_fill = imputer.fit_transform(df)

In [7]:
X_fill = pd.DataFrame(X_fill, columns=df.columns)

In [8]:
X_fill.isna().sum()

id                  0
CARTYPE_07_CAT      0
METRIC_18_BIN       0
METRIC_04           0
CONDITION_11_BIN    0
REGISTRATION_03     0
CONDITION_05_CAT    0
CARTYPE_15          0
METRIC_09           0
METRIC_01           0
REGISTRATION_01     0
CONDITION_14        0
METRIC_12           0
METRIC_08           0
METRIC_06           0
CONDITION_02_CAT    0
METRIC_10           0
CARTYPE_10_CAT      0
CONDITION_09_BIN    0
CARTYPE_03_CAT      0
target              0
CARTYPE_09_CAT      0
CARTYPE_11          0
CARTYPE_01_CAT      0
CONDITION_13_BIN    0
METRIC_11           0
METRIC_14           0
METRIC_17_BIN       0
CARTYPE_13          0
CARTYPE_06_CAT      0
METRIC_15_BIN       0
CONDITION_03        0
METRIC_20_BIN       0
METRIC_07           0
CARTYPE_02_CAT      0
CONDITION_01        0
CONDITION_18_BIN    0
CONDITION_16_BIN    0
CONDITION_07_BIN    0
CONDITION_17_BIN    0
CARTYPE_11_CAT      0
CONDITION_10_BIN    0
REGISTRATION_02     0
METRIC_16_BIN       0
CONDITION_15        0
METRIC_03 

In [9]:
X = X_fill.drop([X_fill.columns[20]], axis = 1)

Y = X_fill[X_fill.columns[20]]


In [10]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state=0)

In [11]:
scaler = MinMaxScaler()

x_train = scaler.fit_transform(x_train)

x_test = scaler.transform(x_test)

In [12]:
gnb = GaussianNB()
gnb.fit(x_train, y_train)
print('Accuracy of GNB classifier on training set: {:.2f}'
     .format(gnb.score(x_train, y_train)))
print('Accuracy of GNB classifier on test set: {:.2f}'
     .format(gnb.score(x_test, y_test)))

Accuracy of GNB classifier on training set: 0.90
Accuracy of GNB classifier on test set: 0.90


In [13]:
y_pred = gnb.predict(x_test)

print(accuracy_score(y_test, y_pred))

0.9033063775453058


In [14]:
pred = gnb.predict(x_test)

print(confusion_matrix(y_test, pred))

print(classification_report(y_test, pred))

[[80233  5823]
 [ 2810   416]]
              precision    recall  f1-score   support

         0.0       0.97      0.93      0.95     86056
         1.0       0.07      0.13      0.09      3226

    accuracy                           0.90     89282
   macro avg       0.52      0.53      0.52     89282
weighted avg       0.93      0.90      0.92     89282



In [15]:
logreg = LogisticRegression(multi_class= 'multinomial', class_weight = 'balanced')
logreg.fit(x_train, y_train)

print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(logreg.score(x_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(logreg.score(x_test, y_test)))

Accuracy of Logistic regression classifier on training set: 0.62
Accuracy of Logistic regression classifier on test set: 0.62


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
y1_pred = logreg.predict(x_test)

print(accuracy_score(y_test, y1_pred))

0.6178177012163706


In [17]:
pred1 = logreg.predict(x_test)

print(confusion_matrix(y_test, pred1))

print(classification_report(y_test, pred1))

[[53421 32635]
 [ 1487  1739]]
              precision    recall  f1-score   support

         0.0       0.97      0.62      0.76     86056
         1.0       0.05      0.54      0.09      3226

    accuracy                           0.62     89282
   macro avg       0.51      0.58      0.43     89282
weighted avg       0.94      0.62      0.73     89282



In [18]:
db = pd.read_csv('test.csv')

In [19]:
db.isna().sum()

id                       0
CARTYPE_07_CAT        5762
METRIC_18_BIN            0
METRIC_04                0
CONDITION_11_BIN         0
REGISTRATION_03      54063
CONDITION_05_CAT      2918
CARTYPE_15               0
METRIC_09                0
METRIC_01                0
REGISTRATION_01          0
CONDITION_14             0
METRIC_12                0
METRIC_08                0
METRIC_06                0
CONDITION_02_CAT       112
METRIC_10                0
CARTYPE_10_CAT           0
CONDITION_09_BIN         0
CARTYPE_03_CAT      205944
CARTYPE_09_CAT         287
CARTYPE_11               3
CARTYPE_01_CAT          50
CONDITION_13_BIN         0
METRIC_11                0
METRIC_14                0
METRIC_17_BIN            0
CARTYPE_13               0
CARTYPE_06_CAT           0
METRIC_15_BIN            0
CONDITION_03             0
METRIC_20_BIN            0
METRIC_07                0
CARTYPE_02_CAT           3
CONDITION_01             0
CONDITION_18_BIN         0
CONDITION_16_BIN         0
C

In [20]:
X1 = db

In [21]:
imputer = SimpleImputer(fill_value=np.nan, strategy='mean')

X1 = imputer.fit_transform(db)

In [22]:
X1 = pd.DataFrame(X1, columns=db.columns)

In [23]:
X1.isna().sum()

id                  0
CARTYPE_07_CAT      0
METRIC_18_BIN       0
METRIC_04           0
CONDITION_11_BIN    0
REGISTRATION_03     0
CONDITION_05_CAT    0
CARTYPE_15          0
METRIC_09           0
METRIC_01           0
REGISTRATION_01     0
CONDITION_14        0
METRIC_12           0
METRIC_08           0
METRIC_06           0
CONDITION_02_CAT    0
METRIC_10           0
CARTYPE_10_CAT      0
CONDITION_09_BIN    0
CARTYPE_03_CAT      0
CARTYPE_09_CAT      0
CARTYPE_11          0
CARTYPE_01_CAT      0
CONDITION_13_BIN    0
METRIC_11           0
METRIC_14           0
METRIC_17_BIN       0
CARTYPE_13          0
CARTYPE_06_CAT      0
METRIC_15_BIN       0
CONDITION_03        0
METRIC_20_BIN       0
METRIC_07           0
CARTYPE_02_CAT      0
CONDITION_01        0
CONDITION_18_BIN    0
CONDITION_16_BIN    0
CONDITION_07_BIN    0
CONDITION_17_BIN    0
CARTYPE_11_CAT      0
CONDITION_10_BIN    0
REGISTRATION_02     0
METRIC_16_BIN       0
CONDITION_15        0
METRIC_03           0
METRIC_05 

In [24]:
X1 = scaler.transform(X1)

logreg.fit(x_train, y_train)

result = logreg.predict(X1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
result

array([1., 0., 0., ..., 1., 0., 0.])

In [26]:
db["target"] = result

to_submit = db[['id',"target"]]

to_submit.to_csv("submission2.csv", index = False)