In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will 
#list all files under the input directory

import os
# Walk the read-only Kaggle input tree and echo the full path of every file found in it.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as 
#output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**NB:**<br>
To speed up the computation I run this notebook on a GPU server, and I used cuML. <br>
The cuML API is documented in the reference below: <br>
https://docs.rapids.ai/api/cuml/stable/api.html#logistic-regression

## 1. Loading the Data

In [2]:
# Read the competition's training and test CSV files into DataFrames
train_filepath = "/kaggle/input/tabular-playground-series-sep-2021/train.csv"
test_filepath = "/kaggle/input/tabular-playground-series-sep-2021/test.csv"
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_filepath, test_filepath)
)

### Description about the data
There is a set of features F = {f1, f2, f3, ..., f118} and the target variable "claim". The target variable is binary, i.e., its value is either 0 or 1. It is a decision variable indicating whether the claim will be made or not: claim = 0 implies that the claim will not be paid, while claim = 1 implies that the claim will be made.

**Our job is to predict the claim for the given set of {f1, f2, f3, ...f118}**

In [3]:
# Display the columns, dtypes and non-null counts of the training dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly: wrapping it in print() would add a stray "None" line after the report.
train_data.info()

Therefore, there are 118 variables in total with dtype float, while id and claim are integers.

## 2. Data Visualization

In [4]:
# Preview the first five rows of the training data
# (the matching preview of the test data below is left disabled)
print(train_data.head(n=5))
#print(test_data.head())

In [5]:
# Print summary statistics for the training data first, then for the test data
for frame in (train_data, test_data):
    print(frame.describe())

In [6]:
# Visualize how often each value of the target variable occurs in the training data
import matplotlib.pyplot as plt

claim_counts = train_data['claim'].value_counts()
ax = claim_counts.plot.bar()  # pandas returns the Axes the bars were drawn on
ax.set_xlabel('claim')
ax.set_ylabel('frequency')
ax.set_title('Distribution of claim paid and unpaid')

**From the bar plot it is apparent that there is a roughly equal number of claims that are paid and unpaid.**<br>
The input variables are of unknown type; therefore, in order to visualize them it is better to look at the correlation between these variables.

## 3. Check for missing values

Here we check the values in the columns, looking in particular for missing values or categorical variables.

In [7]:
# Report the number of missing values per column, for the training and then the test data
for label, frame in (("Train:", train_data), ("Test:", test_data)):
    print(label)
    print(frame.isnull().sum())
    print()

From the above outputs, it is evident that there is a significant number of null values in both the training and the test data sets, i.e., several values are missing. Therefore, we will **preprocess the data** to remove all the rows with missing values. 
##### Let's just take a few columns/rows for our analysis

## 4. Data Preprocessing

Segregate the target and train variables from the training data and drop all the rows with null values.

In [8]:
# Keep only the rows that contain no missing value at all.
# NOTE(review): the later cells build X/y from train_data and predict on test_data,
# so these filtered copies are only inspected below, never used for modelling.
trainData = train_data.dropna(axis="index", how="any")
testData = test_data.dropna(axis="index", how="any")

Lets now check the modified set of data


In [9]:
# Re-check the per-column missing-value counts on the filtered frames
for label, frame in (("Modified Train:", trainData), ("Modified Test:", testData)):
    print(label)
    print(frame.isnull().sum())
    print()

Now all the null values have been removed from the training and test datasets.

In [10]:
import numpy as np

# Print the (rows, columns) shape of each filtered frame: training first, then test
for frame in (trainData, testData):
    print(np.shape(frame))

## 5. Model Verification

##### Lets play with the training data set "trainData"

In [11]:
# Separate the features from the target.
# NOTE(review): this reads the raw train_data (rows with missing values included),
# not the NaN-free trainData built earlier, and X keeps every column except "claim",
# so "id" is passed to the model as a feature. Confirm both are intended.
X = train_data.drop(columns="claim")
y = train_data['claim']

#### Split the data into train and validation data

In [12]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

#### Model 2: XGBoost Regressor

In [13]:
# Disabled experiment: XGBoost regressor fitted on x_train/y_train and scored on x_valid.
# Every line below is commented out, so this cell executes nothing.
#from xgboost import XGBRegressor
#from sklearn.metrics import mean_absolute_error
#from sklearn.metrics import mean_squared_error

#XGB_modelR = XGBRegressor(base_score=0.5, booster='gbtree', eval_metric='mlogloss',
 #             gamma=0, tree_method = 'gpu_hist', gpu_id=-1,learning_rate=0.01,
  #            max_delta_step=0, max_depth=6, objective = 'reg:squarederror', n_estimators=60, random_state=0,
  #            reg_alpha=0, reg_lambda=1, use_label_encoder=False,
   #           validate_parameters=1,verbosity=None) # 
#XGB_modelR.fit(x_train, y_train)
#y_predict4 = XGB_modelR.predict(x_valid)
#print("Mean Absolute Error", mean_absolute_error(y_valid, y_predict4))
#print("Mean Squared Error", mean_squared_error(y_valid, y_predict4))

## 6. Prediction 

In [14]:
#Using Model 2: XGBoost 
# Disabled: depends on XGB_modelR, which is only defined in commented-out code,
# so this cell executes nothing.
#y_testXGB_R = XGB_modelR.predict(test_data)#.drop("id", axis = 1))
#print("predicted values:", y_testXGB_R)

#import matplotlib.pyplot as plt
#plt.hist(y_testXGB_R, 20, ec = 'black')

In [15]:
# Build the LightGBM model, fit it on the training split and score it on the validation split
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score

# Gradient-boosted regression trees trained on the GPU device.
# NOTE(review): device="gpu" presumably needs a GPU-enabled LightGBM build; confirm
# before running this cell outside the original environment.
model_LGB = lgb.LGBMRegressor(boosting_type='gbdt', device="gpu", num_leaves=31, max_depth=20,
                              learning_rate=0.1, objective='regression', n_estimators=60, n_jobs=-1,
                              subsample_for_bin=200000)

model_LGB.fit(x_train, y_train)
y_predict_lgb = model_LGB.predict(x_valid)
print("Mean Absolute Error", mean_absolute_error(y_valid, y_predict_lgb))
print("Mean Squared Error", mean_squared_error(y_valid, y_predict_lgb))
# "claim" is a 0/1 target, so also report how well the continuous scores rank the
# positive rows above the negative ones; MAE and MSE alone do not show that.
print("ROC AUC", roc_auc_score(y_valid, y_predict_lgb))

In [16]:
# Using Model 3: LGBM - score the test set and look at the distribution of the predictions
import matplotlib.pyplot as plt

# test_data is passed with all of its columns, mirroring X, which was built by
# dropping only "claim" from the training frame.
y_test_lgb = model_LGB.predict(test_data)
print("predicted values:", y_test_lgb)

# Histogram of the predicted values in 100 bins, drawn on the current axes
ax = plt.gca()
ax.hist(y_test_lgb, bins=100, edgecolor='black')
ax.set_xlabel('predicted claim')
ax.set_ylabel('count')
plt.show()

In [17]:
# Print the LightGBM test-set predictions again (same output as the print in the previous cell)
print("predicted values:", y_test_lgb)

## 7. Prepare the submission file

In [18]:
# Pair every test id with its predicted claim value
submission_columns = {'id': test_data['id'], 'claim': y_test_lgb}
my_submission = pd.DataFrame(submission_columns)
# Any filename would work; 'submission.csv' is used here. The index is left out
# so the file contains only the id and claim columns.
my_submission.to_csv('submission.csv', index=False)