In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Reference

* The EDA in this notebook is based on the kernel linked below; please check that notebook for more EDA as well.
* [Netflix Appetency : Stater EDA + Model](https://www.kaggle.com/gopidurgaprasad/netflix-appetency-stater-eda-model)

In [None]:
# Plotting libraries (static and interactive) plus the tqdm progress bar.
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from tqdm import tqdm

# Enable tqdm's pandas integration (progress-bar variants of apply-style methods).
tqdm.pandas()
from plotly.subplots import make_subplots

In [None]:
# Read train.csv into `train_df` and preview the first rows.
train_csv_path = '../input/netflix-appetency/train.csv'
train_df = pd.read_csv(train_csv_path)
train_df.head()

In [None]:
# Read test.csv into `test_df` and preview the first rows.
test_csv_path = '../input/netflix-appetency/test.csv'
test_df = pd.read_csv(test_csv_path)
test_df.head()

In [None]:
# Summary statistics of the training frame (pandas' default `describe` output).
train_df.describe()

# EDA

In [None]:
# Count missing values per column (largest first) and lay them out as a one-row frame.
null_counts = train_df.isna().sum(axis=0).sort_values(ascending=False)
null_count_df = null_counts.to_frame().T

# Mask the zero counts and drop those columns, keeping only columns that have nulls.
null_count_df = null_count_df.where(null_count_df > 0).dropna(axis=1)

display(null_count_df)
null_count_df.columns

## Drop full null columns

In [None]:
# Columns listed by the author as entirely null; removed from both splits.
# NOTE: this cell reassigns train_df/test_df, so running it twice raises a KeyError.
full_null_cols = ['feature_193', 'feature_196', 'feature_197', 'feature_198', 'feature_83']

train_df, test_df = (
    frame.drop(columns=full_null_cols) for frame in (train_df, test_df)
)

(train_df.shape, test_df.shape)

In [None]:
# Print the column dtypes and memory summary of the training frame.
train_df.info()

## Categorical Columns

In [None]:
# Preview the object-typed (string) columns.
object_preview = train_df.select_dtypes('object').head()
display(object_preview)

# Columns treated as dates in this notebook.
date_cols = [
    'feature_191', 'feature_192', 'feature_194', 'feature_195', 'feature_199',
    'feature_200', 'feature_201', 'feature_202', 'feature_203', 'feature_204',
]
display(train_df.loc[:, date_cols].head())

## Drop Date Columns

In [None]:
# Remove the date columns from both splits.
# NOTE: this cell reassigns train_df/test_df, so running it twice raises a KeyError.
train_df, test_df = (
    frame.drop(columns=date_cols) for frame in (train_df, test_df)
)

(train_df.shape, test_df.shape)

## Dropping Categorical Columns with a Single Category or 100+ Categories

In [None]:
# Number of distinct values per object column, smallest first, as a one-row frame.
cat_cardinality = train_df.select_dtypes('object').nunique().sort_values()
cat_cols = cat_cardinality.to_frame().T
display(cat_cols)

# Flag columns with exactly one category or with 100 or more categories.
is_droppable = cat_cardinality.eq(1) | cat_cardinality.ge(100)
drop_cols = cat_cardinality[is_droppable].index
drop_cols

In [None]:
# Remove the flagged categorical columns from both splits.
train_df, test_df = (
    frame.drop(columns=drop_cols) for frame in (train_df, test_df)
)

(train_df.shape, test_df.shape)

## Numerical Columns

In [None]:
# Preview the columns selected by the 'int' and 'float' dtype filters.
train_df.select_dtypes(['int', 'float']).head()

In [None]:
# Names that must never be used as model features.
target_column = 'target'
drop_columns = ['id', target_column]

# Numeric feature names: the int/float columns minus the id and the target.
numeric_frame = train_df.select_dtypes(['int', 'float'])
num_columns = list(numeric_frame.columns.drop(drop_columns, errors='ignore'))
train_df[num_columns].head()

In [None]:
# Distinct-value count per numeric feature, smallest first, as a one-row frame.
unique_counts = train_df[num_columns].nunique().sort_values()
num_unique = unique_counts.to_frame().T

# Keep only the columns whose count is exactly 1 (constant features).
one_num_df = num_unique.where(num_unique == 1).dropna(axis=1)
one_num_df.columns

In [None]:
# Remove the constant numeric columns from both splits.
constant_cols = one_num_df.columns
train_df, test_df = (
    frame.drop(columns=constant_cols) for frame in (train_df, test_df)
)

(train_df.shape, test_df.shape)

# Correlation Between Columns

## Column pairs with a correlation of 0.9 or more

In [None]:
# Pairwise correlation of the int/float columns. `id` is excluded; `target`
# is kept because a later cell reads `corr_train_df.loc['target', :]`.
CORR_THRESHOLD = 0.9
corr_train_df = train_df.select_dtypes(['int', 'float']).drop(columns=['id']).corr()

corr_cols = corr_train_df.columns
corr_values = corr_train_df.to_numpy()

# Look only above the diagonal so every pair is reported once, and compare the
# absolute value so strongly *negative* correlations are reported as well.
# NaN correlations compare as False and are therefore skipped.
upper_triangle = np.triu(np.ones(corr_values.shape, dtype=bool), k=1)
strong_rows, strong_cols = np.nonzero(
    upper_triangle & (np.abs(corr_values) >= CORR_THRESHOLD)
)

# np.nonzero walks the matrix row by row, matching the order of a nested i/j loop.
for i, j in zip(strong_rows, strong_cols):
    print(corr_cols[i], corr_cols[j], corr_values[i, j])

* Some column pairs have a correlation of `1.0`, i.e. they are effectively identical.
* Most of the listed pairs are very highly correlated.

# Target Distribution

In [None]:
# Share of each label (the arithmetic assumes `target` is coded 0/1).
print("Percentage of 1 Labels: ", (train_df['target'].sum() * 100) / train_df.shape[0])
print("Percentage of 0 Labels: ", (1 - (train_df['target'].sum() / train_df.shape[0])) * 100)

# Pass the column through the `x` keyword: a positional Series is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 onwards.
fig, ax = plt.subplots(figsize=(12, 9))
sns.countplot(x=train_df['target'], ax=ax)
ax.set_title('Target distribution')
plt.show()

# Target Correlation with Features

In [None]:
# Correlation of every numeric column with the target, largest value first,
# laid out as a one-row frame for display.
target_corr = corr_train_df.loc['target', :].sort_values(ascending=False).to_frame().T
display(target_corr)

# Remove the target's self-correlation by label rather than by position:
# a feature that also correlates at 1.0 could sort ahead of `target`, in which
# case slicing off "the first column" would drop the wrong entry.
fig, ax = plt.subplots(figsize=(25, 5))
target_corr.drop(columns=['target']).iloc[0].plot(ax=ax)
ax.set_title('Correlation of each feature with the target')
plt.show()

# PCA 

In [None]:
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

## Select only numerical columns

In [None]:
# Numeric training features: int/float columns without the id and the target.
numeric_train = train_df.select_dtypes(['int', 'float'])
train_num_df = numeric_train.drop(columns=['id', 'target'])

## Simple Imputing and MinMax Scaling

In [None]:
# Fit the median imputer on the training features, then fill the gaps.
simple_imputer = SimpleImputer(strategy='median').fit(train_num_df)
train_imputed_arr = simple_imputer.transform(train_num_df)

# Fit the min-max scaler on the imputed values, then rescale them.
scaler = MinMaxScaler().fit(train_imputed_arr)
train_imputer_scaled_arr = scaler.transform(train_imputed_arr)

# Back to a DataFrame carrying the original feature names.
train_imputed_df = pd.DataFrame(
    data=train_imputer_scaled_arr,
    columns=train_num_df.columns,
)

## Check PCA with different number of components

In [None]:
# Disabled experiment: fit PCA for several component counts and print the
# explained-variance ratios. Uncomment the lines below to rerun it.
# for components in [10, 20, 25, 30, 35]:
#     pca = PCA(n_components=components)
#     pca.fit(train_imputed_df)
#     print(f"Components [{components}] - {pca.explained_variance_ratio_}")

In [None]:
# Reduce the scaled numeric features to 50 principal components.
# `random_state` is fixed because PCA's automatic solver choice may pick the
# randomized SVD for large inputs, which would otherwise make reruns differ.
N_PCA_COMPONENTS = 50
PCA_SEED = 42

pca = PCA(n_components=N_PCA_COMPONENTS, random_state=PCA_SEED)
train_num_pca_df = pd.DataFrame(pca.fit_transform(train_imputed_df))
train_num_pca_df.head()

In [None]:
# Apply the transformers fitted on the training data to the test features.
# Select the columns by the training feature names instead of re-running a
# dtype filter on the test frame: a column inferred with a different dtype in
# test.csv would otherwise change the feature set and break (or silently
# misalign) the fitted imputer, scaler and PCA.
test_num_df = test_df[train_num_df.columns]
test_imputed_arr = simple_imputer.transform(test_num_df)
test_imputer_scaled_arr = scaler.transform(test_imputed_arr)
test_imputed_df = pd.DataFrame(test_imputer_scaled_arr, columns=test_num_df.columns)

test_num_pca_df = pd.DataFrame(pca.transform(test_imputed_df))
test_num_pca_df.head()

## Prepare training data with PCA

In [None]:
# Side-by-side: PCA components, the remaining object columns, and the target.
train_parts = [
    train_num_pca_df,
    train_df.select_dtypes('object'),
    train_df[['target']],
]
train_pca_df = pd.concat(train_parts, axis=1)
train_pca_df.head()

## Prepare test data with PCA

In [None]:
# Side-by-side: PCA components and the remaining object columns of the test split.
test_parts = [
    test_num_pca_df,
    test_df.select_dtypes('object'),
]
test_pca_df = pd.concat(test_parts, axis=1)
test_pca_df.head()

In [None]:
!pip install pycaret

# Pycaret AutoML

In [None]:
from pycaret.classification import *

In [None]:
# Distinct-value count per object column, highest cardinality first, as a one-row frame.
object_cardinality = train_pca_df.select_dtypes('object').nunique()
high_card_feat = object_cardinality.sort_values(ascending=False).to_frame().T
display(high_card_feat)
high_card_feat.columns

In [None]:
# Disabled check: number of object columns with more than 7 distinct categories.
# len(high_card_feat[high_card_feat > 7].dropna(axis=1).columns) # columns more 7 categorical features

In [None]:
# Configure the PyCaret classification experiment.
# `session_id` seeds PyCaret's random state so the train/hold-out split,
# resampling and model training are reproducible across runs.
# The 20 highest-cardinality object columns are frequency-encoded.
clf1 = setup(data=train_pca_df,
             target='target',
             session_id=42,
             remove_multicollinearity=True,
             feature_selection=True,
             combine_rare_levels=True,
             fix_imbalance=True,
             silent=True,
             high_cardinality_features=list(high_card_feat.columns[:20]),
             high_cardinality_method='frequency')

In [None]:
# Cross-validate the listed estimators and keep the best one.
# The list is passed through the `include` keyword: the first positional
# parameter of compare_models has not been the same in every PyCaret release.
best = compare_models(include=['lightgbm', 'xgboost', 'catboost', 'rf', 'ada'])

In [None]:
# Train a LightGBM classifier through PyCaret.
lgbm = create_model('lightgbm')

In [None]:
# Tune the LightGBM hyper-parameters, optimizing for AUC; rebinds `lgbm` to the tuned model.
lgbm = tune_model(lgbm, optimize='AUC')

In [None]:
# Show PyCaret's evaluation output for the tuned model.
evaluate_model(lgbm)

In [None]:
# Show PyCaret's interpretation plot for the tuned model.
interpret_model(lgbm)

In [None]:
# Refit the tuned model on the full training data (including the hold-out part
# PyCaret set aside in setup) before scoring the test set, so the submission
# does not come from a model that never saw part of the labelled data.
final_lgbm = finalize_model(lgbm)
test_pred_df = predict_model(final_lgbm, data=test_pca_df)

In [None]:
# Display the predictions before the score adjustment.
test_pred_df

In [None]:
# For rows predicted as label 0, replace Score with 1 - Score
# (presumably turning "probability of the predicted label" into
# "probability of label 1" -- confirm against predict_model's output).
# NOTE: not idempotent -- running this cell twice flips the scores back.
predicted_negative = test_pred_df['Label'] == 0
test_pred_df.loc[predicted_negative, 'Score'] = 1 - test_pred_df.loc[predicted_negative, 'Score']

In [None]:
# Display the predictions after the score adjustment.
test_pred_df

In [None]:
# Pair each test id with its adjusted score under the column name `target`.
submission_columns = {
    'id': test_df['id'],
    'target': test_pred_df['Score'],
}
submission_df = pd.DataFrame(submission_columns)

In [None]:
# Write the submission to submission.csv in the working directory, without the index column.
submission_df.to_csv('submission.csv', index=False)