In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, train_test_split

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Read the training set, the test set and the sample submission, in that order.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/allstate-claims-severity/train.csv',
        '../input/allstate-claims-severity/test.csv',
        '../input/allstate-claims-severity/sample_submission.csv',
    )
)

In [None]:
# (rows, columns) of each loaded frame, printed on a single line.
print(*(frame.shape for frame in (train, test, sample)))

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

'loss' is our target variable. This is a regression problem.

## EDA and Preprocessing

In [None]:
# Target distribution: histogram only on the left panel, density curve only on the right.
fig, ax = plt.subplots(1, 2, figsize=(20, 8))
hist_ax, kde_ax = ax
loss_values = train['loss']
sns.distplot(loss_values, kde=False, ax=hist_ax)
sns.distplot(loss_values, hist=False, ax=kde_ax)

In [None]:
# Count how many columns share each dtype.
column_dtypes = train.dtypes
column_dtypes.value_counts()

We have 116 categorical features and 14 numerical features. 

In [None]:
# Number of columns that contain at least one missing value.
train.isnull().any(axis=0).sum()

There are no missing values in the dataset.

In [None]:
# Summary statistics of the numeric columns (quartiles spelled out explicitly).
train.describe(percentiles=[0.25, 0.5, 0.75])

Some insights:
1. For all numerical features, the mean is approximately 0.5 and the standard deviation is approximately 0.2.
2. Median values vary from 0.45 to 0.55.

Let's plot some numerical features first.

In [None]:
# Pairwise correlations between the numeric columns.
# The categorical (object) columns are still strings at this point, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so the numeric
# columns are selected explicitly. This also works on older pandas versions.
train.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlations between numeric columns.
# Non-numeric columns are excluded explicitly because DataFrame.corr() raises
# on string columns in pandas >= 2.0.
plt.figure(figsize=(14,10))
sns.heatmap(train.select_dtypes(include='number').corr(), annot=True)

1. cont1 and cont10, cont9 and cont6, and cont10 and cont6 are highly correlated.
2. No feature is highly correlated with the 'loss' column.

In [None]:
def _off_diagonal_correlations(frame):
    """Return the flattened correlation matrix of `frame`'s numeric columns without its diagonal.

    Non-numeric columns are dropped first because DataFrame.corr() raises on
    string columns in pandas >= 2.0. The diagonal (each column against itself)
    is removed by position rather than by comparing values to 1, which is
    fragile under floating-point rounding and would also discard genuinely
    perfectly correlated pairs.
    """
    corr = frame.select_dtypes(include='number').corr().values
    off_diagonal = ~np.eye(corr.shape[0], dtype=bool)
    return corr[off_diagonal]


# The target is excluded so that train and test are compared on features only.
train_correlations = _off_diagonal_correlations(train.drop(["loss"], axis=1))
test_correlations = _off_diagonal_correlations(test)

# Overlay both distributions to check that train and test share the same correlation structure.
plt.figure(figsize=(20,5))
sns.distplot(train_correlations, color="Red", label="train")
sns.distplot(test_correlations, color="Green", label="test")
plt.xlabel("Correlation values found in train (except 1)")
plt.ylabel("Density")
plt.title("Are there correlations between features?");
plt.legend();

Some feature pairs have correlation values between 0.5 and 1. We need to remove one feature from each such highly correlated pair.

In [None]:
# Pairwise scatter plots of cont1..cont7 on a 10% sample of the training rows.
# A fixed random_state keeps the sample, and therefore the figure, reproducible across runs.
sns.pairplot(
    train.sample(frac=0.1, random_state=123),
    vars=['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7'],
)

In [None]:
# cont1..cont7 (x axis) against cont8..cont14 (y axis) on a 10% sample of the training rows.
# A fixed random_state keeps the sample, and therefore the figure, reproducible across runs.
sns.pairplot(
    train.sample(frac=0.1, random_state=123),
    x_vars=['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7'],
    y_vars=['cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14'],
)

In [None]:
# Pairwise scatter plots of cont8..cont14 on a 10% sample of the training rows.
# A fixed random_state keeps the sample, and therefore the figure, reproducible across runs.
sns.pairplot(
    train.sample(frac=0.1, random_state=123),
    vars=['cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14'],
)

In [None]:
# cont8..cont14 (x axis) against cont1..cont7 (y axis) on a 10% sample of the training rows.
# A fixed random_state keeps the sample, and therefore the figure, reproducible across runs.
sns.pairplot(
    train.sample(frac=0.1, random_state=123),
    x_vars=['cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14'],
    y_vars=['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7'],
)

Highly correlated features:
1. cont1 and cont9
2. cont11 and cont12
3. cont10 and cont6

Let's remove one from these pairs.

In [None]:
# Remove one column from each highly correlated pair, in both train and test
# so the two frames keep the same feature set.
# errors='ignore' makes the cell safe to re-run: without it a second execution
# raises KeyError because the columns are already gone.
redundant_cols = ['cont1', 'cont11', 'cont10']
train = train.drop(columns=redundant_cols, errors='ignore')
test = test.drop(columns=redundant_cols, errors='ignore')

Now, the categorical features.

In [None]:
# One count plot per categorical feature (cat1..cat116) on a 39 x 3 grid.
n_cat_features = 116
fig,axes = plt.subplots(39,3,figsize=(20,180))
ax = axes.flatten()

for i in range(n_cat_features):
    # The column is passed as x= explicitly: newer seaborn releases interpret
    # the first positional argument as `data`, not as the x variable.
    sns.countplot(x=train[f'cat{i+1}'], ax=ax[i])

# The grid has 117 cells for 116 features; hide the unused axes instead of
# leaving an empty frame at the end of the figure.
for unused_ax in ax[n_cat_features:]:
    unused_ax.set_visible(False)

Insights:
1. The 'B' value is less frequent than 'A' in all features.
2. We can bin the features with more than 10 values, but for now let's go with label encoding.

In [None]:
# Integer-encode every object-typed column of train, and the same columns of test.
cat_cols = train.select_dtypes(include='object').columns

le = LabelEncoder()

for col in cat_cols:
    # Fit on the union of the values seen in train and test, so that a category
    # present in only one of the two frames can still be transformed.
    labels = list(set(train[col].unique()).union(test[col].unique()))
    le.fit(labels)

    train[col], test[col] = le.transform(train[col]), le.transform(test[col])

## Baseline Submission 

In [None]:
# Features: every column of train except the target.
# NOTE(review): if train carries a row-identifier column it is still part of X
# and will be used as a feature — confirm whether that is intended.
X = train.drop(columns=['loss'])
# Target: log(loss + 1); predictions made in this space must be inverted
# before they are interpreted as losses.
y = np.log(1 + train['loss'])

In [None]:
# Hold out 10% of the rows for validation. The split is seeded so that the
# reported score is reproducible and comparable across runs.
X_train,X_val,y_train, y_val = train_test_split(X,y,test_size=0.1, random_state=123)

model = LGBMRegressor(n_estimators=300, learning_rate=0.1, random_state=123)
model.fit(X_train,y_train)
preds = model.predict(X_val)

# MAE in the log(loss + 1) space the model is trained in
# (arguments in scikit-learn's documented (y_true, y_pred) order).
print(mean_absolute_error(y_val, preds))
# MAE after mapping both targets and predictions back to the original loss scale.
print('MAE on original loss scale:', mean_absolute_error(np.expm1(y_val), np.expm1(preds)))

In [None]:
# Pair each feature with the importance reported by the fitted model, sorted ascending.
importance_pairs = sorted(zip(model.feature_importances_, X.columns))
feature_imp = pd.DataFrame(importance_pairs, columns=['Value', 'Feature'])

# Plot with the most important features at the top of the chart.
ranked_importance = feature_imp.sort_values(by="Value", ascending=False)
fig, ax = plt.subplots(1, 1, figsize=(20, 30))
sns.barplot(x="Value", y="Feature", data=ranked_importance, ax=ax)
ax.set_title('LightGBM Features')

In [None]:
# The model is trained on log(loss + 1), so its raw output is on the log scale.
# np.expm1 inverts that transform so the submission contains actual loss values.
test_predictions = np.expm1(model.predict(test))
sample['loss'] = test_predictions
sample.to_csv('submission.csv', index=False)

In [None]:
# Distribution of the values stored in the submission's 'loss' column.
plt.figure(figsize=(8, 6))
submitted_loss = sample['loss']
sns.distplot(submitted_loss, ax=plt.gca())

## References:

1. https://www.kaggle.com/sharmasanthosh/exploratory-study-on-ml-algorithms
Learned a lot from this amazing kernel.