In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import f_classif, mutual_info_classif
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import LabelEncoder

import statsmodels.api as sm
import scipy.stats as stats

from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import hvplot.pandas

## The Plan
1. Imports
2. Load our dataset
3. info, describe, head, check duplicates
   
4. Data Exploration
   1. Complaints distribution
   2. Spending Behavior and Complaints: Customers who have a higher amount deposited via counter or card, or those who make larger purchases (higher 'amount' and 'quantity'), might be more or less likely to complain depending on their spending patterns and their expectations of service.
   3. Points Accumulation and Complaints: There might be a relationship between the number of points a customer has in different categories ('restaurant_points', 'fuel_points', 'groceries_points', 'toys_points', 'cash_back_points') and their propensity to complain. For example, customers who have more points might have higher expectations and therefore might be more likely to complain, or alternatively, they might be more satisfied customers and thus less likely to complain.
   4. Coupon Usage and Complaints: It's possible that customers who use coupons ('used_coupon') are more price-sensitive and could be more likely to complain. Alternatively, if the coupon application process is not smooth, it might lead to complaints.
   5. Product Discount and Complaints: Customers who buy discounted products ('product_discounted') might have different expectations from those who buy full-priced products, which could affect their propensity to complain.
   6. Card Vendor and Complaints: There might be a relationship between the card vendor ('card_vendor') and the likelihood of a complaint, especially if certain card vendors are associated with more issues or better customer service.
   7. Gender and Complaints: The customer's gender ('cust_gender') might have some influence on the likelihood of a complaint, although it's important to be careful with assumptions based on demographic variables like this, as they can easily lead to unfair or biased models.
   8. Age and Complaints: The age of the customer ('cust_age') could also potentially be related to their propensity to complain.
   
5. Data pre-processing
   1. Correlation matrix
   2. Null Values
   3. Outliers
   4. Feature engineering
   5. Dataset Split
   
6. Machine Learning
   1. Random Forest with GridSearchCV
   2. Gradient Boosting (XGBoost) with GridSearchCV
   
7. Conclusion

### 1. Dataset

In [None]:
# Load the raw store-complaints CSV (path is relative to the notebook's
# working directory) and show its (rows, columns) shape as the cell output.
df = pd.read_csv("data/store_complains_dataset.csv")
df.shape

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Preview the first rows to sanity-check how the CSV was parsed.
df.head()

In [None]:
# Summary statistics of the frame's columns (pandas describes numeric columns by default).
df.describe()

In [None]:
# Count rows that are exact repeats of an earlier row (the first occurrence
# of each group is not flagged), then report the total.
is_repeated_row = df.duplicated(keep='first')
duplicate_rows_count = is_repeated_row.sum()

print('Duplicates rows {}'.format(duplicate_rows_count))

### 2. Data Exploration

In [3]:
# Distribution of the target: how many transactions ended in a complaint.
#
# Aggregate first and plot the counts explicitly. Passing the raw frame with
# only `x` makes Plotly Express draw one bar mark per row, which is slow on a
# full dataset and leaves the y-axis meaning implicit.
complaint_counts = (
    df['complained']
    .value_counts()
    .rename_axis('complained')
    .reset_index(name='count')
)

fig = px.bar(
    complaint_counts,
    x='complained',
    y='count',
    title='Distribution of Complaints',
)

# Show the plot
fig.show()

In [None]:
# Small hand-made frame used only to demonstrate the complaint bar chart.
# It is deliberately kept under its own name: rebinding `df` here would
# discard the dataset loaded from CSV, which the later cells still rely on.
# (pandas / plotly are already imported in the first cell.)
sample_data = {
    'customer_registration_number': ['C001', 'C002', 'C003', 'C004', 'C005'],
    'amount_deposited_via_counter': [100, 200, 150, 300, 250],
    'amount_deposited_via_card': [50, 100, 75, 150, 125],
    'complained': ['no', 'yes', 'no', 'yes', 'yes']  # string labels: 'no' / 'yes'
}

sample_df = pd.DataFrame(sample_data)

# Aggregate to one row per label so that x and y have matching lengths;
# passing `value_counts()` directly as `y` against the 5-row frame fails
# because the two arguments have different lengths.
sample_complaint_counts = (
    sample_df['complained']
    .value_counts()
    .rename_axis('complained')
    .reset_index(name='count')
)

# Create a bar chart
fig = px.bar(
    sample_complaint_counts,
    x='complained',
    y='count',
    title='Distribution of Complaints',
)

# Show the plot
fig.show()


In [None]:
# Render hvplot output with the Bokeh backend.
hvplot.extension('bokeh')

# Tally each value of the target column, then draw the tally as a bar chart.
complained_tally = df['complained'].value_counts()
bar_options = dict(
    title="Complained",
    xlabel='User complained',
    ylabel='Count',
    width=500,
    height=350,
)
is_default_bar = complained_tally.hvplot.bar(**bar_options)

is_default_bar

In [None]:
# Pairwise correlation heatmap of the numeric columns.
#
# Only numeric columns are correlated: since pandas 2.0 `DataFrame.corr()`
# no longer silently skips object/date columns and raises instead, so the
# selection is made explicit here.
numeric_corr = df.select_dtypes(include='number').corr()

plt.figure(figsize=(12, 8))
sns.heatmap(numeric_corr, annot=True, cmap='viridis')

In [None]:
# Re-check the schema of `df` before choosing which columns to train on.
df.info()

In [None]:
# Columns removed before modelling (identifier, dates, the raw `complained`
# label and several categorical fields); everything else is kept.
excluded_columns = [
    'customer_registration_number',
    'merchandize_category',
    'balance_on_complaign_date',
    'transaction_date',
    'complaint_date',
    'complained',
    'Order_tyPe',
    'card_vendor',
    'used_coupon',
    'product_discounted',
]
df_to_train = df.drop(columns=excluded_columns)

In [None]:
# Confirm which columns and dtypes remain in the training frame.
df_to_train.info()

In [None]:
# Check for missing values
missing_values_count = df_to_train.isnull().sum()
missing_values_count

In [None]:
# Discard every row that has at least one missing value, then recount the
# missing entries per column to confirm none remain.
df_to_train = df_to_train.dropna(axis=0, how='any')
missing_values_count = df_to_train.isna().sum()
missing_values_count

In [None]:
# Preview the cleaned training frame.
df_to_train.head()

In [None]:
# Splitting our dataset to train and test
X_train, X_test, y_train, y_test = train_test_split(df_to_train, df_to_train["complained_num"], test_size=0.3, random_state=22)

X_train.groupby(['complained_num',]).size()

In [None]:
# Remove the target column from both feature frames so the models cannot
# read the label directly.
X_train, X_test = (
    split.drop(columns='complained_num') for split in (X_train, X_test)
)

In [None]:

# Hyper-parameter search space for the random forest.

# Ten tree counts evenly spaced over [10, 80], truncated to whole numbers
n_estimators = np.linspace(start=10, stop=80, num=10).astype(int).tolist()

# Features considered at each split
max_features = ['sqrt']

# Maximum depth of each tree
max_depth = [2, 8]

# Minimum samples needed to split an internal node
min_samples_split = [2, 5]

# Minimum samples required in a leaf
min_samples_leaf = [1, 2]

# Whether each tree is fitted on a bootstrap sample
bootstrap = [True, False]

param_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
print(param_grid)

In [None]:
# Seeded NumPy random generator handed to the forest as its randomness source.
rng = np.random.RandomState(seed=54)

# Base estimator for the grid search; hyper-parameters are supplied by the grid.
model_rf = RandomForestClassifier(random_state=rng)

In [None]:
# Exhaustive search over `param_grid` with 3-fold cross-validation, using
# 4 parallel workers; no `scoring` is given, so the estimator's own score is used.
rf_Grid = GridSearchCV(
    model_rf,
    param_grid,
    cv=3,
    verbose=0,
    n_jobs=4,
)
rf_Grid.fit(X_train, y_train)

In [None]:
# Predict the held-out set with the fitted grid search and print the
# per-class precision / recall / F1 report.
y_rf_pred = rf_Grid.predict(X_test)
rf_report = classification_report(y_true=y_test, y_pred=y_rf_pred)
print(rf_report)


In [None]:
# Score the fitted random-forest search on both splits to compare
# training fit against held-out performance.
rf_train_accuracy = rf_Grid.score(X_train, y_train)
rf_test_accuracy = rf_Grid.score(X_test, y_test)

print(f'Train Accuracy - : {rf_train_accuracy:.3f}')
print(f'Test Accuracy - : {rf_test_accuracy:.3f}')

In [None]:
# Base XGBoost classifier with a binary-logistic objective, 4 threads and a fixed seed.
xg_model = XGBClassifier(objective='binary:logistic', nthread=4, seed=42)

# Search space: tree depth 2..9, 60/100/140/180 boosting rounds, three learning rates.
parameters = dict(
    max_depth=range(2, 10),
    n_estimators=range(60, 220, 40),
    learning_rate=[0.1, 0.01, 0.05],
)

# 5-fold grid search that ranks candidates by ROC AUC.
xg_classfier = GridSearchCV(
    xg_model,
    parameters,
    scoring='roc_auc',
    n_jobs=10,
    cv=5,
    verbose=True,
)

xg_classfier.fit(X_train, y_train)

In [None]:
# Predict the held-out set with the fitted XGBoost search and print the
# per-class precision / recall / F1 report.
y_xg_pred = xg_classfier.predict(X_test)
xg_report = classification_report(y_true=y_test, y_pred=y_xg_pred)
print(xg_report)

In [None]:
# `xg_classfier` was built with scoring='roc_auc', so `xg_classfier.score(...)`
# returns ROC AUC, not accuracy. Compute accuracy explicitly from the predicted
# labels so these numbers match their labels and are comparable with the
# random-forest cell, and report the ROC AUC under its own name.
xg_train_accuracy = accuracy_score(y_train, xg_classfier.predict(X_train))
xg_test_accuracy = accuracy_score(y_test, xg_classfier.predict(X_test))

print(f'Train Accuracy - : {xg_train_accuracy:.3f}')
print(f'Test Accuracy - : {xg_test_accuracy:.3f}')
print(f'Train ROC AUC - : {xg_classfier.score(X_train, y_train):.3f}')
print(f'Test ROC AUC - : {xg_classfier.score(X_test, y_test):.3f}')