In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Let pandas show up to 50 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 50)

In [None]:
# Read the merged dataset from its CSV file into a DataFrame.
merged_csv_path = "data/merged.csv"
merged = pd.read_csv(merged_csv_path)

## CHECK FOR MISSING VALUES

In [None]:
# Percentage of missing values per column, rounded up to the next whole percent.
# The null counts are computed once and reused (previously they were computed
# and discarded), and the result is built in a single vectorised step so the
# 'MissingVal%' column has a numeric dtype instead of object.
missing_counts = merged.isnull().sum()
missing_values_df = np.ceil(missing_counts / merged.shape[0] * 100).to_frame('MissingVal%')

missing_values_df

## CHECKING FOR DATA TYPES

In [None]:
# Print the DataFrame summary produced by DataFrame.info() to inspect column dtypes.
merged.info()

In [None]:
# Keep only the rows whose 'national_inv' value is present.
# NOTE(review): the original note says this removes the two all-NA rows —
# presumably those are the only rows missing 'national_inv'; confirm.
# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
merged = merged[merged['national_inv'].notna()].copy()

In [None]:
# Columns stored as float64 are used as the quantitative variables.
quantvars = merged.select_dtypes(include=['float64']).columns.to_list()

# Columns stored as object are the categorical ones; 'sku' is dropped from the list.
cat_cols = merged.select_dtypes(include='object').columns.to_list()
cat_cols.remove('sku')
for category in cat_cols:
    print(f'👉 {category} has {merged[category].nunique()} values')

# list.remove() mutates the list in place and returns None, so its result must
# not be assigned. Remove the target column first, then bind catpred to the
# remaining (predictor) columns.
cat_cols.remove('went_on_backorder')
catpred = cat_cols

### Encoding Categorical Columns

In [None]:
# Replace each column listed in cat_cols with the integer codes of its
# categorical representation.
for category in cat_cols:
    as_categorical = merged[category].astype('category')
    merged[category] = as_categorical.cat.codes

In [None]:
# Replace the value -99 with NaN in both performance-average columns so it is
# handled as missing data.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
for perf_col in ('perf_12_month_avg', 'perf_6_month_avg'):
    merged[perf_col] = merged[perf_col].replace(-99, np.nan)

merged.head(5)

### DESCRIPTIVE STATISTICS AND PLOTS

In [None]:
# Summary statistics for the quantitative columns, shown with one row per column.
quant_summary = merged[quantvars].describe()
quant_summary.T

In [None]:
# (x column, y column) for every panel, listed row by row across the 3x3 grid.
scatter_pairs = [
    ("forecast_3_month", "sales_3_month"),
    ("forecast_6_month", "sales_6_month"),
    ("forecast_9_month", "sales_9_month"),
    ("national_inv", "sales_1_month"),
    ("national_inv", "sales_3_month"),
    ("national_inv", "sales_6_month"),
    ("national_inv", "sales_9_month"),
    ("in_transit_qty", "national_inv"),
    ("in_transit_qty", "lead_time"),
]

fig, axs = plt.subplots(3, 3, figsize=(15, 6))
# axs.flat walks the grid in row-major order, matching the order of scatter_pairs.
for panel, (x_col, y_col) in zip(axs.flat, scatter_pairs):
    sns.scatterplot(x=merged[x_col], y=merged[y_col], ax=panel)

plt.savefig("../images/Scatter_plot_for_sales.png")

In [None]:
n_columns = 3
# Enough rows to give every quantitative column its own panel (at least one row),
# instead of a fixed count that breaks when the number of columns changes.
n_rows = max(1, -(-len(quantvars) // n_columns))
# squeeze=False keeps axs two-dimensional even for a single-row grid.
box_fig, axs = plt.subplots(
    n_rows, n_columns, figsize=(8 * n_columns, 5 * n_rows), squeeze=False
)
panels = axs.ravel()
for i, c in enumerate(quantvars):
    sns.boxplot(y=c, data=merged, ax=panels[i])

# Hide panels left over when the grid has more cells than columns to plot.
for spare_panel in panels[len(quantvars):]:
    spare_panel.set_visible(False)

# suptitle labels the whole figure; plt.title would only label the last axes.
box_fig.suptitle("Boxplot with Quantitative Columns")
box_fig.tight_layout()

# Save before showing: calling savefig after plt.show() can write an empty image
# because the shown figure is no longer the current one.
box_fig.savefig("../images/Boxplot_quant_cols.png")
plt.show()

In [None]:
# Bind catpred to the same list object as cat_cols (an alias, not a copy);
# the bare name on the last line displays the list as the cell output.
catpred = cat_cols
catpred

In [None]:
# (column, panel title) for each bar chart, listed row by row across the 3x2 grid.
flag_panels = [
    ("stop_auto_buy", "Stop Auto Buy"),
    ("potential_issue", "Potential Issue"),
    ("oe_constraint", "OE Constraint"),
    ("rev_stop", "Rev Stop"),
    ("ppap_risk", "PPAP Risk"),
    ("deck_risk", "Deck Risk"),
]

fig, axs = plt.subplots(3, 2, figsize=(10, 5))
# Each panel shows the counts of one column cross-tabulated against went_on_backorder.
for panel, (flag_col, panel_title) in zip(axs.flat, flag_panels):
    counts = pd.crosstab(merged[flag_col], merged.went_on_backorder)
    counts.plot(kind='bar', ax=panel, title=panel_title)
plt.savefig("../images/Categories_with_targets.png")

### Correlation Plot