In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Input CSV locations, given relative to the notebook's working directory.
# NOTE(review): `data_dict` holds the path here and is rebound to the loaded
# DataFrame in the loading cell.
train_file = "train.csv"
test_file = "test.csv"
data_dict = "data_dict.csv"

In [None]:
# Load the training set, the test set and the data dictionary from their CSV files.
train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)
# `data_dict` starts out as the CSV path and is rebound to the loaded frame.
# Only read it while it is still a path, so re-running this cell does not pass
# an already-loaded DataFrame to pd.read_csv.
if isinstance(data_dict, str):
    data_dict = pd.read_csv(data_dict)

In [None]:
# Positional column slices: every column but the last is treated as a feature,
# and the last column (kept as a one-column frame) as the target.
_feature_cols = slice(None, -1)
_target_col = slice(-1, None)

x_train, y_train = train_data.iloc[:, _feature_cols], train_data.iloc[:, _target_col]
x_test, y_test = test_data.iloc[:, _feature_cols], test_data.iloc[:, _target_col]

# Report the feature-matrix shapes (test first, then train).
for _frame in (x_test, x_train):
    print(_frame.shape)

In [None]:
def get_unique_value(data):
    """Collect the distinct values of every column of a DataFrame.

    Args:
        data: DataFrame whose columns are inspected.

    Returns:
        dict mapping each column name to the result of ``Series.unique()``
        for that column, in column order.
    """
    return {column: data[column].unique() for column in data.columns}

In [None]:
# Show the distinct values of every column in the training set.
get_unique_value(train_data)

In [None]:
# Display the data dictionary frame.
data_dict

In [None]:
def get_type_var(data, type_col_name="Type", var_col_name="Variable"):
    """Split the variables listed in a data dictionary into numeric and categorical.

    Args:
        data: DataFrame with one row per variable.
        type_col_name: Column of ``data`` holding each variable's type label.
        var_col_name: Column of ``data`` holding each variable's name.

    Returns:
        tuple ``(numeric_list, categorical_list)`` of variable names, each in
        row order. A variable is numeric only when its type label equals
        ``"Numeric"`` exactly; every other row (including missing labels) is
        treated as categorical.
    """
    # Boolean mask as a plain array so selection is positional and does not
    # depend on the frame's index labels.
    is_numeric = (data[type_col_name] == "Numeric").to_numpy()
    variables = data[var_col_name]

    numeric_list = variables[is_numeric].tolist()
    categorical_list = variables[~is_numeric].tolist()

    return numeric_list, categorical_list

In [None]:
# Split the variable names from the data dictionary into numeric / categorical lists.
numeric_features, categorical_features = get_type_var(data_dict)
# Display the categorical variable names.
categorical_features

In [None]:
# Summary statistics of the numeric feature columns in the training set.
train_data[numeric_features].describe()

In [None]:
# Summary statistics of the categorical feature columns in the training set.
train_data[categorical_features].describe()

# Visualisation

In [None]:
# visualise distribution
def plot_distribution(feature):
    """Plot the kernel density estimate of one feature, one curve per churn value.

    Reads the global ``train_data`` frame and colours the curves by its
    ``"churn"`` column (the palette expects the values 0 and 1).

    Args:
        feature: Name of the ``train_data`` column to plot.
    """
    sns.set_style("ticks")
    s = sns.FacetGrid(train_data, hue="churn", aspect=2.5, palette={1: 'Lightblue', 0: "pink"})
    # `fill=True` is the current spelling of the deprecated `shade=True` keyword.
    s.map(sns.kdeplot, feature, fill=True, alpha=0.8)
    # Clip the x-axis to the range from 0 up to the feature's maximum.
    s.set(xlim=(0, train_data[feature].max()))
    s.add_legend()
    # A KDE curve shows probability density, so label the y-axis accordingly.
    s.set_axis_labels(feature, "Density")
    plt.show()

In [None]:
# Draw one churn-split density plot per numeric feature.
for feature in numeric_features:
    plot_distribution(feature)

In [None]:
def categorical_visualisation(feature, churn_col_name="churn"):
    """Draw a grouped bar chart of churn vs. stay counts per value of a feature.

    Reads the global ``train_data`` frame. For every distinct value of
    ``feature`` the churn column of the matching rows is summed to get the
    "churn" count; the remaining matching rows form the "stay" count.
    Assumes the churn column holds 0/1 flags so that its sum is a row count
    (TODO confirm against the data dictionary).

    Args:
        feature: Name of the ``train_data`` column to group by.
        churn_col_name: Name of the churn indicator column.
    """
    index_value = train_data[feature].unique().tolist()

    churn = []
    stay = []

    for value in index_value:
        col_data = train_data.loc[train_data[feature] == value, churn_col_name]

        # Sum once and reuse it for both bars.
        churned = col_data.sum()
        churn.append(churned)
        stay.append(len(col_data) - churned)

    new_df = pd.DataFrame({"churn": churn, "stay": stay}, index=index_value)

    new_df.plot.bar(rot=0)

    # The title is built from the `feature` argument.
    plt.title("Churn rate by " + str(feature))
    plt.show()

In [None]:
# Draw one churn/stay bar chart per categorical feature.
for feature in categorical_features:
    categorical_visualisation(feature)

# Feature Engineering

**TODO:** feature engineering steps are still to be added.

# Modelling

In [None]:
# Split ratio train_data : val_data
split_ratio = 0.8

# Positional (unshuffled) split: the first `split_ratio` share of the rows is
# kept for training and the remaining rows for validation. Plain `.iloc`
# slicing is used instead of np.split on a DataFrame.
split_index = int(train_data.shape[0] * split_ratio)
split_data = [train_data.iloc[:split_index], train_data.iloc[split_index:]]

train_data_n, val_data = split_data

# Report the shapes of the two resulting subsets.
print(train_data_n.shape)
print(val_data.shape)

In [None]:
# Import library
from sklearn.ensemble import RandomForestClassifier

# Build model
# Features: every column of the training subset except the last, passed
# through pd.get_dummies. Target: the "churn" column.
train_features_n = train_data_n.iloc[:, :-1]
X = pd.get_dummies(train_features_n)
y = train_data_n["churn"]

# `fit` returns the fitted estimator, so construction and training are chained.
model = RandomForestClassifier(
    n_estimators=400,
    max_depth=5,
    random_state=1,
).fit(X, y)

print(".... Finish training model.")

In [None]:
# Build the validation feature matrix the same way as the training matrix `X`:
# drop the last column (excluded from the features at training time), encode
# with pd.get_dummies, then align to the training columns. Dummy columns
# missing from the validation rows are filled with 0, and columns unseen in
# training are dropped.
X_val = pd.get_dummies(val_data.iloc[:, :-1]).reindex(columns=X.columns, fill_value=0)
y_val = val_data["churn"].tolist()

predictions = model.predict(X_val)
pred_list = predictions.tolist()

In [None]:
# Pair up the true and predicted labels for every validation row.
n_val_rows = val_data.shape[0]
y_true = [y_val[row] for row in range(n_val_rows)]
y_pred = [pred_list[row] for row in range(n_val_rows)]

# Count the rows where the prediction equals the true label.
accurate_pred = sum(1 for truth, guess in zip(y_true, y_pred) if guess == truth)
total_pred = n_val_rows

print("The percentage of correct predictions is: ", accurate_pred / total_pred * 100)

In [None]:
# Confusion matrix
# Import confusion_matrix lib
from sklearn.metrics import confusion_matrix

# Build confusion matrix
# Take the class labels from the data instead of assuming exactly two classes,
# so the matrix and its row/column labels always have matching sizes.
class_labels = sorted(set(y_true) | set(y_pred))
cf_matrix = confusion_matrix(y_true, y_pred, labels=class_labels)

# Visualise confusion matrix
# Rows are the true labels and columns the predicted labels.
df_cm = pd.DataFrame(cf_matrix, index=class_labels, columns=class_labels)
plt.figure(figsize = (10,7))
# fmt="d" prints the counts as plain integers rather than scientific notation.
sns.heatmap(df_cm, annot=True, fmt="d")
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.show()

In [None]:
# test data
# Encode the test features and align them to the training matrix `X`: dummy
# columns missing from the test rows are filled with 0, and columns unseen in
# training are dropped.
X_test = pd.get_dummies(x_test).reindex(columns=X.columns, fill_value=0)
# Flatten the one-column target into a plain list of labels. np.ravel accepts
# both the original frame and an already-flattened list, so this cell can be
# re-run safely.
y_test = np.ravel(y_test).tolist()

predictions_test = model.predict(X_test)
pred_list_test = predictions_test.tolist()