In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier


# Ignoring warnings.
# NOTE(review): called without a category, this filter hides every warning for the rest of the
# notebook, including any raised by pandas or scikit-learn in the cells below.
import warnings
warnings.filterwarnings('ignore')


# Loading the data set from Google Drive and previewing its first 5 rows.
# The sharing link cannot be read directly, so the file id (the second-to-last
# segment of the link) is extracted and placed into a direct-download link.
url = 'https://drive.google.com/file/d/1EW-XMnGfxn-qzGtGPa3v_C63Yqj2aGf7/view?usp=sharing'
file_id = url.split('/')[-2]
path = 'https://drive.google.com/uc?export=download&id=' + file_id
df = pd.read_csv(path)
df.head()

In [None]:
# Encoding only the non-numeric columns into integer codes, so that the model receives purely numeric input
# and we do not get a data type error.
# Numeric columns are left untouched: label-encoding them would replace every real value with its rank
# among the column's unique values, which makes the table misleading to read.
# Each column gets its own encoder, stored in "encoders", so codes can be translated back later with
# encoders[column].inverse_transform(...).
# Same values of a column in the original table get the same numeric values in the new table.
labeled = df.copy()
encoders = {}
for column in labeled.select_dtypes(exclude="number").columns:
    le = LabelEncoder()
    labeled[column] = le.fit_transform(labeled[column])
    encoders[column] = le


# Separating the first 13 columns as features and the column at position 13 as the criteria output.
X = labeled.iloc[:, 0:13]
y = labeled.iloc[:, 13]


# Applying MinMax scaling to bring every feature into the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows before the cross-validation split, so the test folds
# contribute to the min/max values; confirm this is acceptable for the task.
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(X)


# Printing first 5 rows of the encoded table to show what it looks like.
labeled.head()

# In the encoded table, numeric columns keep their original values (for example "age" still shows the
# real age, provided it is stored as a number); only the non-numeric columns are shown as integer codes.

In [None]:
# Creating an empty list to collect the accuracy of each fold.
# It is re-created in this cell so that re-running the cell does not keep scores from a previous run.
scores = []


# Using Random Forest Classifier for the evaluation.
# The fixed seed makes the forest, and therefore the reported scores, reproducible between runs.
rf = RandomForestClassifier(random_state=42)


# Using 5-fold cross validation as told in the task explanation, which splits the data set into 5 folds.
# The folds are taken in row order (shuffle=False). random_state is intentionally not passed here:
# it has no effect without shuffling, and recent scikit-learn versions raise a ValueError for that combination.
cv = KFold(n_splits=5, shuffle=False)


# Evaluating the model on every fold.
for train_index, test_index in cv.split(X):
    # Printing the row positions that make up the train and test part of this fold.
    print("Train Index: ", train_index)
    print("Test Index: ", test_index, "\n")

    # Selecting the training and testing rows by position.
    # X is the array returned by the scaler; y is a pandas Series, so .iloc is used to make sure
    # the selection is positional regardless of the Series index.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Training on the training rows and recording the accuracy on the held-out rows.
    rf.fit(X_train, y_train)
    scores.append(rf.score(X_test, y_test))

In [None]:
# Showing the accuracy of every fold.
# The scores were already computed and appended inside the cross-validation loop above.
# Fitting and appending again here would count the last fold twice (and once more on every
# re-run of this cell), which would distort the mean accuracy computed afterwards.
for fold_number, fold_score in enumerate(scores, start=1):
    print("Fold %d accuracy: %.5f" % (fold_number, fold_score))

In [None]:
# Averaging the values collected in the "scores" list to get the overall accuracy of the model,
# and printing that average.
overall_accuracy = np.mean(scores)
print(overall_accuracy)

In [None]:
# BONUSES

In [None]:
# Keeping only the customers who subscribed to a term deposit (rows whose "y" value is "yes").
subscribed = df['y'] == "yes"
customers = df.loc[subscribed]


# For every column, printing its single most frequent value among subscribers together with its count.
for column_name in customers.columns:
    top_value = customers[column_name].value_counts().head(1)
    print("Most popular", column_name, "of subscribers and their count: ", top_value, '\n')


# The client could prioritize these customer segments since they are the most popular subscriber features.

In [None]:
# Getting importance values from the fitted Random Forest.
importance = rf.feature_importances_


# The model was trained on the leading columns of the encoded table, so their names are taken
# from there to label each importance value.
feature_names = list(labeled.columns[:len(importance)])


# Printing importance of every feature, with both its position and its column name.
for i, v in enumerate(importance):
    print('Feature(Column) %0d (%s): Score: %.5f' % (i, feature_names[i], v))


# Plotting the feature importance with named ticks, axis labels and a title,
# so the chart can be read without looking up column positions.
positions = range(len(importance))
pyplot.bar(positions, importance)
pyplot.xticks(positions, feature_names, rotation=90)
pyplot.xlabel('Feature')
pyplot.ylabel('Importance')
pyplot.title('Random Forest feature importance')
pyplot.tight_layout()
pyplot.show()


# In the original analysis, feature 11 (duration) had the greatest importance, so this feature could be focused on more than others.