# Analysing Bananas telecom

In [None]:
# Read the Bananas telecom workbook from GitHub into a DataFrame,
# then show its dimensions and a preview of the first rows.
import pandas as pd

url = 'https://raw.githubusercontent.com/ccolon/lepont/main/Bananas_telecom.xlsx'
df = pd.read_excel(io=url)
print(df.shape)
df.head(n=5)

# Profiling

Let's do some quick profiling

In [None]:
# Compare the table dimensions with the number of distinct client ids.
print(df.shape)
n_unique_clients = df['Client_id'].nunique()
print(f"Number of unique client ids {n_unique_clients}")

In [None]:
# List the distinct values found in the handset column.
handset_values = df['Most frequently used handset 1st month'].unique()
print(f"Unique values for Most frequently used handset: {handset_values}")

In [None]:
# Count how many rows carry each value of the Non_payers column (the label predicted later).
df['Non_payers'].value_counts()

A useful module/library to make quick and interactive plots is **plotly.express**.

Doing plots is also part of profiling: it helps us **understand the data**

In [None]:
# Box plot: distribution of Final_Rate_Plan_Price for each Non_payers value.
import plotly.express as px

fig = px.box(
    df,
    x="Non_payers",
    y="Final_Rate_Plan_Price",
)
fig.show()

In [None]:
# Scatter: Terminal_Price against Final_Rate_Plan_Price, coloured by Non_payers.
fig = px.scatter(
    df,
    x="Terminal_Price",
    y="Final_Rate_Plan_Price",
    color="Non_payers",
)
fig.show()

In [None]:
# Scatter: Age against Final_Rate_Plan_Price, coloured by Non_payers.
fig = px.scatter(
    df,
    x="Age",
    y="Final_Rate_Plan_Price",
    color="Non_payers",
)
fig.show()

In [None]:
# Mean Terminal_Price for every (Non_payers, Activation_Channel) pair,
# drawn as one closed polar line per Non_payers value.
terminal_price_groups = df.groupby(["Non_payers", "Activation_Channel"], as_index=False)
av_terminal_price_per_cat = terminal_price_groups['Terminal_Price'].mean()
fig = px.line_polar(
    av_terminal_price_per_cat,
    r="Terminal_Price",
    theta="Activation_Channel",
    color="Non_payers",
    line_close=True,
    template="plotly_dark",
)
fig.show()

In [None]:
# Mean Age for every (Non_payers, Activation_Channel) pair,
# drawn as one closed polar line per Non_payers value.
age_groups = df.groupby(["Non_payers", "Activation_Channel"], as_index=False)
av_age_per_cat = age_groups['Age'].mean()
fig = px.line_polar(
    av_age_per_cat,
    r="Age",
    theta="Activation_Channel",
    color="Non_payers",
    line_close=True,
    template="plotly_dark",
)
fig.show()

Do we have missing values? This is relevant for machine learning

In [None]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

Let's see 5 randomly chosen rows with missing values

In [None]:
# Flag rows holding at least one missing value, then draw 5 of them at random.
row_has_missing = df.isna().any(axis=1)
df.loc[row_has_missing].sample(n=5)

In [None]:
# (rows, columns) of the full table, for comparison with the shape after dropping rows.
df.shape

For machine learning, we use **sklearn**

We will first try a simple strategy: we will drop ALL rows with missing values.
In practice, you might work with a small selection of columns: the column that you want to predict, and the columns that have predictive power.

In [None]:
# Rough missing-value handling: discard every row that contains at least one NaN,
# then renumber the remaining rows from 0.
rows_without_missing = df.dropna()
df_red = rows_without_missing.reset_index(drop=True)
print(df_red.shape)

In [None]:
# Label vector (y): the Non_payers column stored as a categorical
target = df_red['Non_payers'].astype('category')

# Feature matrix (X): every column except the label and the client identifier
non_feature_columns = ['Non_payers', "Client_id"]
predictors = df_red.drop(columns=non_feature_columns)


In [None]:
# Preview the first 5 rows of the feature table.
predictors.head(5)

In [None]:
# Distinct values of Final_Rate_Plan among the rows kept in df_red.
df_red['Final_Rate_Plan'].unique()

In [None]:
# Display the label series (categorical Non_payers values).
target

In [None]:
# Replace `predictors` with the output of pd.get_dummies (indicator/dummy-variable encoding).
# NOTE: the name is rebound here, so cells above show the table before encoding
# and cells below show it after.
predictors = pd.get_dummies(predictors)

In [None]:
# Preview the first 5 rows of `predictors` after the get_dummies step.
predictors.head(5)

In [None]:
# Splitting arrays or matrices into random train and test subsets
from sklearn.model_selection import train_test_split
# i.e. 70 % training dataset and 30 % test dataset.
# random_state pins the shuffle so that a fresh "Restart & Run All" reproduces
# the same split, and therefore the same metrics, on every run.
# (For an imbalanced label, also consider passing stratify=target.)
X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.30, random_state=42)

In [None]:
# Preview the first 5 rows of the training features.
X_train.head(5)

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# (rows, columns) of the full feature table, to compare with the train/test shapes.
predictors.shape

In [None]:
# (rows, columns) of the held-out test features.
X_test.shape

In [None]:
# creating a RF classifier
from sklearn.ensemble import RandomForestClassifier
# random_state fixes the forest's internal randomness so that the fitted model
# and its scores are reproducible from one run to the next.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training split only
clf.fit(X_train, y_train)
# Predict the label for every row of the held-out test split
y_pred = clf.predict(X_test)

In [None]:
# sklearn.metrics supplies the scoring functions used below
from sklearn import metrics

# Share of test rows whose predicted label equals the true label
model_accuracy = metrics.accuracy_score(y_test, y_pred)
print("ACCURACY OF THE MODEL: ", model_accuracy)

# Precision computed with "non payer" passed as the positive label
model_precision = metrics.precision_score(y_test, y_pred, pos_label="non payer")
print("PRECISION OF THE MODEL: ", model_precision)

In [None]:
# Print sklearn's text classification report for the test-set predictions.
print(metrics.classification_report(y_test, y_pred))

In [None]:
# using metrics module for confusion matrix
# Pin the row/column order to the classifier's own class list so that the plot
# can be annotated with the real class names instead of anonymous 0/1 ticks.
class_labels = clf.classes_
confusion_matrix = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)

# plot it (rows = true label, columns = predicted label)
metrics.ConfusionMatrixDisplay(confusion_matrix, display_labels=class_labels).plot()

![Illustration of a type 1 error (false positive)](https://i0.wp.com/dataaspirant.com/wp-content/uploads/2020/08/5_type_1_error.png?w=500&ssl=1)
![Illustration of a type 2 error (false negative)](https://i2.wp.com/dataaspirant.com/wp-content/uploads/2020/08/6_type_2_error.png?w=500&ssl=1)

Can we understand which features are predictive?

In [None]:
from sklearn.feature_selection import SelectFromModel

In [None]:
# Wrap a second random forest in SelectFromModel so that its feature
# importances can be inspected after fitting.
# random_state keeps the importances stable between runs; without it the set of
# features passing a fixed importance cut-off can change on every execution.
sel = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))

# Fit the selector (and its inner forest) on the training split only
sel.fit(X_train, y_train)

In [None]:
# Put feature names and importances side by side in a tidy table so the plot
# gets a named y-axis and each point shows its feature name on hover.
importance_table = pd.DataFrame({
    "feature": predictors.columns,
    "importance": sel.estimator_.feature_importances_,
})
fig = px.scatter(importance_table, y="importance", hover_name="feature",
                 title="Random forest feature importances")
fig.show()

In [None]:
# Keep the column names whose importance in the fitted forest is above 0.02.
importance_values = sel.estimator_.feature_importances_
selected_features = predictors.columns[importance_values > 0.02]

In [None]:
# Display the feature names that passed the importance cut-off.
selected_features

In [None]:
# List the original column names of the raw table.
df.columns

In [None]:
# Hand-picked subset of raw columns for the second experiment;
# the last entry, 'Non_payers', is the label.
columns_selection = [
    'Total revenues 1st month',
    'Cost_Of_Goods_Sold',
    'Age',
    'Terminal_Price',
    'Duration of data usage 1st month',
    'SMS sent 1st month',
    'Number of calls received 1st month',
    'Number of calls made 1st month',
    'Final_Rate_Plan_Price',
    'Subsidy_pct',
    'Activation_Channel',
    'District',
    'Most frequently used handset 1st month',
    'Final_Rate_Plan',
    'Non_payers',
]

In [None]:
# Restrict the raw table to the chosen columns first, then drop rows that
# still contain a missing value in any of those columns.
selected_columns_view = df[columns_selection]
df_red2 = selected_columns_view.dropna()

In [None]:
# Build the second experiment's inputs from df_red2 (the reduced column
# selection). Using df_red / predictors here would silently reproduce the
# first experiment and ignore the column selection.

# Generate target (y)
target2 = df_red2['Non_payers'].astype('category')

# Generate predictors (X): every selected column except the label, with
# pd.get_dummies applied to the result
predictors2 = df_red2.drop(columns=['Non_payers'])
predictors2 = pd.get_dummies(predictors2)

In [None]:
# Split the second experiment's data 70 % / 30 %; random_state makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    predictors2, target2, test_size=0.30, random_state=42)

# Retrain and re-predict on the new split. Without this step y_pred would still
# hold the predictions made for the previous test set, and any metric computed
# against the new y_test would compare unrelated rows.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Share of test rows whose predicted label equals the true label
model_accuracy = metrics.accuracy_score(y_test, y_pred)
print("ACCURACY OF THE MODEL: ", model_accuracy)

# Precision computed with "non payer" passed as the positive label
model_precision = metrics.precision_score(y_test, y_pred, pos_label="non payer")
print("PRECISION OF THE MODEL: ", model_precision)

In [None]:
# using metrics module for confusion matrix
# Pin the row/column order to the classifier's own class list so that the plot
# can be annotated with the real class names instead of anonymous 0/1 ticks.
class_labels = clf.classes_
confusion_matrix = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)

# plot it (rows = true label, columns = predicted label)
metrics.ConfusionMatrixDisplay(confusion_matrix, display_labels=class_labels).plot()