# The objective is to develop a model that analyzes Megaline subscriber behavior and recommends either the Smart or the Ultra plan.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

## Loading Data

In [2]:
# Load the subscriber-behaviour table; the target column is read as a categorical.
DATA_PATH = '../Sprint_11/users_behavior.csv'
df = pd.read_csv(DATA_PATH, dtype={'is_ultra': 'category'})
df.head()

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40.0,311.9,83.0,19915.42,0
1,85.0,516.75,56.0,22696.96,0
2,77.0,467.66,86.0,21060.45,0
3,106.0,745.53,81.0,8437.39,1
4,66.0,418.74,1.0,14502.75,0


## Preprocessing

In [3]:
# Show column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   calls     3214 non-null   float64 
 1   minutes   3214 non-null   float64 
 2   messages  3214 non-null   float64 
 3   mb_used   3214 non-null   float64 
 4   is_ultra  3214 non-null   category
dtypes: category(1), float64(4)
memory usage: 103.7 KB


In [4]:
# Count missing values per column.
df.isna().sum()

calls       0
minutes     0
messages    0
mb_used     0
is_ultra    0
dtype: int64

In [5]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

## Let's start with a DecisionTreeClassifier model

In [6]:
# Separate the feature matrix (every column except the target) from the target vector.
target_col = 'is_ultra'
X = df.drop(columns=[target_col])
y = df[target_col]

In [7]:
# Two-stage hold-out split with a shared seed: first set aside 20% of the rows as the
# test set, then take 20% of the remainder as the validation set (about 64/16/20 overall).
split_options = {'test_size': 0.2, 'random_state': 12345}

X1, X_test, y1, y_test = train_test_split(X, y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X1, y1, **split_options)

In [8]:
# Report each split's share of the full feature matrix, as a percentage of its row count.
for split_name, split in (('X_train', X_train), ('X_val', X_val), ('X_test', X_test)):
    share = round(split.shape[0] / X.shape[0] * 100, 2)
    print(split_name, 'is :', share, '% of X')

X_train is : 63.97 % of X
X_val is : 16.02 % of X
X_test is : 20.01 % of X


In [9]:
# Decision tree with default hyperparameters; the seed is fixed so reruns give the same tree.
model = DecisionTreeClassifier(random_state=12345)

In [10]:
# Fit the tree on the training split.
model.fit(X_train, y_train)

DecisionTreeClassifier(random_state=12345)

In [11]:
# Predict plan labels for the validation split.
# NOTE(review): y_pred is not referenced by any later cell shown in this notebook.
y_pred = model.predict(X_val)

In [12]:
# Validation accuracy of the fitted tree, computed from its predictions on the validation split.
acc = accuracy_score(y_val, model.predict(X_val))
print('Accuracy :', acc)

Accuracy : 0.7106796116504854


## Does the amount of mb_used determine the plan?

In [13]:
# Summary statistics (count, mean, std, quartiles, min/max) of the mb_used feature.
df['mb_used'].describe()

count     3214.000000
mean     17207.673836
std       7570.968246
min          0.000000
25%      12491.902500
50%      16943.235000
75%      21424.700000
max      49745.730000
Name: mb_used, dtype: float64

In [14]:
# Class balance of the target: number of rows per is_ultra label.
df['is_ultra'].value_counts()

0    2229
1     985
Name: is_ultra, dtype: int64

In [16]:
# Sweep the tree depth from 1 to 5 and print the validation accuracy for each setting.
for depth in range(1, 6):
    model = DecisionTreeClassifier(max_depth=depth, random_state=12345).fit(X_train, y_train)
    predictions_valid = model.predict(X_val)
    depth_accuracy = accuracy_score(y_val, predictions_valid)
    print("max_depth =", depth, ": ", end='')
    print(depth_accuracy)

max_depth = 1 : 0.7223300970873786
max_depth = 2 : 0.7475728155339806
max_depth = 3 : 0.7553398058252427
max_depth = 4 : 0.7533980582524272
max_depth = 5 : 0.7572815533980582


## Random Forest Classifier:

In [None]:
# Search n_estimators over 1..10 and keep the forest size with the highest validation accuracy.
# Uses the X_train/y_train and X_val/y_val splits created by the train_test_split cell;
# names such as features_train / target_valid are not defined anywhere in this notebook.
best_score = 0
best_est = 0

for est in range(1, 11):
    RFC_model = RandomForestClassifier(random_state=12345, n_estimators=est)

    RFC_model.fit(X_train, y_train)

    score = RFC_model.score(X_val, y_val)

    # Strict ">" means that on a tie the smaller forest found first is kept.
    if score > best_score:
        best_score = score
        best_est = est

print("Accuracy of the best model on the validation set (n_estimators = {}):{}".format(best_est, best_score))

In [None]:
# Pick the forest size (1..10 trees) that scores best on the validation split.
val_best_score = 0
val_best_est = 0

for est in range(1, 11):
    val_RFC_model = RandomForestClassifier(random_state=12345, n_estimators=est)
    val_RFC_model.fit(X_train, y_train)
    val_score = val_RFC_model.score(X_val, y_val)

    # Strict ">" means that on a tie the smaller forest found first is kept.
    if val_score > val_best_score:
        val_best_score = val_score
        val_best_est = est

# The ": " separator keeps the estimator count and the accuracy from running together in the output.
print("Accuracy of best model on validation set (n_estimators = {}): {}".format(val_best_est, val_best_score))

In [None]:
# Final check: fit a forest on the training split and score it on the held-out test split.
# NOTE(review): n_estimators=7 is hard-coded — presumably the winner of the validation
# search; confirm it matches val_best_est before trusting the reported accuracy.
test_RFC = RandomForestClassifier(n_estimators=7, random_state=12345).fit(X_train, y_train)
RFC_score = test_RFC.score(X_test, y_test)

print("Accuracy of the Random Forest Classifier model using 7 estimators: {}".format(RFC_score))

## Conclusion: we can predict with 77% accuracy which plan (Smart or Ultra) a new subscriber should be on.