# INTRO:The purpose of this model is to analyze subscribers' behavior and recommend one of Megaline's newer plans: Smart or Ultra. The accuracy must be above 0.75.


In [1]:
#import data to start model

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
from sklearn.model_selection import train_test_split


In [3]:
usersdf = pd.read_csv('users_behavior.csv')


In [4]:
#familiarize yourself with the data

usersdf.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calls     3214 non-null   float64
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   float64
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 125.7 KB


In [5]:
usersdf.head()

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40.0,311.9,83.0,19915.42,0
1,85.0,516.75,56.0,22696.96,0
2,77.0,467.66,86.0,21060.45,0
3,106.0,745.53,81.0,8437.39,1
4,66.0,418.74,1.0,14502.75,0


In [6]:
#find missing data
missing_is_ultra = usersdf['is_ultra'].isnull().sum()
print(f"Number of missing values in 'is_ultra': {missing_is_ultra}")


Number of missing values in 'is_ultra': 0


In [7]:
#find missing data in entire df

missing_values = usersdf.isnull().sum()
print(missing_values)

calls       0
minutes     0
messages    0
mb_used     0
is_ultra    0
dtype: int64


In [8]:
# statiscal data for behavior

usersdf.describe()

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
count,3214.0,3214.0,3214.0,3214.0,3214.0
mean,63.038892,438.208787,38.281269,17207.673836,0.306472
std,33.236368,234.569872,36.148326,7570.968246,0.4611
min,0.0,0.0,0.0,0.0,0.0
25%,40.0,274.575,9.0,12491.9025,0.0
50%,62.0,430.6,30.0,16943.235,0.0
75%,82.0,571.9275,57.0,21424.7,1.0
max,244.0,1632.06,224.0,49745.73,1.0


In [9]:
#find missing data
missing_is_ultra = usersdf['is_ultra'].isnull().sum()
print(f"Number of missing values in 'is_ultra': {missing_is_ultra}")


Number of missing values in 'is_ultra': 0


In [10]:
#find missing data in entire df

missing_values = usersdf.isnull().sum()
print(missing_values)

calls       0
minutes     0
messages    0
mb_used     0
is_ultra    0
dtype: int64


In [11]:




# Splitting features and target variable
X = usersdf.drop('is_ultra', axis=1)  # Features
y = usersdf['is_ultra']  # Target

# First split: Train (75%) and Temp (25%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.25, random_state=12345)




In [12]:
# Second split: Temp (25%) into Validation (50% of Temp) and Test (50% of Temp)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=12345)



In [13]:
# Display the sizes of each split
print("Training Set Size:", len(X_train))
print("Validation Set Size:", len(X_val))
print("Test Set Size:", len(X_test))

Training Set Size: 2410
Validation Set Size: 402
Test Set Size: 402


# 3. Investigate the quality of different models by changing hyperparameters. Briefly describe the findings of the study.


In [14]:
log_reg = LogisticRegression(random_state=12345, solver='liblinear', max_iter=1000)




In [15]:
# Train the model on the training set
log_reg.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = log_reg.predict(X_val)


In [16]:


# Evaluate the model
accuracy1 = accuracy_score(y_val, y_val_pred)


# Print results
print("Accuracy:", accuracy1)

Accuracy: 0.7661691542288557


In [17]:
# Initialize results dictionary
results = {}

In [18]:
def evaluate_model(model, X_val, y_val):
    y_val_pred = model.predict(X_val)
    accuracy= accuracy_score(y_val, y_val_pred)
  
    return {"Accuracy": accuracy}


In [19]:
# 2. Random Forest
rf_params = {'n_estimators': [50, 100, 200],
             'max_depth': [5, 10, None],
             'min_samples_split': [2, 5, 10]}
rf_grid = GridSearchCV(RandomForestClassifier(random_state=12345), rf_params, cv=3, scoring='accuracy')
rf_grid.fit(X_train, y_train)

# Evaluate Random Forest
results['Random Forest'] = evaluate_model(rf_grid.best_estimator_, X_val, y_val)

In [20]:
print(f"Random Forest: {rf_grid.best_params_}")

Random Forest: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 50}


In [21]:
results

{'Random Forest': {'Accuracy': 0.8109452736318408}}

# Random Forest Accuracy  0.8109452736318408
# Logistic Regression Accuracy: 0.7761194029850746

In [22]:
# Evaluate the best models on the test set
final_results = {}



In [23]:
# 2. Random Forest
final_model = RandomForestClassifier(max_depth=10, min_samples_split=5, n_estimators=50)
final_model.fit(X_train, y_train)

# Evaluate Random Forest
results['Random Forest'] = evaluate_model(rf_grid.best_estimator_, X_test, y_test)



In [26]:
# Evaluate the best models on the test set
final_results = {}


In [27]:

# Random Forest
final_results['Random Forest'] = evaluate_model(rf_grid.best_estimator_, X_test, y_test)



# Display Final Results on Test Set
print("\nTest Set Evaluation:")
for model_name, metrics in final_results.items():
    print(f"\n{model_name}:")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value:.4f}")


Test Set Evaluation:

Random Forest:
Accuracy: 0.7985


# We see our test results are slightly different from the training model.Our Logistic regression model is gicing us an accuracy of .7438 while our Random Forest is giving us a accuracy of 0.7985.


# sanity check the model

In [62]:
# Train set performance
y_train_pred = log_reg.predict(X_train)

train_accuracy = accuracy_score(y_train, y_train_pred)



In [63]:
# Validation set performance
y_val_pred = log_reg.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)



In [64]:
# Test set performance
y_test_pred = log_reg.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Check for overfitting
if train_accuracy - test_accuracy > 0.1:
    print("⚠️ Warning: Possible overfitting detected! The model performs much better on training data than test data.")

Training Accuracy: 0.7469
Validation Accuracy: 0.7761
Test Accuracy: 0.7438


# Our sanity check passed with the accuracies being inside of 10% from each other.

# Sanity check for the Random forest model

In [65]:
# Validation set performance
y_val_pred = rf_grid.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

# Test set performance
y_test_pred = rf_grid.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Check for overfitting
if train_accuracy - test_accuracy > 0.1:
    print("⚠️ Warning: Possible overfitting detected! The model performs much better on training data than test data.")

Training Accuracy: 0.7469
Validation Accuracy: 0.8109
Test Accuracy: 0.7985


# Random forest model passes the sanity check with the difference in validation and test being under 10%. I recommend using the Random forest as it has a higher percentage of accuracy.

# Conclusion:Model Performance:The model achieved a training accuracy of 75%, a validation accuracy of 81%, and a test accuracy of 79%.The model achieved a training accuracy of 75%, a validation accuracy of 81%.The model’s test performance is comparable to the validation performance, suggesting that it generalizes well to unseen data. It seems like overfitting is not an issue. Taking that into consideration I would deploy this model to help phone plan users.
  