In [1]:
# importing the needed libraries 
import pandas as pd 
import numpy as np 

In [59]:
# loading the dataset 
df = pd.read_excel('FTTH-DataSet.xlsx')
df.head(4)

Unnamed: 0,ID,GOVERNORATE,Customer with orange_MONTHS,CUSTOMER_AGE_MONTHS,CUSTOMER_GENDER,COMMITMENT,COMMITMENT_FG,OF_SPEED,OF_PREV_SPEED,MIGRATION_FLAG,...,LAST_POWER_VALIDATION,LAST_LINK_PRIORITY,Disconnection_TOTAL_MAX_day,Disconnection_TOTAL_MIN_day,Disconnection_TOTAL_SUM_Month,Disconnection_TOTAL_MEAN_Month,GB_TOTAL_CONSUMPTION_Month1,GB_TOTAL_CONSUMPTION_Month2,GB_TOTAL_CONSUMPTION_Month3,TARGET
0,1,West Amman,48.741935,567.677419,M,24,1,200,100.0,y,...,Abnormal,Regular,1.0,1.0,32.0,1.0,645.685532,561.726552,519.477249,0
1,2,West Amman,44.83871,740.580645,M,24,0,100,100.0,y,...,,Regular,1.0,1.0,4.0,1.0,174.360611,159.508825,145.229521,0
2,3,West Amman,44.612903,531.096774,M,24,1,200,100.0,y,...,,Regular,2.0,1.0,8.0,1.333333,299.379466,319.849905,257.353694,0
3,4,Balqa,43.741935,645.612903,M,24,0,200,100.0,y,...,,Regular,1.0,1.0,6.0,1.0,477.543451,791.806873,569.29984,0


In [60]:
# let's perform the needed cleansing: 
# importing the needed libraries
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

# let's define a function that perform the data cleaning steps
def clean(data):
    #lets's drop the columns that we think it is not needed
    data = data.drop(['ID', 'OF_PREV_SPEED', 'LAST_LINK_QUALITY', 'LAST_LINK_STATUS', 'LAST_POWER_VALIDATION', 'LAST_LINK_PRIORITY', 'Disconnection_TOTAL_MAX_day', 'Disconnection_TOTAL_MIN_day', 'Disconnection_TOTAL_SUM_Month', 'Disconnection_TOTAL_MEAN_Month'], axis=1)
    # we are going to fill the missing data with most common value for each categorical and numerical values    
    data = data.apply(lambda x: x.fillna(x.value_counts().index[0]))
    # we need to import preprocessing from scikit-learn library, to use label encoding for handling the categorical values
    label_encoder = preprocessing.LabelEncoder()
    # applying the label encoding for the following 3 features that have categorical values 
    data['GOVERNORATE'] = label_encoder.fit_transform(data['GOVERNORATE'])
    data['CUSTOMER_GENDER'] = label_encoder.fit_transform(data['CUSTOMER_GENDER'])
    data['MIGRATION_FLAG'] = label_encoder.fit_transform(data['MIGRATION_FLAG'])
    # let's define a variable that stores all the columns we are in need to handle its scale
    columns_to_scale = ['GOVERNORATE', 'Customer with orange_MONTHS', 'CUSTOMER_AGE_MONTHS','CUSTOMER_GENDER', 'COMMITMENT', 'COMMITMENT_FG', 'OF_SPEED','MIGRATION_FLAG', 'GB_TOTAL_CONSUMPTION_Month1','GB_TOTAL_CONSUMPTION_Month2', 'GB_TOTAL_CONSUMPTION_Month3', 'TARGET']
    # we are going to use MinMaxScaler to handle the distribution of the data
    # we need to import its library
    # let's define another variable that store the MinMaxScaler function
    scaler = MinMaxScaler()
    # applying scaler to the defined columns
    data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])
    # one-hot encode the categorical features
    ##data = pd.get_dummies(data, columns=['GOVERNORATE', 'CUSTOMER_GENDER', 'MIGRATION_FLAG'])
    # Separate the target variable from the features
    X = data.drop('TARGET', axis=1)
    y = data['TARGET']

    return data

In [61]:
df = clean(df)

In [62]:
df.shape

(94478, 12)

In [63]:
df.head(4)

Unnamed: 0,GOVERNORATE,Customer with orange_MONTHS,CUSTOMER_AGE_MONTHS,CUSTOMER_GENDER,COMMITMENT,COMMITMENT_FG,OF_SPEED,MIGRATION_FLAG,GB_TOTAL_CONSUMPTION_Month1,GB_TOTAL_CONSUMPTION_Month2,GB_TOTAL_CONSUMPTION_Month3,TARGET
0,0.9,0.957929,0.049017,0.5,0.5,1.0,0.175258,1.0,0.062382,0.053547,0.04868,0.0
1,0.9,0.879612,0.063973,0.5,0.5,0.0,0.072165,1.0,0.016846,0.015205,0.013609,0.0
2,0.9,0.875081,0.045853,0.5,0.5,1.0,0.175258,1.0,0.028924,0.03049,0.024116,0.0
3,0.1,0.857605,0.055758,0.5,0.5,0.0,0.175258,1.0,0.046137,0.075479,0.053349,0.0


In [64]:
df.columns

Index(['GOVERNORATE', 'Customer with orange_MONTHS', 'CUSTOMER_AGE_MONTHS',
       'CUSTOMER_GENDER', 'COMMITMENT', 'COMMITMENT_FG', 'OF_SPEED',
       'MIGRATION_FLAG', 'GB_TOTAL_CONSUMPTION_Month1',
       'GB_TOTAL_CONSUMPTION_Month2', 'GB_TOTAL_CONSUMPTION_Month3', 'TARGET'],
      dtype='object')

In [65]:
df.describe

<bound method NDFrame.describe of        GOVERNORATE  Customer with orange_MONTHS  CUSTOMER_AGE_MONTHS  \
0              0.9                     0.957929             0.049017   
1              0.9                     0.879612             0.063973   
2              0.9                     0.875081             0.045853   
3              0.1                     0.857605             0.055758   
4              0.9                     0.813592             0.026882   
...            ...                          ...                  ...   
94473          1.0                     0.003883             0.020790   
94474          0.2                     0.345631             0.043688   
94475          0.9                     0.239482             0.074384   
94476          0.9                     0.170227             0.028824   
94477          0.2                     0.408414             0.058169   

       CUSTOMER_GENDER  COMMITMENT  COMMITMENT_FG  OF_SPEED  MIGRATION_FLAG  \
0                  0.5

In [66]:
df.dtypes

GOVERNORATE                    float64
Customer with orange_MONTHS    float64
CUSTOMER_AGE_MONTHS            float64
CUSTOMER_GENDER                float64
COMMITMENT                     float64
COMMITMENT_FG                  float64
OF_SPEED                       float64
MIGRATION_FLAG                 float64
GB_TOTAL_CONSUMPTION_Month1    float64
GB_TOTAL_CONSUMPTION_Month2    float64
GB_TOTAL_CONSUMPTION_Month3    float64
TARGET                         float64
dtype: object

###### Now we are ready to split the data into three datasets, trian, validate, and test. Splitting the data after the previous steps enusre that no biase could appear , and then we are ready to perform ML step

In [67]:
# importing the needed library
from sklearn.model_selection import train_test_split
# splitting the datasest into 3 datasets: train, validate, and test
# the stratify parameter is used to ensure that the target  is evenly distributed.
train, test = train_test_split(df, test_size=.2, random_state=42, stratify=df['TARGET'])
train, val = train_test_split(train, test_size=.1, random_state=42, stratify=train['TARGET'])

In [68]:
#Let's output the shapes of the three datasets: training, validation, and testing :)
print('Training Dataset Shape is:', train.shape, '\n','Validation Dataset Shape is:' , val.shape,'\n','Testing Dataset Shape is:' , test.shape)

Training Dataset Shape is: (68023, 12) 
 Validation Dataset Shape is: (7559, 12) 
 Testing Dataset Shape is: (18896, 12)


###### Splitting the datasets ensures that no DATA LEAKAGE may occur, PLUS we need to drop the 'TARGET 'column before saving the csv files as follows:

In [69]:
val = val.drop(['TARGET'], axis = 1)
test = test.drop(['TARGET'], axis = 1 )

In [70]:
print(val.shape, test.shape)

(7559, 11) (18896, 11)


In [42]:
#train.to_csv('train.csv', index = False)
#val.to_csv('val.csv', index = False)
#test.to_csv('test.csv', index = False)

In [71]:
train.column

Index(['GOVERNORATE', 'Customer with orange_MONTHS', 'CUSTOMER_AGE_MONTHS',
       'CUSTOMER_GENDER', 'COMMITMENT', 'COMMITMENT_FG', 'OF_SPEED',
       'MIGRATION_FLAG', 'GB_TOTAL_CONSUMPTION_Month1',
       'GB_TOTAL_CONSUMPTION_Month2', 'GB_TOTAL_CONSUMPTION_Month3', 'TARGET'],
      dtype='object')

In [75]:
# importing the needed libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# let's define the y to our prediction trget
y = train['TARGET']
# let's define the X to our features after selection and engineering 
X = train.drop(['TARGET'], axis = 1)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=1)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state = 1)
model.fit(X_train, y_train)

# Make predictions on the validation set
val_predictions = model.predict(X_val)

# Evaluate the performance of the model using various metrics
print("Accuracy:", accuracy_score(y_val, val_predictions))
print("Precision:", precision_score(y_val, val_predictions, zero_division=1))
print("Recall:", recall_score(y_val, val_predictions, zero_division=1))
print("F1-score:", f1_score(y_val, val_predictions, zero_division=1))

# Use cross-validation to get a more accurate estimate of the model's performance
cv_scores = cross_val_score(model, X, y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

Accuracy: 0.8278946006873869
Precision: 0.8248800295311923
Recall: 0.830212513003418
F1-score: 0.8275376809984076
Cross-validation scores: [0.99448732 0.99448732 0.99441382 0.99448692 0.99448692]
Mean cross-validation score: 0.9944724582702127
