In [1]:
# importing needed libraries
import numpy as np 
import pandas as pd

In [2]:
# loading the data
# note that this project can't be reproducable, due to the privacy policy of publishing this dtaset! 
df = pd.read_excel('FTTH-DataSet.xlsx')
df.head(4)

Unnamed: 0,ID,GOVERNORATE,Customer with orange_MONTHS,CUSTOMER_AGE_MONTHS,CUSTOMER_GENDER,COMMITMENT,COMMITMENT_FG,OF_SPEED,OF_PREV_SPEED,MIGRATION_FLAG,...,LAST_POWER_VALIDATION,LAST_LINK_PRIORITY,Disconnection_TOTAL_MAX_day,Disconnection_TOTAL_MIN_day,Disconnection_TOTAL_SUM_Month,Disconnection_TOTAL_MEAN_Month,GB_TOTAL_CONSUMPTION_Month1,GB_TOTAL_CONSUMPTION_Month2,GB_TOTAL_CONSUMPTION_Month3,TARGET
0,1,West Amman,48.741935,567.677419,M,24,1,200,100.0,y,...,Abnormal,Regular,1.0,1.0,32.0,1.0,645.685532,561.726552,519.477249,0
1,2,West Amman,44.83871,740.580645,M,24,0,100,100.0,y,...,,Regular,1.0,1.0,4.0,1.0,174.360611,159.508825,145.229521,0
2,3,West Amman,44.612903,531.096774,M,24,1,200,100.0,y,...,,Regular,2.0,1.0,8.0,1.333333,299.379466,319.849905,257.353694,0
3,4,Balqa,43.741935,645.612903,M,24,0,200,100.0,y,...,,Regular,1.0,1.0,6.0,1.0,477.543451,791.806873,569.29984,0


###### it's a good practice to split the dataset into training, validation, and testing sets before applying any data analysis or modeling techniques. This helps to evaluate the performance of the model on unseen data and avoid overfitting

In [3]:
# importing the needed library
from sklearn.model_selection import train_test_split
# splitting the datasest into 3 datasets: train, validate, and test
# the stratify parameter is used to ensure that the target  is evenly distributed.
train, test = train_test_split(df, test_size=.2, random_state=42, stratify=df['TARGET'])
train, val = train_test_split(train, test_size=.1, random_state=42, stratify=train['TARGET'])

In [4]:
#Let's output the shapes of the three datasets: training, validation, and testing :)
print('Training Dataset Shape is:', train.shape, '\n','Validation Dataset Shape is:' , val.shape,'\n','Testing Dataset Shape is:' , test.shape)

Training Dataset Shape is: (68023, 22) 
 Validation Dataset Shape is: (7559, 22) 
 Testing Dataset Shape is: (18896, 22)


In [5]:
#Let's Save the training, validation, and testing datasets to separate CSV files :)
train.to_csv('train.csv', index=False)
val.to_csv('val.csv', index=False)
test.to_csv('test.csv', index=False)

In [6]:
#Loading the training, validation, and testing datasets from separate CSV files generated previously
train = pd.read_csv('train.csv')
val = pd.read_csv('val.csv')
test = pd.read_csv('test.csv')

In [9]:
# importing the needed libraries
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
# let's define a function that perform the data cleaning steps
def clean (data):
    #lets's drop the columns that we think it is not needed
    data = data.drop(['ID', 'OF_PREV_SPEED', 'LAST_LINK_QUALITY', 'LAST_LINK_STATUS', 'LAST_POWER_VALIDATION', 'LAST_LINK_PRIORITY', 'Disconnection_TOTAL_MAX_day', 'Disconnection_TOTAL_MIN_day', 'Disconnection_TOTAL_SUM_Month', 'Disconnection_TOTAL_MEAN_Month'], axis=1)
    # we are going to fill the missing data with most common value for each categorical and numerical values    
    data =data.apply(lambda x: x.fillna(x.value_counts().index[0]))
    # we need to import preprocessing fron sciket learn library, to use label encoding for handling the categorical values
    label_encoder = preprocessing.LabelEncoder()
    # applying the label encoding for the follwing 3 features that have categorical values 
    data['GOVERNORATE']= label_encoder.fit_transform(data['GOVERNORATE'])
    data['CUSTOMER_GENDER']= label_encoder.fit_transform(data['CUSTOMER_GENDER'])
    data['MIGRATION_FLAG']= label_encoder.fit_transform(data['MIGRATION_FLAG'])
    # let's define a variable that stores all the columns we are in need to handle its scale
    columns_to_scale = ['GOVERNORATE', 'Customer with orange_MONTHS', 'CUSTOMER_AGE_MONTHS','CUSTOMER_GENDER', 'COMMITMENT', 'COMMITMENT_FG', 'OF_SPEED','MIGRATION_FLAG', 'GB_TOTAL_CONSUMPTION_Month1','GB_TOTAL_CONSUMPTION_Month2', 'GB_TOTAL_CONSUMPTION_Month3', 'TARGET']
    # we are going to use MinMaxScaler to handle the distribution of the data
    # we need to import its library
    # let's define another varabile that store the MinMaxScaler function
    scaler = MinMaxScaler()
    # applying scaler to the defined columns
    data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])
    return data

In [10]:
# let's apply the cleaning to the all of the 3 datasets each separatly :)
train = clean(train)
val = clean(val)
test = clean(test)

In [11]:
# importing needed libraries for Random Forest Algorithim
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def rf (data):
    # let's strt to deal with the IMBALANCE we have
    # the following code counts the number of churned customers(class 1), and the number of un-churned custoemrs( class 0 )
    count_class_0, count_class_1 = data.TARGET.value_counts()
    data_class_0 = data[data['TARGET'] == 0]
    data_class_1 = data[data['TARGET'] == 1]
    # we are going to apply the oversampling technique to class 1 in order to deal with the Imbalance
    data_class_1_over = data_class_1.sample(count_class_0, replace=True) # replace = True , to prevent duplicated outputs
    data_test_over = pd.concat([data_class_0, data_class_1_over], axis=0)
    # let's define  X and y 
    X = data_test_over.drop('TARGET',axis= 1)
    y = data_test_over['TARGET']
    # let's split the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y )
    model = RandomForestClassifier().fit(X_train, y_train)
    # let's predict the target
    y_pred = model.predict(X_test)
    # finally, let's output the f1-score , we used it because of the imbalance case of the dataset
    f1 = f1_score(y_test, y_pred)
    return f1

In [12]:
rf(train)

0.99996304371928

In [13]:
rf(val)

1.0

In [14]:
rf(test)

1.0

###### We can  use the same custom functions defined to predict any other models , by redefining the model variable and importing the needed libraries first.

###### you can find more repositories in my growing up GitHub Profile :) 
https://github.com/laithrasheed