# || Import Library ||

In [345]:
import glob
import random
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# || Supporting Methods ||

In [346]:
def get_utilization_result(current_row):
    """Label one row of utilisation readings.

    Parameters
    ----------
    current_row : pandas.Series
        The readings of a single row. Any object exposing a Series-style
        ``between(left, right)`` whose result has ``.all()`` works.

    Returns
    -------
    int
        ``1``  when every reading lies in the inclusive range 90..100,
        ``-1`` when every reading lies in the inclusive range 1..49,
        ``0``  otherwise.
        When the readings cannot be compared (``between`` raises an ordinary
        exception, e.g. because the row mixes numbers with text), a label is
        drawn at random from ``[1, -1, 0]``.
    """
    try:
        if current_row.between(90, 100).all():
            return 1
        if current_row.between(1, 49).all():
            return -1
        return 0
    except Exception:
        # Best-effort fallback for rows that cannot be compared numerically.
        # ``Exception`` (instead of a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate, so a long run can still be interrupted.
        return random.choice([1, -1, 0])


In [347]:
# Names of the 20 reading columns that make up one sample row.
new_cols = ["Iteration%s" % x for x in range(1, 21)]


def generate_input_file(smart_manager_path, seed=None):
    """Build ``sample-data/sample_input.csv`` from recorded and synthetic rows.

    Steps:
      1. Read every ``*.csv`` under ``<smart_manager_path>/sample-data``
         (except the output file itself). The first column of each file is
         skipped; every other column is cut into rows of ``len(new_cols)``
         consecutive values.
      2. Append synthetic rows: for each ``(low, high, count)`` band, ``count``
         rows of random integers in ``[low, high)`` plus 300 rows drawn from
         the same integers mixed with letters, "_", " " and ``None``.
      3. Shuffle all rows, label each with ``get_utilization_result`` in a new
         ``utilisation`` column, and write the CSV without the index.

    Parameters
    ----------
    smart_manager_path : str
        Directory that contains the ``sample-data`` folder.
    seed : int, optional
        When given, seeds both the ``random`` module and NumPy's global
        generator (this mutates global RNG state) so that the same input
        files produce the same output file. The default ``None`` keeps the
        previous unseeded behaviour.

    Raises
    ------
    ValueError
        If a source column's length is not a multiple of ``len(new_cols)``
        (raised by ``reshape``).
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    width = len(new_cols)
    final_file = os.path.join(smart_manager_path, "sample-data/sample_input.csv")
    # Sorted so the pre-shuffle row order does not depend on the OS listing order.
    all_files = sorted(
        path
        for path in glob.glob(os.path.join(smart_manager_path, "sample-data/*.csv"))
        if path != final_file
    )

    # NOTE(review): frames are appended one at a time, exactly as before.
    # Collapsing this into a single pd.concat may change dtype upcasting and
    # therefore how values are rendered in the CSV - verify before changing.
    new_df = pd.DataFrame(columns=new_cols)
    for each_file_path in all_files:
        source_df = pd.read_csv(each_file_path)
        for each_col in list(source_df.columns)[1:]:
            tmp_df = pd.DataFrame(source_df[each_col].values.reshape(-1, width), columns=new_cols)
            new_df = pd.concat([new_df, tmp_df])

    # (lower bound, exclusive upper bound, number of clean rows) per synthetic band.
    # NOTE(review): the last band reaches 109, i.e. it produces readings above
    # 100 - presumably on purpose, to exercise the later "invalid" filter.
    range_specs = [(90, 101, 1500), (50, 90, 1003), (1, 50, 1200), (35, 65, 900), (75, 99, 1700), (0, 110, 937)]
    garbage_rows = 300
    for lower_limit, upper_limit, no_of_records in range_specs:
        tmp_df = pd.DataFrame(
            np.random.randint(lower_limit, upper_limit, size=(no_of_records, width)),
            columns=new_cols,
        )
        new_df = pd.concat([new_df, tmp_df])
        # Same value range, polluted with letters, "_", " " and None.
        mixed_choice = list(range(lower_limit, upper_limit)) + list("GAR_B AGE") + [None, " "]
        tmp_df2 = pd.DataFrame(np.random.choice(mixed_choice, size=(garbage_rows, width)), columns=new_cols)
        new_df = pd.concat([new_df, tmp_df2])

    new_df = new_df.sample(frac=1)  # shuffle all rows
    new_df['utilisation'] = new_df.apply(get_utilization_result, axis=1)
    new_df.to_csv(final_file, index=False)
    print("Writing Completed")

# generate_input_file(smart_manager_path="/content/drive/MyDrive/smart_manager")

Writing Completed


# || Analyzing Data || Read the Exported CSV file ||

In [348]:
# Hard-coded absolute path of the CSV to analyse
# (presumably the file written by generate_input_file - confirm).
final_file = '/content/drive/MyDrive/smart_manager/sample-data/sample_input.csv'
df = pd.read_csv(final_file)
# Bare last expression: render the loaded frame as the cell output.
df

Unnamed: 0,Iteration1,Iteration2,Iteration3,Iteration4,Iteration5,Iteration6,Iteration7,Iteration8,Iteration9,Iteration10,Iteration11,Iteration12,Iteration13,Iteration14,Iteration15,Iteration16,Iteration17,Iteration18,Iteration19,Iteration20,utilisation
0,52,68,20,85,55,98,100,52,96,54,44,35,39,62,92,80,109,98,31,77,0
1,95.0,95.0,97.0,90.0,90.0,93.0,90.0,90.0,95.0,91.0,93.0,99.0,100.0,100.0,92.0,97.0,94.0,91.0,92.0,93.0,1
2,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,-1
3,6.2,5.1,5.1,5.0,5.6,6.1,7.4,5.1,4.7,5.1,5.5,5.1,8.0,5.0,5.1,5.1,5.2,5.2,7.9,4.9,-1
4,23,10,17,36,45,73,41,56,9,23,,51,24,G,100,40,44,3,36,65,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12635,45,44,39,38,63,45,51,43,55,46,51,60,63,50,43,47,62,50,45,47,0
12636,9,38,2,32,49,44,39,36,21,9,19,7,36,20,43,15,23,8,47,8,-1
12637,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,0
12638,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,54.9,0


# || Data Wrangling: clean the data set by removing null, underscore [ _ ] and empty values ||

In [349]:
# Print the column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12640 entries, 0 to 12639
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Iteration1   12595 non-null  object
 1   Iteration2   12601 non-null  object
 2   Iteration3   12605 non-null  object
 3   Iteration4   12594 non-null  object
 4   Iteration5   12590 non-null  object
 5   Iteration6   12602 non-null  object
 6   Iteration7   12593 non-null  object
 7   Iteration8   12597 non-null  object
 8   Iteration9   12599 non-null  object
 9   Iteration10  12590 non-null  object
 10  Iteration11  12599 non-null  object
 11  Iteration12  12601 non-null  object
 12  Iteration13  12584 non-null  object
 13  Iteration14  12596 non-null  object
 14  Iteration15  12594 non-null  object
 15  Iteration16  12596 non-null  object
 16  Iteration17  12598 non-null  object
 17  Iteration18  12589 non-null  object
 18  Iteration19  12588 non-null  object
 19  Iteration20  12601 non-nu

In [350]:
print("Before Cleanups")
# Number of missing (NaN) entries per column; shown as the cell output.
df.isnull().sum()

Before Cleanups


Iteration1     45
Iteration2     39
Iteration3     35
Iteration4     46
Iteration5     50
Iteration6     38
Iteration7     47
Iteration8     43
Iteration9     41
Iteration10    50
Iteration11    41
Iteration12    39
Iteration13    56
Iteration14    44
Iteration15    46
Iteration16    44
Iteration17    42
Iteration18    51
Iteration19    52
Iteration20    39
utilisation     0
dtype: int64

# || DROP Rows containing SPACE, EMPTY-STRINGS, UNDERSCORE, GARBAGE-CHARACTERS ||


### UNDERSCORE DROPS

In [351]:
# Remember the starting row count so the number of removed rows can be reported.
old = int(df.shape[0])
print("Rows Before UNDERSCORE Drops: %s" % old)
# A row is discarded when any iteration column holds exactly the string "_".
has_underscore = df[new_cols].eq("_").any(axis=1)
df = df[~has_underscore]
print("Rows After UNDERSCORE Drops: %s" % df.shape[0])
print("Total Rows Dropped with UNDERSCORE: %s" % int(old - df.shape[0]))

Rows Before UNDERSCORE Drops: 12640
Rows After UNDERSCORE Drops: 11969
Total Rows Dropped with UNDERSCORE: 671


### SPACE DROPS

In [352]:
# Remember the starting row count so the number of removed rows can be reported.
old = int(df.shape[0])
print("Rows Before SPACE Drops: %s" % old)
# A row is discarded when any iteration column holds exactly a single space " ".
has_space = df[new_cols].eq(" ").any(axis=1)
df = df[~has_space]
print("Rows After SPACE Drops: %s" % df.shape[0])
print("Total Rows Dropped with SPACE: %s" % int(old - df.shape[0]))

Rows Before SPACE Drops: 11969
Rows After SPACE Drops: 11323
Total Rows Dropped with SPACE: 646


### NaN DROPS

In [353]:
# Preview only: despite the printed message, this cell removes nothing.
# It displays the rows that still contain at least one NaN.
print("Dropping NaN")
df[pd.isnull(df).any(axis=1)]

Dropping NaN


Unnamed: 0,Iteration1,Iteration2,Iteration3,Iteration4,Iteration5,Iteration6,Iteration7,Iteration8,Iteration9,Iteration10,Iteration11,Iteration12,Iteration13,Iteration14,Iteration15,Iteration16,Iteration17,Iteration18,Iteration19,Iteration20,utilisation
34,73,80,63,56,50,50,G,54,53,55,87,56,50,78,74,55,,73,,76,0
45,38,,59,55,43,36,59,47,59,39,57,45,49,51,37,53,59,59,44,53,0
50,91,74,54,100,,38,55,32,36,82,18,32,29,24,73,38,55,2,25,71,0
93,12,44,26,81,,90,84,104,3,6,49,79,55,97,53,57,31,26,95,26,0
104,98,95,91,,96,96,E,92,94,98,93,99,E,G,100,A,95,A,G,G,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12213,41,40,48,43,45,,R,A,59,56,R,46,45,47,53,A,52,A,63,59,0
12221,36,8,,62,106,17,46,87,31,40,41,57,60,93,66,1,23,90,80,21,0
12473,88,79,96,95,G,88,88,78,92,88,A,77,G,A,91,84,95,,81,97,-1
12613,32,A,R,,46,45,27,25,46,24,38,17,18,44,E,47,46,R,19,B,-1


In [354]:
# Remember the starting row count so the number of removed rows can be reported.
old = int(df.shape[0])
print("Rows Before NaN Drops: %s" % df.shape[0])
# Drop every row with a NaN in any iteration column.
# Rebinding replaces ``inplace=True``: ``df`` is itself the result of earlier
# boolean filtering, and mutating such a frame in place may raise pandas'
# SettingWithCopyWarning. The resulting frame is the same.
df = df.dropna(subset=new_cols, axis=0)
print("Rows After NaN Drops: %s" % df.shape[0])
print("Total Rows Dropped with NaN: %s" % int(old - df.shape[0]))

Rows Before NaN Drops: 11323
Rows After NaN Drops: 11176
Total Rows Dropped with NaN: 147


In [355]:
print("After NA Drops")
# Re-count missing entries per column to check the result of the NaN drop.
df.isnull().sum()

After NA Drops


Iteration1     0
Iteration2     0
Iteration3     0
Iteration4     0
Iteration5     0
Iteration6     0
Iteration7     0
Iteration8     0
Iteration9     0
Iteration10    0
Iteration11    0
Iteration12    0
Iteration13    0
Iteration14    0
Iteration15    0
Iteration16    0
Iteration17    0
Iteration18    0
Iteration19    0
Iteration20    0
utilisation    0
dtype: int64

### DUPLICATE DROPS

In [356]:
# Remember the starting row count so the number of removed rows can be reported.
old = int(df.shape[0])
print("Rows Before DUPLICATE Drops: %s" % old)
# Keep the first occurrence of each distinct combination of iteration values.
# NOTE(review): the comparison runs on the raw, not yet numeric values, so
# e.g. "95" and "95.0" count as different - confirm this is intended.
is_repeat = df.duplicated(subset=new_cols)
df = df[~is_repeat]
print("Rows After DUPLICATE Drops: %s" % df.shape[0])
print("Total DUPLICATE Rows Dropped : %s" % int(old - df.shape[0]))

Rows Before DUPLICATE Drops: 11176
Rows After DUPLICATE Drops: 9205
Total DUPLICATE Rows Dropped : 1971


### GARBAGE CHARACTER DROPS

In [357]:
def _is_plain_number(value):
    """Return True when str(value) is all digits after removing at most one '.'."""
    return str(value).replace(".", "", 1).isdigit()


# Remember the starting row count so the number of removed rows can be reported.
old = int(df.shape[0])
print("Rows Before CHARACTER Drops: %s" % old)
# A row survives only if every iteration column passes the plain-number check.
keep_row = pd.Series(True, index=df.index)
for each_col in new_cols:
    keep_row &= df[each_col].map(_is_plain_number)
df = df[keep_row]
print("Rows After CHARACTER Drops: %s" % df.shape[0])
print("Total Rows Dropped with CHARACTER: %s" % int(old - df.shape[0]))

Rows Before CHARACTER Drops: 9205
Rows After CHARACTER Drops: 8921
Total Rows Dropped with CHARACTER: 284


# || Convert Whole Data to Numerics ||

In [358]:
# Cast every column to a numeric dtype; a value that cannot be parsed raises
# instead of being silently coerced.
df = df.apply(lambda column: pd.to_numeric(column, errors='raise'))

# || Filter out invalid rows with a percentage above 100% ||

In [359]:
# Remember the starting row count so the number of removed rows can be reported.
old = int(df.shape[0])
print("Rows Before INVALID Drops: %s" % old)
# A row survives only if every iteration value is at most 100.
within_limit = df[new_cols].le(100).all(axis=1)
df = df[within_limit]
print("Rows After INVALID Drops: %s" % df.shape[0])
print("Total Rows Dropped with INVALID: %s" % int(old - df.shape[0]))

Rows Before INVALID Drops: 8921
Rows After INVALID Drops: 8091
Total Rows Dropped with INVALID: 830



# ||Train and Test ||


### Divide appropriate Columns

In [360]:
# Show the first five rows of the cleaned, numeric frame.
df.head(5)

Unnamed: 0,Iteration1,Iteration2,Iteration3,Iteration4,Iteration5,Iteration6,Iteration7,Iteration8,Iteration9,Iteration10,Iteration11,Iteration12,Iteration13,Iteration14,Iteration15,Iteration16,Iteration17,Iteration18,Iteration19,Iteration20,utilisation
1,95.0,95.0,97.0,90.0,90.0,93.0,90.0,90.0,95.0,91.0,93.0,99.0,100.0,100.0,92.0,97.0,94.0,91.0,92.0,93.0,1
2,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,13.4,-1
3,6.2,5.1,5.1,5.0,5.6,6.1,7.4,5.1,4.7,5.1,5.5,5.1,8.0,5.0,5.1,5.1,5.2,5.2,7.9,4.9,-1
5,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,1
6,36.0,50.0,49.0,52.0,37.0,37.0,40.0,53.0,37.0,54.0,64.0,55.0,44.0,51.0,59.0,56.0,59.0,37.0,62.0,36.0,0


In [361]:
# Target: the utilisation label. Features: every remaining column.
y = df["utilisation"]
X = df.drop(columns="utilisation")

In [362]:
# Hold out 33% of the rows for testing; random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

### MODEL FITTINGS

In [363]:
# max_iter=6000 raises the solver's iteration cap well above scikit-learn's default of 100.
logmodel = LogisticRegression(max_iter=6000)
# Fit the classifier on the training split only.
logmodel.fit(X_train, y_train)

LogisticRegression(max_iter=6000)

In [364]:
# Predict a utilisation class for every row of the held-out test split.
predictions = logmodel.predict(X_test)

In [365]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          -1       0.93      0.91      0.92       815
           0       0.91      0.96      0.93      1304
           1       0.92      0.84      0.88       552

    accuracy                           0.92      2671
   macro avg       0.92      0.90      0.91      2671
weighted avg       0.92      0.92      0.92      2671



In [366]:
# Rows are true classes, columns are predicted classes, both in sorted label order.
confusion_matrix(y_test, predictions)

array([[ 742,   60,   13],
       [  29, 1247,   28],
       [  28,   62,  462]])

# || Accuracy CHECK ||

In [367]:
# accuracy_score is already imported in the notebook's import cell, so the
# redundant re-import has been removed from this cell.
# Fraction of test rows whose predicted class equals the true class.
accuracy_score(y_test, predictions)

0.917633845001872