# Define Functions

In [1]:
class Morris:
    def __init__(self, df, variables):
        self.df=df
        self.variables=variables 
        
    def multi_encode(self):
        from sklearn import preprocessing
        le = preprocessing.LabelEncoder()
        for variable in self.variables:
            self.df[variable] = le.fit_transform(self.df[variable])
            print(f"label encoded {variable}")
            
    def multi_drop(self):
        for variable in self.variables:
            self.df = self.df.drop(variable, axis=1)
            print(f"{variable} column is dropped")
        return self.df
    
    def multi_onehot(self):
        import pandas as pd
        for variable in self.variables:
            self.df = pd.get_dummies(self.df, prefix = [variable], columns = [variable])
            print(f"{variable} column is one hot encoded")
        print(self.df.shape)
        return self.df
    
    def normalisation(self):
        import pandas as pd
        temp=[]
        from sklearn import preprocessing
        temp.append(self.df.drop(self.variables, axis=1))
        self.df = preprocessing.normalize(temp[0])
        names = temp[0].columns
        temp[0] = pd.DataFrame(self.df, columns=names)
        print(f"{self.variables} dropped, rest columns are normalised")
        return temp[0]
            
    def standardisation(self):
        import pandas as pd
        temp=[]
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        temp.append(self.df.drop(self.variables, axis=1))
        self.df = scaler.fit_transform(temp[0])
        names = temp[0].columns
        temp[0] = pd.DataFrame(self.df, columns=names)
        print(f"{self.variables} dropped, rest columns are standardised")
        return temp[0]
    
    def change_str(self):
        for variable in self.variables:
            self.df[variable] = self.df[variable].astype(str)
            print(f"{variable} column changed to str")
        
    def change_int(self):
        for variable in self.variables:
            self.df[variable] = self.df[variable].astype(int)
            print(f"{variable} column changed to int")
        
    def change_float(self):
        for variable in self.variables:
            self.df[variable] = self.df[variable].astype(float)
            print(f"{variable} column changed to float")

    def movefront(self):
        cols = self.df.columns.tolist()
        remaining_cols = set(cols) - set(self.variables) 
        remaining_cols = list(remaining_cols)
        new_cols = self.variables + remaining_cols
        self.df = self.df[new_cols] 
        return self.df
    
    def moveend(self):
        cols = self.df.columns.tolist()
        remaining_cols = set(cols) - set(self.variables) 
        remaining_cols = list(remaining_cols)
        new_cols = remaining_cols + self.variables
        self.df = self.df[new_cols] 
        return self.df
    
    def group_by_mean(self):
        temp = self.df.groupby(self.variables).size().reset_index(name='groupby_counts')
        counts = temp['groupby_counts'].values.tolist()
        self.df = self.df.groupby(variables).mean()
        self.df['groupby_counts'] = counts
        print(f"Columns are groped, rest columns summarised by mean, how many columns summarised by mean are counted")
        return self.df
    
class Morris_data:
    def __init__(self, df):
        self.df=df
        
    def trim(self):
        self.df.columns = self.df.columns.str.strip()
        self.df = self.df.drop_duplicates()
        self.df.columns = self.df.columns.str.lower()
        self.df.columns = self.df.columns.str.replace(' ','_')
        df_obj = self.df.select_dtypes(['object'])
        self.df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())
        print("All column names have been striped, lowered case, replaced space with underscore if any")
        print("Dropped duplicated instances if any")
        print("Categorical instances have been striped")
        return self.df
    
    def impute(self):
        from sklearn.impute import SimpleImputer
        for col in self.df.columns:
            if self.df[col].dtype == object:
                imp = SimpleImputer(strategy="most_frequent")
                self.df[col] = imp.fit_transform(self.df[[col]]).ravel()
                print(f"imputed mode for {col}")
            else:
                impmean = SimpleImputer(strategy="mean")
                self.df[col] = impmean.fit_transform(self.df[[col]]).ravel()
                print(f"imputed mean for {col}")
                
    def iqr(self):
        for col in self.df.columns:
            if self.df[col].dtype != object:
                Q1 = self.df[col].quantile(0.25)
                Q3 = self.df[col].quantile(0.75)
                IQR = Q3 - Q1
                S = 1.5*IQR
                LB = Q1 - S
                UB = Q3 + S
                self.df.loc[self.df[col] > UB,col] = UB
                self.df.loc[self.df[col] < LB,col] = LB
                print(f"{col} outliers are replaced by [{LB}, {UB}] by IQR")
            else:
                break
        print("FINISHED - All column outliers are replaced by lower or upper boundary of interquartile rules")
        return self.df
    
    def select_numeric(self):
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        newdf = self.df.select_dtypes(include=numerics)
        return newdf
    
    def powertransform(self):
        from sklearn.preprocessing import PowerTransformer
        names = self.df.columns
        df_filter = self.df[self.df > 0]
        pt = PowerTransformer(method='box-cox')
        temp_transform = pt.fit_transform(df_filter)
        df_new = pd.DataFrame(temp_transform, columns=names)
        return df_new
    
    def distribution(self):
        list(set(self.df.dtypes.tolist()))
        # include only float and integer
        df_num = self.df.select_dtypes(include = ['float64', 'int64'])
        return df_num.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8);
    
    def corr_sort_positive(self):
        import numpy as np
        import pandas as pd
        corr_matrix = self.df.corr()
        sol = (corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
                          .stack()
                          .sort_values(ascending=False))
        corr = pd.DataFrame(sol, columns=['corr'])
        return corr.head(50)
    
    def corr_sort_negative(self):
        import numpy as np
        import pandas as pd
        corr_matrix = self.df.corr()
        sol = (corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
                          .stack()
                          .sort_values(ascending=True))
        corr = pd.DataFrame(sol, columns=['corr'])
        return corr.head(50)
    
class Morris_discrete:
    def __init__(self, df, variables, categories):
        self.df=df
        self.variables=variables 
        self.categories=categories 
        
    def qcut(self):
        import pandas as pd
        for variable in self.variables:
            self.df[variable] = pd.qcut(self.df[variable], q = self.categories, precision = 0)
            return self.df
        
    def bincut(self):
        import pandas as pd
        for variable in self.variables:
            self.df[variable] = pd.cut(self.df[variable], bins = self.categories)
            
class Morris_toy:
    def __init__(self, df):
        self.df=df
        
    def boston(self):
        import pandas as pd
        import numpy as np
        from sklearn.datasets import load_boston
        boston = load_boston()
        self.df = pd.DataFrame(boston.data, columns = boston.feature_names)
        self.df['target'] = pd.DataFrame(boston.target, columns = np.array(['target']))
        return self.df
    
    def iris(self):
        import pandas as pd
        import numpy as np
        from sklearn.datasets import load_iris
        iris = load_iris()
        self.df = pd.DataFrame(iris.data, columns = iris.feature_names)
        self.df['target'] = pd.DataFrame(iris.target, columns = np.array(['target']))
        return self.df
    
    def digits(self):
        import pandas as pd
        import numpy as np
        from sklearn.datasets import load_digits
        digits = load_digits()
        self.df = pd.DataFrame(digits.data, columns = digits.feature_names)
        self.df['target'] = pd.DataFrame(digits.target, columns = np.array(['target']))
        return self.df
    
    def diabetes(self):
        import pandas as pd
        import numpy as np
        from sklearn.datasets import load_diabetes
        diabetes = load_diabetes()
        self.df = pd.DataFrame(diabetes.data, columns = diabetes.feature_names)
        self.df['target'] = pd.DataFrame(diabetes.target, columns = np.array(['target']))
        return self.df

# Import Dataset

In [2]:
import os
import pandas as pd
# Set path for new working directory
path = "C:/Users/Lee Kah Win/Desktop/"
os.chdir(path)
os.getcwd()

'C:\\Users\\Lee Kah Win\\Desktop'

In [None]:
df = pd.read_csv('covid_records.csv')
df0 = df
df0

# Preprocess the dataset

In [None]:
df0 = Morris_data(df0).trim()
encode = ['locations_visited', 'health_condition']
Morris(df0, encode).multi_encode()

In [None]:
drop = ['mysejahtera_id', 'user_id', 'name', 'email']
df0 = Morris(df0, drop).multi_drop()
df0

# Define Classification ML Models

In [None]:
class Morris_classification:
    def __init__(self, score):
        self.score=score
    
    def rfc(self):
        print("Random Forest Classification is executed")
        print("Found the best parameters and best score with GridSearchCV")
        import time
        start_time = time.time()
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.model_selection import GridSearchCV
        rfc=RandomForestClassifier(random_state=0)
        param_grid_rfr = { 
            'n_estimators': [200, 500],
            'max_features': ['auto', 'sqrt', 'log2'],
            'max_depth' : [4,5,6,7,8,10, None],
            'criterion' :['gini', 'entropy']
        }
        record_score = []
        record_param = []
        performance_df['rfr_training'] = record_score
        performance_df['rfr_param'] = record_param
        for (key_X_train,X_train), (key_y_train,y_train) in zip(X_train_data.items(), y_train_data.items()):
            grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid_rfr, cv= 5, scoring = self.score)
            grid_result = grid_search.fit(X_train, y_train.values.ravel())
            record_score.append(grid_result.best_score_)
            record_param.append(grid_search.best_params_)
            print(grid_search.best_params_)
            print(f"Best {self.score} : {grid_result.best_score_}")
        print("Executed time is %s seconds " % (time.time() - start_time))
        print("Values are stored into a dataframe")
        return performance_df

    def naive_bayes(self):
        print("Naive Bayes Classification is executed")
        print("Found the best parameters and best score with GridSearchCV")
        import time
        start_time = time.time()
        from sklearn.model_selection import cross_val_score
        from sklearn.naive_bayes import GaussianNB
        from sklearn.naive_bayes import BernoulliNB
        from sklearn.naive_bayes import MultinomialNB
        naive_bayes = {'Gaussian_Naive_Bayes': GaussianNB(), 
                       'Bernoulli_Naive_Bayes': BernoulliNB(binarize = 0.0), 
                       'Multinomial_Naive_Bayes': MultinomialNB()}
        for key, value in naive_bayes.items():
            record_score = []
            performance_df[key] = record_score
            for (key_X_train,X_train), (key_y_train,y_train) , (key_X_test,X_test), (key_y_test,y_test) in zip(X_train_data.items(), y_train_data.items(), X_test_data.items(), y_test_data.items()):
                x = cross_val_score(value, X_train, y_train.values.ravel(), cv=5,scoring = self.score)
                record_score.append(x.mean())
                print(f"{key} {key_X_train} {self.score} : {x.mean()}")
        print("Executed time is %s seconds " % (time.time() - start_time))
        print("Values are stored into a dataframe")
        return performance_df
    
    def knn(self):
        print("K-Nearest Neighbors Classification is executed")
        print("Found the best parameters and best score with GridSearchCV")
        import time
        start_time = time.time()
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.model_selection import GridSearchCV
        from sklearn.metrics import r2_score, median_absolute_error, mean_squared_error
        knn = KNeighborsClassifier()
        start, end = 2, 40
        k_range = []  
        # iterating each number in list
        for num in range(start, end + 1):
            if num % 2 != 0:
                k_range.append(num)
        weight_options = ["uniform", "distance"]
        param_grid_values = dict(n_neighbors = k_range, weights = weight_options) 
        record_score = []
        record_param = []
        performance_df['knn_training'] = record_score
        performance_df['knn_param'] = record_param
        for (key_X_train,X_train), (key_y_train,y_train) in zip(X_train_data.items(), y_train_data.items()):
            grid_search = GridSearchCV(estimator=knn, param_grid=param_grid_values, cv= 5, scoring = self.score)
            grid_result = grid_search.fit(X_train, y_train.values.ravel())
            record_score.append(grid_result.best_score_)
            record_param.append(grid_search.best_params_)
            print(grid_search.best_params_)
            print(f"Best {self.score} : {grid_result.best_score_}")
        print("Executed time is %s seconds " % (time.time() - start_time))
        print("Values are stored into a dataframe")
        return performance_df
    
    def dtc(self):
        print("Decision Tree Classification is executed")
        print("Found the best parameters and best score with GridSearchCV")
        import time
        start_time = time.time()
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.model_selection import GridSearchCV
        dtc = DecisionTreeClassifier(random_state=0)
        sample_split_range = list(range(2, 10))
        criterion_options = ['gini', 'entropy']
        max_depth_options = [None, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41]
        num_leafs = [1, 5, 10, 20, 50]
        record_score = []
        record_param = []
        performance_df['dtc_training'] = record_score
        performance_df['dtc_param'] = record_param 
        param_grid_values = dict(min_samples_split=sample_split_range, 
                          criterion = criterion_options, max_depth = max_depth_options,
                         min_samples_leaf = num_leafs)
        for (key_X_train,X_train), (key_y_train,y_train) in zip(X_train_data.items(), y_train_data.items()):
            grid_search = GridSearchCV(estimator=dtc, param_grid=param_grid_values, cv= 5, scoring = self.score)
            grid_result = grid_search.fit(X_train, y_train)
            record_score.append(grid_result.best_score_)
            record_param.append(grid_search.best_params_)
            print(grid_search.best_params_)
            print(f"Best {self.score} : {grid_result.best_score_}")
        print("Executed time is %s seconds " % (time.time() - start_time))
        print("Values are stored into a dataframe")
        return performance_df
    
    def svc(self):
        print("Support Vector Classification is executed")
        print("Found the best parameters and best score with GridSearchCV")
        import time
        start_time = time.time()
        from sklearn.svm import SVC
        from sklearn.model_selection import GridSearchCV
        SVC = {'SVC_LINEAR': SVC(kernel='linear'), 
               'SVC_POLY': SVC(kernel='poly'), 'SVC_RBF': SVC(kernel='rbf'), 
               'SVC_SIGMOID': SVC(kernel='sigmoid')}
        SVC_param = {'SVC_LINEAR_param': 1,'SVC_POLY_param': 2, 'SVC_RBF_param': 3, 'SVC_SIGMOID_param': 4}
        for (key, value), (param_key, param_value) in zip(SVC.items(), SVC_param.items()):
            record_score = []
            record_param = []
            performance_df[key] = record_score
            performance_df[param_key] = record_param 
            for (key_X_train,X_train), (key_y_train,y_train) in zip(X_train_data.items(), y_train_data.items()):
                if key == 'LINEAR':
                    param_grid = {'C': [0.01,0.0125,0.02,0.03,0.04,0.05 ]}
                elif key == 'POLY':
                    param_grid = {'C': [0.01,0.1,1, 10, 25, 50,100, 1000,10000], 'degree': [2, 3, 4, 5], 'gamma': ['scale','auto']}
                elif key == 'RBF':
                    param_grid = {'C': [0.5,0.75,1, 1.25,1.5,1.75,2],'gamma': ['scale','auto'],'kernel': ['rbf']}
                else:
                    param_grid = {'C': [0.1,0.125,0.2,0.3,0.4 ], 'gamma': ['scale','auto']}
                grid_search = GridSearchCV(value, param_grid, cv=5, scoring=self.score, n_jobs=-1)
                grid_result = grid_search.fit(X_train, y_train.values.ravel())
                record_score.append(grid_result.best_score_)
                record_param.append(grid_search.best_params_)
                print(f"{key} - {grid_search.best_params_}")
                print(f"Best {self.score} {key_X_train}: {grid_result.best_score_}")
                print('') 
        print("Executed time is %s seconds " % (time.time() - start_time))
        print("Values are stored into a dataframe")
        return performance_df
    
    def voting(self):
        print("Ensemble Learning Voting Classifier is executed")
        print("Found the best parameters and best score with GridSearchCV")
        import time
        start_time = time.time()
        from sklearn.ensemble import VotingClassifier
        from sklearn.model_selection import cross_val_score
        model_voting = VotingClassifier(estimators = [('clf1', clf1), ('clf2',clf2),
                                                     ('clf3', clf3), ('clf4', clf4),
                                                     ('clf5', clf5)])
        record_score = []
        performance_df['voting_training'] = record_score
        for (key_X_train,X_train), (key_y_train,y_train) in zip(X_train_data.items(), y_train_data.items()):
            scores = cross_val_score(model_voting, X_train, y_train.values.ravel(), cv = 5, scoring = self.score)
            print(f"{key_X_train} {self.score} = {scores.mean()}")
            record_score.append(scores.mean())
        print("Executed time is %s seconds " % (time.time() - start_time))
        print("Values are stored into a dataframe")
        return performance_df, model_voting
    
    def stacking(self):
        print("Ensemble Learning Stacking Classifier is executed")
        print("Found the best parameters and best score with GridSearchCV")
        import time
        start_time = time.time()
        from sklearn.ensemble import StackingClassifier
        from sklearn.model_selection import cross_val_score
        model_stacking = StackingClassifier(estimators = [ ('clf1',clf1), ('clf2',clf2),
                                                          ('clf3', clf3), ('clf4', clf4),
                                                          ('clf5', clf5)],
                                            final_estimator = clf5)
        record_score = []
        performance_df['stacking_training'] = record_score
        for (key_X_train,X_train), (key_y_train,y_train) in zip(X_train_data.items(), y_train_data.items()):
            scores = cross_val_score(model_stacking, X_train, y_train.values.ravel(), cv = 5, scoring = self.score)
            print(f"{key_X_train} {self.score} = {scores.mean()}")
            record_score.append(scores.mean())
        print("Executed time is %s seconds " % (time.time() - start_time))
        print("Values are stored into a dataframe")
        return performance_df, model_stacking
    
    def predict(self):
        from sklearn import metrics 
        from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, average_precision_score
        for key, model in models.items():
            record_score = []
            performance_df[key] = record_score
            for (key_X_train,X_train), (key_y_train,y_train) , (key_X_test,X_test), (key_y_test,y_test) in zip(X_train_data.items(), y_train_data.items(), X_test_data.items(), y_test_data.items()):
                model.fit(X_train, y_train.values.ravel())
                test_predict = model.predict(X_test)
                record_score.append(metrics.accuracy_score(y_test.values.ravel(), test_predict))
                print(f"{key_X_train} {key} {self.score}: {metrics.accuracy_score(y_test.values.ravel(), test_predict)}")
        return performance_df

# Supervised Learning - Classification

## To identify the next individual that will contribute clusters

In [None]:
y = df0[['new_cluster_contributor']]
drop = ['new_cluster_contributor']
df0 = Morris(df0, drop).multi_drop()
df0

# Training Set and Testing Set Split

In [None]:
class Morris_split:
    def __init__(self, X, y):
        self.X=X
        self.y=y
    
    def train_test_split(self):
        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=0)
        print(f"the training shape of x and y are {X_train.shape}, {y_train.shape}")
        print(f"the testing shape of x and y are {X_test.shape}, {y_test.shape}")
        return X_train, X_test, y_train, y_test

In [None]:
X_train, X_test, y_train, y_test = Morris_split(df0, y).train_test_split()
X_train

In [None]:
performance_df = {}
performance_df['datasets'] = ['df0']
X_train_data = {'df0': X_train}
y_train_data = {'df0': y_train}
X_test_data = {'df0': X_test}
y_test_data = {'df0': y_test}
# test workability
for (key_X_train,X_train), (key_y_train,y_train) in zip(X_train_data.items(), y_train_data.items()):
    print(key_X_train)

### Train the data with Naive Bayes

In [None]:
Morris_classification('accuracy').naive_bayes()

### Train the data with Decision Tree

In [None]:
Morris_classification('accuracy').dtc()

### Train the data with Random Forest

In [None]:
Morris_classification('accuracy').rfc()

In [None]:
result = pd.DataFrame(data=performance_df)
result

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
clf1 = DecisionTreeClassifier(random_state=0, criterion= 'gini', 
                      max_depth= 7, min_samples_leaf= 5, min_samples_split= 2)
clf2 = RandomForestClassifier(criterion= 'entropy', max_depth= 10, 
        max_features= 'log2', n_estimators= 500)
clf3 = KNeighborsClassifier(n_neighbors= 35, weights= 'distance')
clf4 = SVC(kernel='linear', C= 0.03, gamma= 'scale')
clf5 = GaussianNB()

### Fit the Naive Bayes into Testing Set to see the generalisation

In [None]:
from sklearn import metrics 
from sklearn.metrics import accuracy_score
clf5.fit(X_train, y_train.values.ravel())
test_predict = clf5.predict(X_test)
test_predict

### Accuracy for unseen data

In [None]:
print("Accuracy : ", metrics.accuracy_score(y_test, test_predict))

### Pull the data of the red flagged user predicted by Machine Learning

In [None]:
X_test['predicted_new_contributors'] = test_predict.tolist()
X_test_positive = X_test[X_test['predicted_new_contributors']==1]
X_test_positive

In [None]:
row_list = []
for row in X_test_positive.index:
    row_list.append(row)
row_list

# the red flagged individual

In [None]:
predicted_new_cluster_contributor = df.iloc[row_list]
predicted_new_cluster_contributor

In [None]:
big = [['NAME', 'MYSEJAHTERA ID', 'ACTION', 'NOTIFICATIONS']]
for index, row in predicted_new_cluster_contributor.iterrows():
    temp = []
    temp.append(row['NAME'])
    temp.append(row['MYSEJAHTERA ID'])
    temp.append("Need to be Quarantined Immediately")
    temp.append("Notification Sent, MySej Updated")
    big.append(temp)

In [None]:
for index, row in predicted_new_cluster_contributor.iterrows():
    print(f"{row['NAME']} cannot go anywhere, any scanning in Mysejahtera LAW ENFORCEMENT will be noticed for 14 days")
    print(f"A **STAY_AT_HOME_ORDER** was email has been sent to {row['NAME']} at {row['EMAIL']}")
    print(f"**RED FLAGGED** has been updated to {row['NAME']} in My sejahtera ID: {row['MYSEJAHTERA ID']}")

# Violations Automating

In [None]:
scan_number = 0
Farah = {'violations': scan_number }
Farah

In [None]:
# Envisioning the violation scan by the mysejahtera app during quarantine period
scan_number = scan_number+1
scan_number

In [None]:
income_grop = 'B40'

In [None]:
def violations_monitor(scan_number):
    if scan_number > 0:
        if scan_number ==1:
            if income_grop == 'B40':
                print('2 more violations - RM800 punishment will be imposed.')
            elif income_grop == 'M40':
                print('2 more violations - RM2500 punishment will be imposed.')
            elif income_grop == 'T40':
                print('2 more violations - RM5000 punishment will be imposed.')
        elif scan_number ==2:
            if income_grop == 'B40':
                print('1 more violation - RM800 punishment will be imposed.')
            elif income_grop == 'M40':
                print('1 more violation - RM2500 punishment will be imposed.')
            elif income_grop == 'T40':
                print('1 more violation - RM5000 punishment will be imposed.')       
        elif scan_number ==3:
            if income_grop == 'B40':
                print('RM800 punishment was imposed. Link created to make a transfer in 14 days')
            elif income_grop == 'M40':
                print('RM2500 punishment was imposed. Link created to make a transfer in 14 days')
            elif income_grop == 'T40':
                print('RM5000 punishment was imposed. Link created to make a transfer in 14 days')
    else:
        print('no violations')

In [None]:
violations_monitor(scan_number)

# Task 2 Unsupervised Learning Association Rules

### How can we accurately predict know which business location to close down to mitigate the COVID-19?

In [3]:
df1 = pd.read_csv('location_records.csv')
df2 = df1
df2

Unnamed: 0,MYSEJAHTERA ID,USER ID,NAME,DATE,EMAIL,AGE,LOCATIONS VISITED LEAD TO COVID19
0,M2,2,ALI,5/5/2021,ALI@gmail.com,62,MAMAK213
1,M2,2,ALI,6/5/2021,ALI@gmail.com,62,SHOP_XX3
2,M2,2,ALI,7/5/2021,ALI@gmail.com,62,SHOP_XX4
3,M2,2,ALI,8/5/2021,ALI@gmail.com,62,SHOP_XX5
4,M2,2,ALI,9/5/2021,ALI@gmail.com,62,SHOP_XX3
5,M2,2,ALI,10/5/2021,ALI@gmail.com,62,MAMAK213
6,M2,2,ALI,11/5/2021,ALI@gmail.com,62,COVID19_DETECTED
7,M3,3,FARAH,5/5/2021,FARAH@gmail.com,23,SHOP_XX3
8,M3,3,FARAH,6/5/2021,FARAH@gmail.com,23,SHOP_XX4
9,M3,3,FARAH,7/5/2021,FARAH@gmail.com,23,SHOP_XX5


In [4]:
df2.columns

Index(['MYSEJAHTERA ID', 'USER ID', 'NAME', 'DATE', 'EMAIL', 'AGE',
       'LOCATIONS VISITED LEAD TO COVID19'],
      dtype='object')

In [5]:
drop = ['USER ID', 'NAME', 'DATE', 'EMAIL', 'AGE']
df2 = Morris(df2, drop).multi_drop()
df2 = Morris_data(df2).trim()
df2

USER ID column is dropped
NAME column is dropped
DATE column is dropped
EMAIL column is dropped
AGE column is dropped
All column names have been striped, lowered case, replaced space with underscore if any
Dropped duplicated instances if any
Categorical instances have been striped


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,mysejahtera_id,locations_visited_lead_to_covid19
0,M2,MAMAK213
1,M2,SHOP_XX3
2,M2,SHOP_XX4
3,M2,SHOP_XX5
6,M2,COVID19_DETECTED
7,M3,SHOP_XX3
8,M3,SHOP_XX4
9,M3,SHOP_XX5
10,M3,SHOP_XX6
11,M3,MAMAK213


In [6]:
import mlxtend
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from ast import literal_eval

In [7]:
# Group all movieId by userId for association rules mining
df_all = df2.groupby("mysejahtera_id")["locations_visited_lead_to_covid19"].apply(list).reset_index()
print(df_all.shape)
df_all.head()

(3, 2)


Unnamed: 0,mysejahtera_id,locations_visited_lead_to_covid19
0,M2,"[MAMAK213, SHOP_XX3, SHOP_XX4, SHOP_XX5, COVID..."
1,M3,"[SHOP_XX3, SHOP_XX4, SHOP_XX5, SHOP_XX6, MAMAK..."
2,M4,"[SHOP_XX3, SHOP_XX4, COVID19_DETECTED]"


In [8]:
from ast import literal_eval
df_all.locations_visited_lead_to_covid19 = df_all.locations_visited_lead_to_covid19
s = df_all['locations_visited_lead_to_covid19'].explode()
df_all = df_all[['mysejahtera_id']].join(pd.crosstab(s.index, s))
df_all = df_all.drop('mysejahtera_id', axis = 1)
print(f"df_all data shape : {df_all.shape}")
df_all.head()

df_all data shape : (3, 6)


Unnamed: 0,COVID19_DETECTED,MAMAK213,SHOP_XX3,SHOP_XX4,SHOP_XX5,SHOP_XX6
0,1,1,1,1,1,0
1,1,1,1,1,1,1
2,1,0,1,1,0,0


In [9]:
def association(data_rating):
    frequent = apriori(data_rating, min_support = 1, use_colnames = True)
    frequent['length'] = frequent['itemsets'].apply(lambda x: len(x))
    frequent[(frequent['length'] == 1) & \
             (frequent['support'] >= 1)]
    data_rating = association_rules(frequent, metric = "lift", min_threshold = 1)
    data_rating = data_rating[(data_rating['lift'] >= 1) & (data_rating['confidence'] >= 1)]
    return data_rating

In [10]:
rules_data = association(df_all)

In [11]:
rules_data

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(SHOP_XX3),(COVID19_DETECTED),1.0,1.0,1.0,1.0,1.0,0.0,inf
1,(COVID19_DETECTED),(SHOP_XX3),1.0,1.0,1.0,1.0,1.0,0.0,inf
2,(SHOP_XX4),(COVID19_DETECTED),1.0,1.0,1.0,1.0,1.0,0.0,inf
3,(COVID19_DETECTED),(SHOP_XX4),1.0,1.0,1.0,1.0,1.0,0.0,inf
4,(SHOP_XX3),(SHOP_XX4),1.0,1.0,1.0,1.0,1.0,0.0,inf
5,(SHOP_XX4),(SHOP_XX3),1.0,1.0,1.0,1.0,1.0,0.0,inf
6,"(SHOP_XX3, SHOP_XX4)",(COVID19_DETECTED),1.0,1.0,1.0,1.0,1.0,0.0,inf
7,"(SHOP_XX3, COVID19_DETECTED)",(SHOP_XX4),1.0,1.0,1.0,1.0,1.0,0.0,inf
8,"(SHOP_XX4, COVID19_DETECTED)",(SHOP_XX3),1.0,1.0,1.0,1.0,1.0,0.0,inf
9,(SHOP_XX3),"(SHOP_XX4, COVID19_DETECTED)",1.0,1.0,1.0,1.0,1.0,0.0,inf


In [12]:
def transform2a_2c(rules_data):
    import numpy as np
    a = [list(x) for x in rules_data.antecedents.values]
    c = [list(x) for x in rules_data.consequents.values]
    r = pd.DataFrame(np.array(a),
                       columns=['antecedents'])
    c = pd.DataFrame(np.array(c),
                       columns=['consequents'])
    r['consequents'] = c
    r[['antecedent 1','antecedent 2']] = pd.DataFrame(r.antecedents.tolist(), index= r.index)
    r[['consequent 1', 'consequent 2']] = pd.DataFrame(r.consequents.tolist(), index= r.index)
    r.drop(['antecedents','consequents'],inplace=True,axis=1)
    lift = rules_data[['lift']].reset_index(drop = True)
    r['lift'] = lift
    confidence = rules_data[['confidence']].reset_index(drop = True)
    r['confidence'] = confidence
    support = rules_data[['support']].reset_index(drop = True)
    r['support'] = support
    return r

In [13]:
asso_results = transform2a_2c(rules_data)
asso_results

Unnamed: 0,antecedent 1,antecedent 2,consequent 1,consequent 2,lift,confidence,support
0,SHOP_XX3,,COVID19_DETECTED,,1.0,1.0,1.0
1,COVID19_DETECTED,,SHOP_XX3,,1.0,1.0,1.0
2,SHOP_XX4,,COVID19_DETECTED,,1.0,1.0,1.0
3,COVID19_DETECTED,,SHOP_XX4,,1.0,1.0,1.0
4,SHOP_XX3,,SHOP_XX4,,1.0,1.0,1.0
5,SHOP_XX4,,SHOP_XX3,,1.0,1.0,1.0
6,SHOP_XX3,SHOP_XX4,COVID19_DETECTED,,1.0,1.0,1.0
7,SHOP_XX3,COVID19_DETECTED,SHOP_XX4,,1.0,1.0,1.0
8,SHOP_XX4,COVID19_DETECTED,SHOP_XX3,,1.0,1.0,1.0
9,SHOP_XX3,,SHOP_XX4,COVID19_DETECTED,1.0,1.0,1.0


In [14]:
asso_results_filter = asso_results[(asso_results['consequent 2']=='COVID19_DETECTED') | 
                                   (asso_results['consequent 1']=='COVID19_DETECTED')]
asso_results_filter

Unnamed: 0,antecedent 1,antecedent 2,consequent 1,consequent 2,lift,confidence,support
0,SHOP_XX3,,COVID19_DETECTED,,1.0,1.0,1.0
2,SHOP_XX4,,COVID19_DETECTED,,1.0,1.0,1.0
6,SHOP_XX3,SHOP_XX4,COVID19_DETECTED,,1.0,1.0,1.0
9,SHOP_XX3,,SHOP_XX4,COVID19_DETECTED,1.0,1.0,1.0
10,SHOP_XX4,,SHOP_XX3,COVID19_DETECTED,1.0,1.0,1.0


In [15]:
print(asso_results_filter)

   antecedent 1 antecedent 2      consequent 1      consequent 2  lift  \
0      SHOP_XX3         None  COVID19_DETECTED              None   1.0   
2      SHOP_XX4         None  COVID19_DETECTED              None   1.0   
6      SHOP_XX3     SHOP_XX4  COVID19_DETECTED              None   1.0   
9      SHOP_XX3         None          SHOP_XX4  COVID19_DETECTED   1.0   
10     SHOP_XX4         None          SHOP_XX3  COVID19_DETECTED   1.0   

    confidence  support  
0          1.0      1.0  
2          1.0      1.0  
6          1.0      1.0  
9          1.0      1.0  
10         1.0      1.0  


In [16]:
filtered = asso_results_filter[['antecedent 1']]
filtered = filtered.values.tolist()
flat_list = [item for sublist in filtered for item in sublist]
flat_list

['SHOP_XX3', 'SHOP_XX4', 'SHOP_XX3', 'SHOP_XX3', 'SHOP_XX4']

In [17]:
def unique(list1):
    list_set = set(list1)
    unique_list = (list(list_set))
    temp = []
    for x in unique_list:
        temp.append(x)
    return temp

In [18]:
flat_list = unique(flat_list)
flat_list

['SHOP_XX3', 'SHOP_XX4']

In [19]:
big = []
for x in flat_list:
    temp = []
    temp.append(x)
    temp.append("High Risk")
    big.append(temp)

In [20]:
big

[['SHOP_XX3', 'High Risk'], ['SHOP_XX4', 'High Risk']]

### From the result of association rules, we identified that SHOP_XX3 and SHOP_XX4 must be closed, which are / can be the warm bed of COVID-19

# What Next?

1. An automate email will be sent to the business contact person
1. for the business my_sejahtera id can be red flagged just like the supervised learning
2. So if any scan on the id, it will be triggered a notice