In [99]:
# Mount Google Drive at /content/drive so files stored there are reachable from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [100]:
# Add the project folder on Drive to the module search path.
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/DATA QUALITY MANAGEMENT')

In [101]:
# Work from the project folder so the relative paths used below ('data_dict.xlsx', 'model/...') resolve against it.
%cd /content/drive/My Drive/Colab Notebooks/DATA QUALITY MANAGEMENT

/content/drive/My Drive/Colab Notebooks/DATA QUALITY MANAGEMENT


In [102]:
import pandas as pd
import numpy as np
import random
import string
import re
import pickle

# Load Dictionary and Trained Machine Learning Models

In [103]:
# Load Dictionary
# Each row describes one field type (column 'meta') together with the model file,
# vectorizer file and regex columns that the loader cells below look up.
dictionary = pd.read_excel('data_dict.xlsx')
dictionary

Unnamed: 0,model,model_name,vectorizer_name,meta,data_type,invalid,valid
0,DT,email_validation_single_dt_model.joblib,email_validation_single_lstm_vectorizer.pkl,email,string,"^[^\w]+|[^\w]+$ [^a-zA-Z0-9_.@-] \s{2,}",
1,LR,name_validation_single_logistic_regression_mod...,name_validation_single_lstm_vectorizer.pkl,name,string,"[^A-Za-z\s]+ [^A-Za-z\s]+ \s{2,}",


In [104]:
# Load Machine Learning Model and CountVectorizer Data (Email Model)
from joblib import load

# Dictionary row(s) describing the email field
email_validation_info  = dictionary[dictionary['meta'] == 'email']

# Regex columns stored in the dictionary for this field
invalid_email_regex    = email_validation_info['invalid'].values[0]
valid_email_regex      = email_validation_info['valid'].values[0]

# Load Model from File name
# The file name and path get their own variables so that
# `email_validation_model` only ever refers to the loaded model object.
email_model_filename   = email_validation_info['model_name'].values[0]
email_model_path       = 'model/' + email_model_filename
print("Load model from :", email_model_path)
email_validation_model = load(email_model_path)
email_validation_model

Load model from : model/email_validation_single_dt_model.joblib


In [105]:
# Load Vectorizer (Email)
# Positional access (.values[0]) is used here: a plain [0] on the filtered
# Series is a label lookup and only works while the email row happens to carry
# index label 0 in the dictionary.
email_vectorizer_filename = email_validation_info['vectorizer_name'].values[0]
email_vectorizer_path     = 'model/' + email_vectorizer_filename
print("Load vectorizer :", email_vectorizer_path)
# NOTE: pickle.load can execute arbitrary code; only load vectorizer files from a trusted source.
with open(email_vectorizer_path, 'rb') as f:
    email_validation_vectorizer = pickle.load(f)
email_validation_vectorizer

Load vectorizer : model/email_validation_single_lstm_vectorizer.pkl


In [106]:
# Load Machine Learning Model and CountVectorizer Data (Name Model)
from joblib import load

# Dictionary row(s) describing the name field
name_validation_info  = dictionary[dictionary['meta'] == 'name']

invalid_name_regex    = name_validation_info['invalid'].values[0]
# Keep the name field's "valid" regex under its own variable; assigning it to
# valid_email_regex would silently replace the value loaded for the email field.
valid_name_regex      = name_validation_info['valid'].values[0]

# Load Model from File name
# The file name and path get their own variables so that
# `name_validation_model` only ever refers to the loaded model object.
name_model_filename     = name_validation_info['model_name'].values[0]
name_model_path         = 'model/' + name_model_filename
print("Load model from :", name_model_path)
name_validation_model   = load(name_model_path)
name_validation_model

Load model from : model/name_validation_single_logistic_regression_model.joblib


In [107]:
# Load Vectorizer (Name)
# The file name and path get their own variables so that
# `name_validation_vectorizer` only ever refers to the loaded vectorizer object.
name_vectorizer_filename = name_validation_info['vectorizer_name'].values[0]
name_vectorizer_path     = 'model/' + name_vectorizer_filename
print("Load vectorizer :", name_vectorizer_path)
# NOTE: pickle.load can execute arbitrary code; only load vectorizer files from a trusted source.
with open(name_vectorizer_path, 'rb') as f:
    name_validation_vectorizer = pickle.load(f)
name_validation_vectorizer

Load vectorizer : model/name_validation_single_lstm_vectorizer.pkl


In [108]:
# Quick manual check of the name model on a single raw string:
# vectorize the string, then ask the model for its prediction.
text = 'Alice?Brown'
text_transformed = name_validation_vectorizer.transform([text])
prediction = name_validation_model.predict(text_transformed)
prediction

array([0])

In [109]:
# Cleaning Function
def cleaning_text_process(invalid_regex, text, replacment) :
  """Substitute every match of the configured "invalid" patterns in `text`.

  Parameters
  ----------
  invalid_regex : str
      One or more regular expressions separated by whitespace. Because
      whitespace is the separator, a single pattern cannot contain a literal
      space. A non-string value (for example the NaN that pandas yields for an
      empty Excel cell) is treated as "no patterns".
  text : str
      The string to clean.
  replacment : str
      Text substituted for every match (parameter spelling kept for callers).

  Returns
  -------
  str
      `text` after applying each pattern in order, with leading and trailing
      whitespace stripped.
  """
  # split() without an argument drops empty pieces. split(' ') would turn a
  # doubled, leading or trailing space into an empty pattern, and an empty
  # regex matches at every position, spraying `replacment` between characters.
  pattern_sources = invalid_regex.split() if isinstance(invalid_regex, str) else []
  cleaned_string = text
  for source in pattern_sources:
    cleaned_string = re.sub(source, replacment, cleaned_string)
  return cleaned_string.strip()

def predict_text(text, vectorizer, model) :
  """Vectorize one string and return the model's prediction for it.

  Parameters
  ----------
  text : the single input passed (wrapped in a one-element list) to
      `vectorizer.transform`.
  vectorizer : object exposing `transform(list)`.
  model : object exposing `predict(features)`.

  Returns
  -------
  Whatever `model.predict` returns for the one-row input.
  """
  # The former `model = None` / `vectorizer = None` lines only rebound local
  # names right before returning; they had no effect and were removed.
  features = vectorizer.transform([text])
  return model.predict(features)

# Machine Learning Prediction Function
def predict_text_validity(text, vectorizer, model, invalid_regex, replacement):
    """Predict the status of `text`; when the prediction is falsy, clean it and predict again.

    Parameters
    ----------
    text : input string to check.
    vectorizer, model : passed through to `predict_text`.
    invalid_regex, replacement : passed through to `cleaning_text_process`
        when the first prediction is falsy.

    Returns
    -------
    list
        [input text, prediction for the input, output text, prediction for the output].
        When the first prediction is truthy the output text is the input itself.
    """
    status = predict_text(text, vectorizer, model)[0]
    if status:
        # Nothing is cleaned on this branch, so the output equals the input and
        # its prediction is reused instead of running the model a second time.
        return [text, status, text, status]
    # Clean once and reuse the result for both the output and the re-prediction.
    cleaned_text = cleaning_text_process(invalid_regex, text, replacement)
    return [text, status, cleaned_text, predict_text(cleaned_text, vectorizer, model)[0]]

# Prediction Function

In [110]:
# Invalid Email Prediction and Cleansing Process
email   = 'user@domain'
# The final '' argument is the replacement text: matches of the invalid-email patterns are removed.
result  = predict_text_validity(email,email_validation_vectorizer,email_validation_model, invalid_email_regex,'')
result
# Result list layout: Before, Status, After, Final Status

['user@domain', 0, 'user@domain', 0]

In [111]:
# Invalid Name Prediction and Cleansing Process
name   = 'Alice?Brown'
# The final ' ' argument is the replacement text: matches of the invalid-name patterns become a space.
result  = predict_text_validity(name, name_validation_vectorizer,name_validation_model, invalid_name_regex,' ')
result
# Result list layout: Before, Status, After, Final Status

['Alice?Brown', 0, 'Alice Brown', 1]

# Test Using Excel Data

In [112]:
# Load Test Excel
# Each column of this sheet is validated further down with the entry of the same name in `validation_models`.
testing = pd.read_excel('testing.xlsx')
testing

Unnamed: 0,name,email
0,John-Doe,john.doe@example
1,Jane Smith .Jr,user@domain
2,Kirsya Mars1a,invalid-email.com
3,Kaido Ren,123@domain.com
4,Bob Johnson,user@domain..com
5,Alice?Brown,alice.brown@example.com
6,Robert&Miller,user123@domain.net
7,Emma Taylor,first.last@sub.domain.co.uk
8,William Clark,user_name123@company.io
9,Sarah 12Wilson,user+tag@email-provider.com


In [113]:
column_names = testing.columns

# Parallel result lists, reset on every run of this cell; one entry per validated value.
input_data = []
status = []
output = []
final_status = []
meta = []

# Dictionary: column name -> (vectorizer, model, invalid-pattern regex, replacement text)
validation_models = {
    'email': (email_validation_vectorizer, email_validation_model, invalid_email_regex, ''),
    'name': (name_validation_vectorizer, name_validation_model, invalid_name_regex, ' ')
}

# The loop variable is `column_name` rather than `name`, so the `name` string
# defined in the earlier demo cell is not overwritten by this loop.
for column_name in column_names:
    # The settings are the same for every row of a column, so look them up once
    # per column. A column without an entry raises KeyError here.
    vectorizer, model, regex, replacement = validation_models[column_name]
    data = testing[column_name]
    for text in data:
        result = predict_text_validity(text, vectorizer, model, regex, replacement)
        input_data.append(result[0])
        status.append(result[1])
        output.append(result[2])
        final_status.append(result[3])
        meta.append(column_name)


In [114]:
# Create DataFrame for results
# The five lists were filled in step by the validation loop, so row i of the
# frame holds the input, its column name, and the before/after predictions.
result_df = pd.DataFrame({
    'Input': input_data,
    'Meta' : meta,
    'Status': status,
    'Output': output,
    'Final Status': final_status
})

result_df

Unnamed: 0,Input,Meta,Status,Output,Final Status
0,John-Doe,name,0,John Doe,1
1,Jane Smith .Jr,name,0,Jane Smith Jr,1
2,Kirsya Mars1a,name,0,Kirsya Mars a,1
3,Kaido Ren,name,1,Kaido Ren,1
4,Bob Johnson,name,1,Bob Johnson,1
5,Alice?Brown,name,0,Alice Brown,1
6,Robert&Miller,name,0,Robert Miller,1
7,Emma Taylor,name,1,Emma Taylor,1
8,William Clark,name,1,William Clark,1
9,Sarah 12Wilson,name,0,Sarah Wilson,1
