In [29]:
import pandas as pd

In [30]:
# Locations of the two exports being compared: the "new env" workbook and the
# original workbook, both under the same project folder.
# NOTE(review): ops_plan_proj_folder_location is not defined in the cells shown
# here; confirm an earlier cell assigns it before this one runs.
file_path1 = f"{ops_plan_proj_folder_location}/Todays Ops Plan Data - new env.xlsm"
file_path2 = f"{ops_plan_proj_folder_location}/Todays Ops Plan Data.xlsm"

# Each workbook keeps its data on a differently named sheet.
new_env_df = pd.read_excel(io=file_path1, sheet_name="SR_Full")
old_env_df = pd.read_excel(io=file_path2, sheet_name="RR_Full")

In [None]:
# Note: Column names below are arbitrary placeholders to demonstrate data cleaning process
# while maintaining privacy of actual business data

# Columns removed from the old-environment frame because they are not relevant
# for the analysis.
old_env_cols_dropped = [
    'Market Segment',
    'Performance Score',
    'Sub Category',
    'Task Priority',
    'Request Number',
    'Request Creator',
    'Submission Date',
]

# Columns removed from the new-environment frame for the same reason.
new_env_cols_dropped = [
    'Ticket ID',
    'Submitter Name',
    'Entry Date',
    'Team Reference',
]

# (old-system label, new-system label) pairs for columns that carry different
# names in the two systems. Keeping them as pairs makes the correspondence
# explicit; the two parallel lists used by later cells are derived from it.
_old_to_new_col_names = [
    ('Priority Score', 'Task Priority Total'),
    ('Group Name', 'Team Name'),
    ('Service Category', 'Primary Service: Type'),
    ('Support Staff', 'Support Staff: Full Name'),
    ('Request Reference', 'Ticket Name'),
    ('Department Lead', 'Department Manager: Full Name'),
    ('Status', 'Current Status'),
    ('Service Group', 'Service Group Category'),
    ('Team Lead', 'Team Lead: Full Name'),
    ('Business Unit', 'Department Category'),
    ('Active Project', 'In Progress'),
    ('OP$`Request Reference`', 'OP$`Ticket Name`'),
    ('Division', 'Business Division'),
]

# columns to be renamed for consistency between old and new systems
old_env_cols_renamed = [old_label for old_label, _ in _old_to_new_col_names]

# new names for the columns being renamed
old_env_cols_renamed_to = [new_label for _, new_label in _old_to_new_col_names]

In [32]:
# Trim each frame down to the columns that take part in the comparison.
# drop() returns a new frame, so the frames read from Excel stay untouched.
old_env_df_dropped = old_env_df.drop(columns=old_env_cols_dropped)
new_env_df_dropped = new_env_df.drop(columns=new_env_cols_dropped)

In [None]:
# Column labels of the two trimmed frames. Both sides use the *_dropped frames:
# comparing against the untrimmed new frame would report the columns that were
# deliberately dropped from it as spurious differences.
old_env_cols = old_env_df_dropped.columns
new_env_cols = new_env_df_dropped.columns

# what columns are in old that are not in new
old_env_cols_not_in_new = [col for col in old_env_cols if col not in new_env_cols]
print(old_env_cols_not_in_new)

# reverse it: what columns are in new that are not in old
new_env_cols_not_in_old = [col for col in new_env_cols if col not in old_env_cols]
print(new_env_cols_not_in_old)

In [34]:
# Number of columns left in each trimmed frame (old first, then new).
for trimmed_frame in (old_env_df_dropped, new_env_df_dropped):
    print(trimmed_frame.shape[1])

46
47


In [None]:
# Give the old frame the new system's labels for every column listed in
# old_env_cols_renamed (paired positionally with old_env_cols_renamed_to).
# Columns not named in the mapping keep their current label.
old_to_new_labels = dict(zip(old_env_cols_renamed, old_env_cols_renamed_to))
old_env_df_renamed = old_env_df_dropped.rename(columns=old_to_new_labels)
old_env_df_renamed.columns

In [36]:
# Count of distinct 'Ticket Name' values in each frame; dropna=False keeps a
# missing value (if any) in the count, matching len(series.unique()).
print(old_env_df_renamed['Ticket Name'].nunique(dropna=False))
print(new_env_df_dropped['Ticket Name'].nunique(dropna=False))

271
264


In [None]:
# Outer-join the renamed old frame to the trimmed new frame, matching the old
# frame's 'Ticket Name' against the new frame's 'Source ID: Request Id'.
# Columns present in both frames come back with pandas' default '_x' (old) and
# '_y' (new) suffixes.
env_joined = old_env_df_renamed.merge(
    new_env_df_dropped,
    how='outer',
    left_on='Ticket Name',
    right_on='Source ID: Request Id',
)
env_joined

In [None]:
# Put 'Source ID: Request Id' first and follow it with every other column in
# alphabetical order, so each '_x' column sits next to its '_y' partner.
leading_col = 'Source ID: Request Id'
alphabetical_cols = sorted(env_joined.columns)
trailing_cols = [name for name in alphabetical_cols if name != leading_col]
env_joined = env_joined[[leading_col] + trailing_cols]
env_joined

In [39]:
# Keep only the joined rows that have a 'Source ID: Request Id' value; rows
# where that column is NaN are discarded.
env_joined_no_nas = env_joined.dropna(subset=['Source ID: Request Id'])

In [None]:
# Show the column labels of the filtered join (as the cell's output).
env_joined_no_nas.columns

In [41]:
# Requires the third-party fuzzywuzzy package; install it once with: %pip install fuzzywuzzy

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Build env_joined_diffs: one row per (request, column) whose old ('_x') and
# new ('_y') values differ, holding the request id, the column's base name and
# both values.
#
# Rules applied to each pair of values:
#   * both missing (NaN/None/NaT)          -> not a difference
#   * equal                                -> not a difference
#   * both strings, fuzz.ratio >= threshold -> treated as the same text
#   * anything else (including a string on one side and a missing or
#     non-string value on the other)       -> recorded as a difference

# Two strings scoring at least this much on fuzz.ratio (0-100) count as equal.
similarity_threshold = 85
key_col = 'Source ID: Request Id'
diff_columns = [key_col, 'Column Name', 'Old Value', 'New Value']

# Base names of the columns that exist on both sides of the merge. Matching on
# the suffix (rather than searching for '_x'/'_y' anywhere in the label) skips
# one-sided columns instead of failing on a missing partner column.
joined_labels = set(env_joined_no_nas.columns)
shared_cols = [
    label[:-2]
    for label in env_joined_no_nas.columns
    if label.endswith('_x') and label[:-2] + '_y' in joined_labels
]

# Collect plain records and build the frame once at the end; concatenating a
# one-row frame per difference is quadratic in the number of differences.
diff_records = []
for col in shared_cols:
    value_triples = zip(
        env_joined_no_nas[key_col],
        env_joined_no_nas[col + '_x'],
        env_joined_no_nas[col + '_y'],
    )
    for request_id, old_value, new_value in value_triples:
        # NaN != NaN evaluates to True, so test for "missing on both sides" first
        if pd.isna(old_value) and pd.isna(new_value):
            continue
        if old_value == new_value:
            continue
        if isinstance(old_value, str) and isinstance(new_value, str):
            similarity_score = fuzz.ratio(old_value, new_value)
            if similarity_score >= similarity_threshold:
                continue
            print(request_id, col, old_value, new_value, similarity_score)
        diff_records.append({
            key_col: request_id,
            'Column Name': col,
            'Old Value': old_value,
            'New Value': new_value,
        })

# Passing columns= keeps the expected layout even when nothing differs.
env_joined_diffs = pd.DataFrame(diff_records, columns=diff_columns)



In [None]:
# Discard the rows in which "Old Value" and "New Value" are both NaN; a row
# survives as long as at least one of the two holds a value.
env_joined_diffs = env_joined_diffs.dropna(subset=['Old Value', 'New Value'], how='all')

env_joined_diffs

In [45]:
# Write the env_joined_diffs table to an Excel workbook in the project folder.
diffs_workbook_path = f"{ops_plan_proj_folder_location}/env_joined_diffs_11_1.xlsx"
env_joined_diffs.to_excel(diffs_workbook_path)

In [59]:
# Rename columns in new_env_df to match old_env_df: the inverse of the
# old -> new mapping (new-system label -> old-system label).
column_name_mapping = dict(zip(old_env_cols_renamed_to, old_env_cols_renamed))

# Reassign instead of mutating in place so the relabelling is explicit.
new_env_df_dropped = new_env_df_dropped.rename(columns=column_name_mapping)

# Set the primary key as the index for both DataFrames. The guard makes the
# cell safe to re-run: once 'Request Reference' has become the index it is no
# longer a column, and an unconditional set_index would raise KeyError.
primary_key = 'Request Reference'
if old_env_df_dropped.index.name != primary_key:
    old_env_df_dropped = old_env_df_dropped.set_index(primary_key)
if new_env_df_dropped.index.name != primary_key:
    new_env_df_dropped = new_env_df_dropped.set_index(primary_key)

# Compare the two DataFrames. equals() is strict: shape, labels, their order
# and the column dtypes all have to match, not just the cell values.
are_equal = old_env_df_dropped.equals(new_env_df_dropped)

# Check if the DataFrames are equal
if are_equal:
    print("The DataFrames are identical.")
else:
    print("The DataFrames are not identical.")


The DataFrames are not identical.


In [None]:
# Column counts of the two keyed frames (the original notes record 44 for each).
print(old_env_df_dropped.shape[1])
print(new_env_df_dropped.shape[1])

# how are the two dataframes labeled differently: column labels that appear in
# the old frame but not in the new one
old_only_labels = old_env_df_dropped.columns.difference(new_env_df_dropped.columns)
print(old_only_labels)

# compare the index of the two dataframes: index values present in the old
# frame but absent from the new one (shown as the cell's output)
old_env_df_dropped.index.difference(new_env_df_dropped.index)

In [None]:
# Rows whose index value exists in only one of the two frames. A notebook cell
# renders only its last bare expression, so both results are passed to
# display() explicitly; otherwise the old-frame result is computed and thrown
# away.

# rows of the old dataframe whose index value is missing from the new one
old_only_rows = old_env_df_dropped[~old_env_df_dropped.index.isin(new_env_df_dropped.index)]
# rows of the new dataframe whose index value is missing from the old one
new_only_rows = new_env_df_dropped[~new_env_df_dropped.index.isin(old_env_df_dropped.index)]

display(old_only_rows)
display(new_only_rows)

