In [1]:
import pandas as pd
import os

In [2]:
# This function creates the path to the main Excel file of this project
# Uses os library methods to ensure the file path works in both local and GitHub CI/CD environments
def create_file_path(file_relative_path):
    # Store the xlsx file dir into a var
    dir = '../data'
    
    # Iterate under dir data to have all files
    for filename in os.listdir(dir):
        # Condition to confirm which file is == year_make_model_df.xlsx
        if filename.endswith(file_relative_path):
            # Join the dir + filename to create the path to year_make_model_df.xlsx file  
            xlsx_file_fullpath = os.path.join(dir, filename)
            # Print the result
            print(xlsx_file_fullpath)
    return xlsx_file_fullpath

In [3]:
# Read excel 
df = pd.read_excel(create_file_path('year_make_model_df.xlsx'))

../data\year_make_model_df.xlsx


In [4]:
# Fuction to create df
def create_df(df, make):
    df_new = df[df['Make'] == make]
    return df_new

In [5]:
# Function to create list with unique items from given column
def data_from_column(df, column):
    # List to append the items
    data_column_list =[]

    # Iterate over given column
    for data in df[column]:
        # Append the items from the given column
        data_column_list.append(data)

    # Return a set with unique items from given column
    return set(data_column_list)

In [6]:
# Create a df to Lincoln make
df_lincoln = create_df(df, 'Lincoln')

In [6]:
# Check the string in the model column to confirm if changes are needed
data_from_column(df_lincoln, 'Model')

{'Aviator',
 'Blackwood',
 'Continental',
 'LS',
 'MKS',
 'MKT',
 'MKX',
 'MKZ',
 'MKZ (Push to Start)',
 'Mark LT',
 'Mark VIII',
 'Navigator',
 'Town Car',
 'Zephyr'}

### No changes needed in the column model

In [7]:
# Check the string in the security column to confirm if changes are needed
data_from_column(df_lincoln, 'Security')

{'Keyed Ignition: PATS Type C (Instrument Cluster)\nPush To Start: PATS Type C (Remote Function Actuator / Keyless Vehicle Module)',
 'New in 2009',
 'New in 2010',
 'PATS Type A (Stand Alone PATS Module)',
 'PATS Type B (Body Control Module)',
 'PATS Type C (Instrument Cluster)',
 'PATS Type C (Remote Function Actuator / Keyless Vehicle Module)',
 'PATS Type D (Steering Column Ignition Lock Module)',
 'PATS Type E (Powertrain Control Module)'}

### Changes needed:
- Keyed Ignition and Push to Start are configurations that differentiate the anti-theft system. Therefore, the rows containing these two strings will be separated using '\n' as the delimiter, creating two lines and relocating the strings 'Keyed Ignition' and 'Push to Start' to the 'Model' column, where they will be combined with the model name

- This is the result on how it should be after updatings:  
Year: 2000  
Make: Ford  
Model: F-XX (Keyed Ignition) OR F-XX (Push To Start)     
Security: PATS Type G (Instrument Cluster)

In [8]:
# Fuction to create df based on a given string
def create_df_from_str(df, column, string):
    df_new = df[df[column] == string]
    return df_new

In [9]:
# Function to explode in different lines based on a given pattern
def explode_lines(df, column, pattern):
    # Make a copy of the original df
    df_exploded = df.copy()
    # Split the string in 2 based on the delimiter given and 
    # The result was given inside a list 
    df_exploded[column] = df_exploded[column].str.split(pattern)
    # Explode in 2 lines based on the list items quantity 
    # ['item1', 'item2'], exploded in 2 lines
    df_exploded = df_exploded.explode(column)
    # Return the df exploded
    return df_exploded

In [10]:
# Function to move the strings before char (:) from column security to column model
def move_string_to_column_model(df):
    # Function to explode in separate lines strings from security column
    df_exploded = explode_lines(df, 'Security', '\n')
    # Reset the index
    df_resetted_index = df_exploded.reset_index(drop=True)
    # Split the string in the security column using the (:) as the delimeter
    df_resetted_index['Security'] = df_resetted_index['Security'].str.split(":")

    # Loop to iterate under the df indexes
    for idx in df_resetted_index.index:
        # Method to move the manufactured date to the model column
        df_resetted_index['Model'][idx] = f'{df_resetted_index['Model'][idx]} ({df_resetted_index['Security'][idx][0]})'
        # Remove manufactured date from security column
        df_resetted_index['Security'][idx].pop(0)

    # Extracts the string from list under security column  
    df_security_list = df_resetted_index['Security'].str.join(' ')

    # Update the df
    df_resetted_index['Security'] = df_security_list

    # Return a df with the strings before (:) moved to column model
    return df_resetted_index

In [11]:
# Call the function to create a df with the string should be changed under column security
df_lincoln_keyed_push_to_start = create_df_from_str(df_lincoln, 'Security', 'Keyed Ignition: PATS Type C (Instrument Cluster)\nPush To Start: PATS Type C (Remote Function Actuator / Keyless Vehicle Module)')

In [13]:
# Create a list with the indexes, they should be dropped from the df
lincoln_keyed_push_to_start_indexes_list = list(df_lincoln_keyed_push_to_start.index)

In [14]:
# Call the function to move the strings from column security to column model from MODEL: MKS
df_lincoln_mks_moved_strings = move_string_to_column_model(df_lincoln_keyed_push_to_start)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_resetted_index['Model'][idx] = f'{df_resetted_index['Model'][idx]} ({df_resetted_index['Security'][idx][0]})'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gui

In [15]:
# Print the df after moving the strings
df_lincoln_mks_moved_strings

Unnamed: 0,Year,Make,Model,Security,ParameterReset
0,2009,Lincoln,MKS (Keyed Ignition),PATS Type C (Instrument Cluster),Parameter Reset Required
1,2009,Lincoln,MKS (Push To Start),PATS Type C (Remote Function Actuator / Keyle...,Parameter Reset Required
2,2010,Lincoln,MKS (Keyed Ignition),PATS Type C (Instrument Cluster),Parameter Reset Required
3,2010,Lincoln,MKS (Push To Start),PATS Type C (Remote Function Actuator / Keyle...,Parameter Reset Required
4,2011,Lincoln,MKS (Keyed Ignition),PATS Type C (Instrument Cluster),Parameter Reset Required
5,2011,Lincoln,MKS (Push To Start),PATS Type C (Remote Function Actuator / Keyle...,Parameter Reset Required
6,2012,Lincoln,MKS (Keyed Ignition),PATS Type C (Instrument Cluster),Parameter Reset Required
7,2012,Lincoln,MKS (Push To Start),PATS Type C (Remote Function Actuator / Keyle...,Parameter Reset Required


In [41]:
# Concatenate all the keyed and push to start df to the main df
df_lincoln_mks_moved_strings_concated = pd.concat([df_lincoln, df_lincoln_mks_moved_strings])

In [42]:
# Remove the rows with the old Keyed Ignition and push to start string
df_lincoln_mks_dropped_old_string = df_lincoln_mks_moved_strings_concated.drop(index=lincoln_keyed_push_to_start_indexes_list)

In [43]:
# Print the strings from column model to confirm if they are updated
data_from_column(df_lincoln_mks_dropped_old_string, 'Model')

{'Aviator',
 'Blackwood',
 'Continental',
 'LS',
 'MKS',
 'MKS (Keyed Ignition)',
 'MKS (Push To Start)',
 'MKT',
 'MKX',
 'MKZ',
 'MKZ (Push to Start)',
 'Mark LT',
 'Mark VIII',
 'Navigator',
 'Town Car',
 'Zephyr'}

In [44]:
# Print the strings from columns security to confirm if they are updated
data_from_column(df_lincoln_mks_dropped_old_string, 'Security')

{' PATS Type C (Instrument Cluster)',
 ' PATS Type C (Remote Function Actuator / Keyless Vehicle Module)',
 'New in 2009',
 'New in 2010',
 'PATS Type A (Stand Alone PATS Module)',
 'PATS Type B (Body Control Module)',
 'PATS Type C (Instrument Cluster)',
 'PATS Type C (Remote Function Actuator / Keyless Vehicle Module)',
 'PATS Type D (Steering Column Ignition Lock Module)',
 'PATS Type E (Powertrain Control Module)'}

### PATS Type and Anti-Theft Module Location separation:
PATS (Passive Anti-Theft System) Type and Anti-Theft Module Location are currently in the same column. However, the PATS Type defines which key learning procedure should be performed. To make it easier to indicate the correct procedure, it is necessary to separate this information. See below how it is currently and how it should look after the update.

- Before changes:  
Year: 2000  
Make: Lincoln  
Model: L-XX (Built February 1st or earlier)   
Security: PATS Type G (Instrument Cluster)  

- After changes:  
Year: 2000  
Make: Lincoln  
Model: L-XX  
PATS Type: PATS Type G  
Anti-Theft Module Location: Instrument Cluster

In [45]:
# This function will separate the columns as described above
def split_pats_type_from_module_location(df):

    # Split the security string considering the '(' as the delimiter
    df['Security'] = df['Security'].str.split('(')

    # Loop to iterate under the df indexes
    for idx in df.index:
        # Method to add a new column to the df, pulling the PATS Type from security column 
        df.at[idx, 'PATS Type'] = f'{df['Security'][idx][0]}'
        # Remove the PATS Type from the security column
        df['Security'][idx].pop(0)

    # Extracts the string from list under security column  
    df_anti_theft_module_location_list = df['Security'].str.join(' ')

    # Update the df
    df['Security'] = df_anti_theft_module_location_list

    # Remove char ')' from column security
    df['Security'] = df['Security'].str.replace(')', '', regex=False)

    # Rename column security to 'Anti-Theft Module Location'
    df = df.rename(columns={'Security': 'Anti-Theft Module Location'})

    # Reorder columns 
    df_updated_reordered_columns = df[['Year', 'Make', 'Model', 'Anti-Theft Module Location', 'PATS Type', 'ParameterReset']]

    # Return the df updated: columns for pats type and anti-theft module location separated
    return df_updated_reordered_columns

In [46]:
# Print the head to confirm the changes
df_lincoln_reordered_columns = split_pats_type_from_module_location(df_lincoln_mks_dropped_old_string)

### Check if the new column is correct has the correct string

In [47]:
data_from_column(df_lincoln_reordered_columns, 'Anti-Theft Module Location')

{'',
 'Body Control Module',
 'Instrument Cluster',
 'Powertrain Control Module',
 'Remote Function Actuator / Keyless Vehicle Module',
 'Stand Alone PATS Module',
 'Steering Column Ignition Lock Module'}

In [48]:
data_from_column(df_lincoln_reordered_columns, 'PATS Type')

{' PATS Type C ',
 'New in 2009',
 'New in 2010',
 'PATS Type A ',
 'PATS Type B ',
 'PATS Type C ',
 'PATS Type D ',
 'PATS Type E '}

In [49]:
# Check if column parameter reset needs to be change
data_from_column(df_lincoln_reordered_columns, 'ParameterReset')

{'Parameter Reset Not Required', 'Parameter Reset Required', nan}

In [51]:
# Check the rows with empty values
df_lincoln_empty_rows = df_lincoln_reordered_columns[df_lincoln_reordered_columns['Anti-Theft Module Location'] == '']

In [52]:
# Print the df with empty values
df_lincoln_empty_rows

Unnamed: 0,Year,Make,Model,Anti-Theft Module Location,PATS Type,ParameterReset
2135,2008,Lincoln,MKS,,New in 2009,
2383,2009,Lincoln,MKT,,New in 2010,


The empty cells in the columns 'Anti-Theft Module Location' and 'ParameterReset' refer to the modules 2008 Lincoln MKS and 2009 Lincoln MKT, which were mistakenly included in the original DataFrame, as these models and years do not exist. This may explain the information 'New in 2009' and 'New in 2010' in the 'PATS Type' column. Therefore, these modules will be removed from the DataFrame.

In [53]:
# Drop non existent models
df_lincoln_dropped_empty_rows = df_lincoln_reordered_columns.drop(index=list(df_lincoln_empty_rows.index))

### Make sure the columns no longer have empty values and non existent models were removed

In [55]:
data_from_column(df_lincoln_dropped_empty_rows, 'Anti-Theft Module Location')

{'Body Control Module',
 'Instrument Cluster',
 'Powertrain Control Module',
 'Remote Function Actuator / Keyless Vehicle Module',
 'Stand Alone PATS Module',
 'Steering Column Ignition Lock Module'}

In [56]:
data_from_column(df_lincoln_dropped_empty_rows, 'PATS Type')

{' PATS Type C ',
 'PATS Type A ',
 'PATS Type B ',
 'PATS Type C ',
 'PATS Type D ',
 'PATS Type E '}

In [57]:
data_from_column(df_lincoln_dropped_empty_rows, 'ParameterReset')

{'Parameter Reset Not Required', 'Parameter Reset Required'}

In [None]:
# Export the to .csv file
# df_lincoln_dropped_empty_rows.to_csv('C:\\Language_Projects\\Language_Projects\\Python\\Flagship_1\\vehicle_security_system_data_cleaning\\data\\df_lincoln.csv', index=False)