In [1]:
import pandas as pd
import os

In [2]:
# This function creates the path to the main Excel file of this project
# Uses os library methods to ensure the file path works in both local and GitHub CI/CD environments
def create_file_path(file_relative_path):
    # Store the xlsx file dir into a var
    dir = '../data'
    
    # Iterate under dir data to have all files
    for filename in os.listdir(dir):
        # Condition to confirm which file is == year_make_model_df.xlsx
        if filename.endswith(file_relative_path):
            # Join the dir + filename to create the path to year_make_model_df.xlsx file  
            xlsx_file_fullpath = os.path.join(dir, filename)
            # Print the result
            print(xlsx_file_fullpath)
    return xlsx_file_fullpath

In [3]:
# Read excel
df = pd.read_excel(create_file_path("year_make_model_df.xlsx"))

../data\year_make_model_df.xlsx


In [4]:
# Create audi df
df_audi = df[df["Make"] == "Audi"]

In [5]:
# Function to create list with unique items from given column
def data_from_column(df, column):
    # List to append the items
    data_column_list =[]

    # Iterate over given column
    for data in df[column]:
        # Append the items from the given column
        data_column_list.append(data)

    # Return a set with unique items from given column
    return set(data_column_list)

In [6]:
# Check how the df is
df_audi.info()

<class 'pandas.core.frame.DataFrame'>
Index: 101 entries, 508 to 3192
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Year            101 non-null    int64 
 1   Make            101 non-null    object
 2   Model           101 non-null    object
 3   Security        97 non-null     object
 4   ParameterReset  0 non-null      object
dtypes: int64(1), object(4)
memory usage: 4.7+ KB


In [6]:
# Call the function and print the items from column models
print(data_from_column(df_audi, "Model"))

{'TT', 'S4', 'Q5', 'Q7', 'Allroad', 'A7', 'R8', 'A8/S8', 'RS4', 'A4 (all models)', 'A3', 'A6/S6 (all models)', 'A5'}


Changes needed:
- Split the models 'A6/S6 (all models)' and 'A8/S8' in 2 separated lines. A6, S6, A8 and S8 are different models.

In [7]:
# Split the models 'A6/S6 (all models)' and 'A8/S8' in 2 separated lines
df_audi_change_models = df_audi.assign(Model=df_audi['Model'].str.split('/')).explode('Model')

In [8]:
# Call the function to create the list with the models after updating
print(data_from_column(df_audi_change_models, 'Model'))

{'A8', 'S4', 'TT', 'Q7', 'Allroad', 'Q5', 'S6 (all models)', 'R8', 'RS4', 'A7', 'A4 (all models)', 'S8', 'A3', 'A5', 'A6'}


In [9]:
# As the model A6 had the string (all models), replace the string A6 for 'A6 (all models)
df_audi_change_models['Model'] = df_audi_change_models['Model'].str.replace('A6', 'A6 (all models)', regex=True)

In [10]:
# Call the function and print the items from column security
print(data_from_column(df_audi, "Security"))

{'Gen III', nan, 'Gen V', 'Gen IIIA', 'Gen IV'}


In [11]:
# No changes needed, just fill out the null values
df_audi_change_models['Security'].fillna('Information not available', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_audi_change_models['Security'].fillna('Information not available', inplace=True)


In [12]:
# Drop column ParameterReset
df_audi_drop_pr_column = df_audi_change_models.drop(columns=['ParameterReset'])

In [13]:
# Reset index
df_audi_updated = df_audi_drop_pr_column.reset_index(drop=True)

'A6/S6 (all models)' and 'A8/S8'

In [14]:
# Function to check if there are duplicated values
def check_duplicated_values(df):
    # Create a column with bool values showing whether or not there are duplicated values
    df['Dups'] = df.duplicated()
    # Return bool values counted
    return df['Dups'].value_counts()  

In [15]:
# Call the function to check dup values
check_duplicated_values(df_audi_updated)

Dups
False    127
Name: count, dtype: int64

In [None]:
# Export the to .csv file
# df_audi_updated.to_csv('C:\\Language_Projects\\Language_Projects\\Python\\Flagship_1\\vehicle_security_system_data_cleaning\\data\\df_audi.csv', index=False)