In [1]:
import pandas as pd
import os

In [2]:
# This function creates the path to the main Excel file of this project
# Uses os library methods to ensure the file path works in both local and GitHub CI/CD environments
def create_file_path(file_relative_path):
    # Store the xlsx file dir into a var
    dir = '../data'
    
    # Iterate under dir data to have all files
    for filename in os.listdir(dir):
        # Condition to confirm which file is == year_make_model_df.xlsx
        if filename.endswith(file_relative_path):
            # Join the dir + filename to create the path to year_make_model_df.xlsx file  
            xlsx_file_fullpath = os.path.join(dir, filename)
            # Print the result
            print(xlsx_file_fullpath)
    return xlsx_file_fullpath

In [3]:
# Read excel 
df = pd.read_excel(create_file_path('year_make_model_df.xlsx'))

../data\year_make_model_df.xlsx


In [4]:
# Create df
df_lexus = df[df["Make"] == "Lexus"]

In [5]:
# Check if there are null values
df_lexus.info()

<class 'pandas.core.frame.DataFrame'>
Index: 134 entries, 196 to 3331
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Year            134 non-null    int64 
 1   Make            134 non-null    object
 2   Model           134 non-null    object
 3   Security        125 non-null    object
 4   ParameterReset  0 non-null      object
dtypes: int64(1), object(4)
memory usage: 6.3+ KB


In [76]:
# Function to create list with unique items from given column
def data_from_column(df, column):
    # List to append the items
    data_column_list =[]

    # Iterate over given column
    for data in df[column]:
        # Append the items from the given column
        data_column_list.append(data)

    # Return a set with unique items from given column
    return set(data_column_list)

In [77]:
# Call the function to create a set with unique items from column model
data_from_column(df_lexus, "Model")

{'ES', 'GS', 'GX', 'IS', 'IS/ISF', 'ISF', 'LFA', 'LS', 'LX', 'RX', 'SC'}

The list above shows models 'IS/ISF' in the same row for some reason, but they should be splitted in different lines.

In [78]:
# Separare/split models 'IS/ISF' in  columns
df_lexus = df_lexus.assign(Model=df_lexus['Model'].str.split('/')).explode('Model')

In [79]:
# Call the function to create a set with the models again after changes
data_from_column(df_lexus, "Model")

{'ES', 'GS', 'GX', 'IS', 'ISF', 'LFA', 'LS', 'LX', 'RX', 'SC'}

In [80]:
# Call the function to create a set with unique items from column security
data_from_column(df_lexus, "Security")

{'Smart\nKey', 'Smart\nKey Std', 'Smart Key', 'Smart Key Std', 'Std', nan}

### CHANGES NEEDED:
- Fill the rows with null values 'nan'
- Remove the caractere '\n'

In [81]:
# Fill the rows with null values from column 'Security'
df_lexus['Security'].fillna('Information not available', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_lexus['Security'].fillna('Information not available', inplace=True)


In [82]:
# Confirm if the rows w/ null values were filled out
df_lexus.info()

<class 'pandas.core.frame.DataFrame'>
Index: 144 entries, 196 to 3331
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Year            144 non-null    int64 
 1   Make            144 non-null    object
 2   Model           144 non-null    object
 3   Security        144 non-null    object
 4   ParameterReset  0 non-null      object
dtypes: int64(1), object(4)
memory usage: 6.8+ KB


In [83]:
# Remove the caractere '\n'
df_lexus['Security'] = df_lexus['Security'].str.replace("\n", " ", regex=False)

In [84]:
# Call the function to create a set to confirm the changes
data_from_column(df_lexus, "Security")

{'Information not available', 'Smart Key', 'Smart Key Std', 'Std'}

In [85]:
# Drop columns 'ParameterReset' and 'index' w/ the old indexes
df_lexus_updated = df_lexus.drop(['ParameterReset'], axis=1)

In [86]:
# Reset the index
df_lexus_final_ver = df_lexus_updated.reset_index(drop=True)

In [87]:
# Function to check if there are duplicated values
def check_duplicated_values(df):
    # Create a column with bool values showing whether or not there are duplicated values
    df['Dups'] = df.duplicated()
    # Return bool values counted
    return df['Dups'].value_counts()  

In [88]:
# Call the funtion to check duplicated values
check_duplicated_values(df_lexus_final_ver)

Dups
False    140
True       4
Name: count, dtype: int64

In [91]:
# Remove duplicated years
df_lexus_final_ver.drop_duplicates(subset=['Year', 'Model'], inplace=True)

In [None]:
# Check the info
df_lexus_final_ver.info()

<class 'pandas.core.frame.DataFrame'>
Index: 140 entries, 0 to 143
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Year      140 non-null    int64 
 1   Make      140 non-null    object
 2   Model     140 non-null    object
 3   Security  140 non-null    object
 4   Dups      140 non-null    bool  
dtypes: bool(1), int64(1), object(3)
memory usage: 5.6+ KB


In [None]:
# Export the to .csv file
# df_lexus_final_ver.to_csv('C:\\Language_Projects\\Language_Projects\\Python\\Flagship_1\\vehicle_security_system_data_cleaning\\data\\df_lexus.csv', index=False)