In [1]:
import pandas as pd
import os

In [2]:
# This function creates the path to the main Excel file of this project
# Uses os library methods to ensure the file path works in both local and GitHub CI/CD environments
def create_file_path(file_relative_path):
    # Store the xlsx file dir into a var
    dir = '../data'
    
    # Iterate under dir data to have all files
    for filename in os.listdir(dir):
        # Condition to confirm which file is == year_make_model_df.xlsx
        if filename.endswith(file_relative_path):
            # Join the dir + filename to create the path to year_make_model_df.xlsx file  
            xlsx_file_fullpath = os.path.join(dir, filename)
            # Print the result
            print(xlsx_file_fullpath)
    return xlsx_file_fullpath

In [3]:
# Read excel 
df = pd.read_excel(create_file_path('year_make_model_df.xlsx'))

../data\year_make_model_df.xlsx


In [4]:
# Create df
df_vw = df[df["Make"] == "Volkswagen"]

In [5]:
# Check how the df is
df_vw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 135 entries, 644 to 3419
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Year            135 non-null    int64 
 1   Make            135 non-null    object
 2   Model           135 non-null    object
 3   Security        133 non-null    object
 4   ParameterReset  0 non-null      object
dtypes: int64(1), object(4)
memory usage: 6.3+ KB


In [6]:
# Function to create list with unique items from given column
def data_from_column(df, column):
    # List to append the items
    data_column_list =[]

    # Iterate over given column
    for data in df[column]:
        # Append the items from the given column
        data_column_list.append(data)

    # Return a set with unique items from given column
    return set(data_column_list)

In [7]:
# Call the function to create a set with unique items from column model
data_from_column(df_vw, "Model")

{'CC/Passat CC',
 'Cabrio',
 'City Golf Mk4.5',
 'City Jetta Mk4.5',
 'Eos',
 'Euro Van',
 'GTI',
 'Golf A5',
 'Golf City Mk4.5',
 'Golf Mk4',
 'Jetta (NF)',
 'Jetta A5',
 'Jetta City Mk4.5',
 'Jetta Mk4',
 'New Beetle',
 'Passat “NMS”',
 'Phaeton',
 'R32',
 'Rabbit',
 'Routan',
 'Tiguan',
 'Touareg'}

CHANGES NEEDED:
- Remove the charactere "CC/" from the model "CC/Passat CC"

In [18]:
# Remove the charactere "CC/" from the model "CC/Passat CC"
df_vw['Model'] = df_vw['Model'].str.replace('CC/Passat CC', 'Passat CC', regex=True) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_vw['Model'] = df_vw['Model'].str.replace('CC/Passat CC', 'Passat CC', regex=True)


In [19]:
# Call the function to create a set with unique items from column Security
data_from_column(df_vw, "Security")

{'Gen II',
 'Gen III',
 'Gen III\nor IV',
 'Gen III or IV',
 'Gen IV',
 'Gen IV/ Gen\nV**',
 'Gen V',
 'WIN',
 nan}

CHANGES NEEDED:
- Remove/replace the charactere '\n' for space ' '
- Fill out null values.

In [20]:
# Remove/replace the charactere '\n' for space ' '
df_vw['Security'] = df_vw['Security'].str.replace('\n', ' ', regex=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_vw['Security'] = df_vw['Security'].str.replace('\n', ' ', regex=False)


In [21]:
# Fill out null values
df_vw['Security'].fillna('Information not available', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_vw['Security'].fillna('Information not available', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_vw['Security'].fillna('Information not available', inplace=True)


In [22]:
# Drop the column ParameterReset
df_vw_dropped_column = df_vw.drop(columns=['ParameterReset'])

In [23]:
# Reset index
df_vw_reset_index = df_vw_dropped_column.reset_index(drop=True)

In [None]:
# Export the to .csv file
# df_vw_reset_index.to_csv('C:\\Language_Projects\\Language_Projects\\Python\\Flagship_1\\vehicle_security_system_data_cleaning\\data\\df_vw.csv', index=False)