## Read Data from CSV and Calculate DQI

**Description**: Read data from a CSV file, identify errors as missing values, and calculate the DQI.

In [2]:
# Write your code from here
import pandas as pd

# Function to calculate DQI for individual rows
def calculate_row_dqi(row, completeness_weight=0.4, accuracy_weight=0.3, timeliness_weight=0.3):
    # Define dimensions based on available columns in the dataset (for example, 'completeness', 'accuracy', 'timeliness')
    completeness_score = row.get('completeness', 0)
    accuracy_score = row.get('accuracy', 0)
    timeliness_score = row.get('timeliness', 0)
    
    # Handle missing data: Missing values are treated as errors (score = 0)
    if pd.isna(completeness_score): completeness_score = 0
    if pd.isna(accuracy_score): accuracy_score = 0
    if pd.isna(timeliness_score): timeliness_score = 0
    
    # Calculate the overall DQI for this row
    overall_dqi = (completeness_score * completeness_weight) + \
                  (accuracy_score * accuracy_weight) + \
                  (timeliness_score * timeliness_weight)
    
    return overall_dqi

# Function to calculate the DQI for the entire dataset
def calculate_dataset_dqi(file_path):
    # Read CSV into DataFrame
    df = pd.read_csv(file_path)
    
    # Initialize a list to store individual row DQI scores
    dqi_scores = []
    
    # Iterate over each row in the dataframe and calculate DQI
    for index, row in df.iterrows():
        dqi_scores.append(calculate_row_dqi(row))
    
    # Calculate the average DQI for the entire dataset
    overall_dqi = sum(dqi_scores) / len(dqi_scores) if len(dqi_scores) > 0 else 0
    
    return round(overall_dqi, 1)

# Example usage
file_path = 'your_dataset.csv'  # Replace with the path to your CSV file
overall_dqi = calculate_dataset_dqi(file_path)
print("Overall Data Quality Index (DQI) for the dataset:", overall_dqi)


FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'

### Visualize Basic DQI with Bar Plot

**Description**: Create a bar plot for DQI and errors in a dataset.

In [None]:
# Write your code from here
import pandas as pd
import matplotlib.pyplot as plt

# Function to calculate DQI for individual rows (same as before)
def calculate_row_dqi(row, completeness_weight=0.4, accuracy_weight=0.3, timeliness_weight=0.3):
    completeness_score = row.get('completeness', 0)
    accuracy_score = row.get('accuracy', 0)
    timeliness_score = row.get('timeliness', 0)
    
    # Handle missing data: Missing values are treated as errors (score = 0)
    if pd.isna(completeness_score): completeness_score = 0
    if pd.isna(accuracy_score): accuracy_score = 0
    if pd.isna(timeliness_score): timeliness_score = 0
    
    # Calculate the overall DQI for this row
    overall_dqi = (completeness_score * completeness_weight) + \
                  (accuracy_score * accuracy_weight) + \
                  (timeliness_score * timeliness_weight)
    
    return overall_dqi

# Function to count errors in each row (missing values)
def count_errors(row):
    return row.isna().sum()

# Function to read the dataset and visualize DQI and errors
def visualize_dqi_and_errors(file_path):
    # Read CSV into DataFrame
    df = pd.read_csv(file_path)
    
    # Initialize lists to store DQI values and error counts for each row
    dqi_scores = []
    error_counts = []
    
    # Iterate over each row in the dataframe
    for index, row in df.iterrows():
        dqi_scores.append(calculate_row_dqi(row))
        error_counts.append(count_errors(row))
    
    # Create a DataFrame for visualization
    results_df = pd.DataFrame({'DQI': dqi_scores, 'Errors': error_counts})
    
    # Plotting the DQI and Errors as bar plots
    fig, ax = plt.subplots(figsize=(10, 6))
    
    # Bar plot for DQI
    ax.bar(results_df.index - 0.2, results_df['DQI'], width=0.4, label='DQI', color='skyblue')
    
    # Bar plot for Errors
    ax.bar(results_df.index + 0.2, results_df['Errors'], width=0.4, label='Errors', color='orange')
    
    # Labeling the plot
    ax.set_xlabel('Row Index')
    ax.set_ylabel('Scores / Errors')
    ax.set_title('DQI and Errors in Dataset')
    ax.legend()
    
    # Show the plot
    plt.tight_layout()
    plt.show()

# Example usage
file_path = 'your_dataset.csv'  # Replace with the path to your CSV file
visualize_dqi_and_errors(file_path)
