In [3]:
# Import necessary libraries
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport

ModuleNotFoundError: No module named 'ydata_profiling'

In [None]:
# Columns of interest; every other column in the CSV is skipped at parse time.
rate_fields = [
    'StateCode',
    'PlanId',
    'IndividualRate',
    'PrimarySubscriberAndThreeOrMoreDependents',
    'BusinessYear',
    'Age',
    'RatingAreaId',
    'Tobacco',
    'SourceName',
]
# Open the file as an iterator of 1000-row chunks instead of parsing it in one call.
rate_chunks = pd.read_csv(
    "/Users/muskan/Documents/SOEN 6111/Rate.csv",
    usecols=rate_fields,
    chunksize=1000,
    iterator=True,
)
# Materialise every chunk and stitch them into a single DataFrame.
rates_concat = pd.concat(list(rate_chunks))

In [None]:
# Preview the first five rows (the default for head) as a sanity check on the load.
rates_concat.head()

In [None]:
# List the column labels of the loaded DataFrame to confirm which fields were read.
rates_concat.columns

In [None]:
# Work on the loaded rates under a shorter name.
# Note: this is an alias, not a copy, so columns added to `df` also appear on `rates_concat`.
df = rates_concat

# Label used in the 'Tobacco' column for rows that cover both tobacco users and non-users.
COMBINED_TOBACCO_LABEL = 'Tobacco User/Non-Tobacco User'
# Multiplier simulating a 3% tobacco surcharge on the individual rate.
TOBACCO_SURCHARGE = 1.03

# Compare on the stripped value so the match does not depend on stray leading/trailing
# whitespace in either the data or the label (a padded literal would silently never match,
# leaving every row without the surcharge).
is_combined_tobacco = df['Tobacco'].str.strip() == COMBINED_TOBACCO_LABEL

# 'tobacco_rate' is the surcharged rate for combined-label rows and a plain copy of
# 'IndividualRate' for every other row (including rows with a missing 'Tobacco' value).
df['tobacco_rate'] = np.where(
    is_combined_tobacco,
    df['IndividualRate'] * TOBACCO_SURCHARGE,
    df['IndividualRate'],
)
# Display the first rows to verify the new column.
df.head()

In [None]:
# Recode the non-numeric 'Age' labels to numeric sentinel values, one label per pass:
#   'Family Option' -> -5   (marks family plans distinctly)
#   '0-20'          -> 10   (minors treated as a single group)
#   '65 and over'   -> 100  (seniors treated as a single group)
# Values that match none of the labels are left untouched.
age_sentinels = {
    'Family Option': -5,
    '0-20': 10,
    '65 and over': 100,
}
for age_label, sentinel in age_sentinels.items():
    df['Age'] = np.where(df['Age'] == age_label, sentinel, df['Age'])


In [None]:
# Preview the first five rows to confirm the 'Age' recoding took effect.
df.head(n=5)

In [None]:
# Show the (rows, columns) tuple of the DataFrame to understand its size at this stage.
df.shape

In [None]:
# Treat a missing family rate as 0 so the column can be compared numerically afterwards.
dependents_col = 'PrimarySubscriberAndThreeOrMoreDependents'
df[dependents_col] = df[dependents_col].fillna(value=0)

In [None]:
# Rows with a positive value in the dependents column carry a family rate.
dependents_col = 'PrimarySubscriberAndThreeOrMoreDependents'
has_family_rate = df[dependents_col] > 0

# For those rows the family rate replaces 'tobacco_rate'; other rows keep their current value.
df['tobacco_rate'] = np.where(has_family_rate, df[dependents_col], df['tobacco_rate'])

# Collapse the dependents column to a flag: 1 where a family rate was present,
# existing value (0 after the earlier fill) otherwise.
# Caution: re-running this cell after the column has become a flag would copy the flag
# value 1 into 'tobacco_rate', so run it once per fresh load.
df[dependents_col] = np.where(has_family_rate, 1, df[dependents_col])
df

In [None]:
# Cast 'Age' to an integer dtype now that the text labels have been replaced by numbers.
df['Age'] = df['Age'].astype(int)

In [None]:
# Build an exploratory profiling report over the cleaned DataFrame.
report_title = "Profiling Report Rates CSV"
profile_final = ProfileReport(df, title=report_title, explorative=True)

# Write the report out as a standalone HTML file.
report_html = "Rate_Cleaned_Output.html"
profile_final.to_file(report_html)

In [None]:

# Rows labelled with the combined tobacco status stand for both user types at once.
# Split each such row into an explicit 'Tobacco User' row and a 'Non-Tobacco User' row.
COMBINED_TOBACCO_LABEL = 'Tobacco User/Non-Tobacco User'

# Match on the stripped value so padding in the data cannot make the filter miss rows.
is_combined_tobacco = df['Tobacco'].str.strip() == COMBINED_TOBACCO_LABEL
df_with_filters = df[is_combined_tobacco]

# Two independent copies of the combined rows, one per tobacco status.
df_first_half = df_with_filters.copy()
df_second_half = df_with_filters.copy()

# Every row of the first copy becomes an explicit tobacco user (keeps its current 'tobacco_rate').
df_first_half['Tobacco'] = 'Tobacco User'

# Every row of the second copy becomes an explicit non-tobacco user.
df_second_half['Tobacco'] = 'Non-Tobacco User'

# Non-tobacco rows pay the plain individual rate, undoing the surcharge. All rows of this
# copy are non-tobacco rows, so the whole column is reset. (Selecting the rows through
# df_first_half's 'Tobacco' values would match nothing, because that copy holds only
# 'Tobacco User'.)
# NOTE(review): this also overwrites a 'tobacco_rate' that was taken from
# 'PrimarySubscriberAndThreeOrMoreDependents' for these rows — confirm that is intended.
df_second_half['tobacco_rate'] = df_second_half['IndividualRate']

# Recombine: untouched rows first, then the tobacco-user copy, then the non-tobacco copy.
df_concat = pd.concat([df_first_half, df_second_half])
df_processed = pd.concat([df[~is_combined_tobacco], df_concat])

# Both copies share the original row labels, so rebuild a clean 0..n-1 index.
df_processed = df_processed.reset_index(drop=True)

# Display the modified DataFrame for verification.
df_processed

In [None]:
# Persist the cleaned, recombined rates for later notebooks.
# The row index is written as the first (unnamed) column because `index` is left at its default.
cleaned_csv_name = 'Rate_Cleaned.csv'
df_processed.to_csv(cleaned_csv_name)

In [None]:
# Preview the first five rows of the final DataFrame to confirm its structure.
df_processed.head(n=5)