# ***Complete Guide to Generative AI for Data Analysis and Data Science***

---

### **Chapter 10:** *Analyzing Data in Files*

a) Reading JSON Files

In [1]:
import json
import pandas as pd

json_file_path = '/content/products.json'
# Load raw JSON first
with open(json_file_path, 'r') as f:
    data = json.load(f)

# Normalize JSON structure - required for nested structures
df = pd.json_normalize(data)

print(df.head())

                       productName productCategory productID  \
0                   myPhone 13 Pro     electronics   ELEC001   
1                     Acme QLED TV     electronics   ELEC002   
2  Ace Noise-Cancelling Headphones     electronics   ELEC003   
3                Cast Iron Skillet      housewares  HOUSE001   
4                   Vacuum Cleaner      housewares  HOUSE002   

  categoryAttributes.displaySize categoryAttributes.cameraResolution  \
0                     6.1 inches                                12MP   
1                            NaN                                 NaN   
2                            NaN                                 NaN   
3                            NaN                                 NaN   
4                            NaN                                 NaN   

  categoryAttributes.storage categoryAttributes.screenSize  \
0                      256GB                           NaN   
1                        NaN                     65 inches

In [2]:
#Summary Statistics
print("\nSummary Statistics:")
print(df.describe(include='all'))


Summary Statistics:
           productName productCategory productID  \
count               10              10        10   
unique              10               3        10   
top     myPhone 13 Pro        clothing   ELEC001   
freq                 1               4         1   
mean               NaN             NaN       NaN   
std                NaN             NaN       NaN   
min                NaN             NaN       NaN   
25%                NaN             NaN       NaN   
50%                NaN             NaN       NaN   
75%                NaN             NaN       NaN   
max                NaN             NaN       NaN   

       categoryAttributes.displaySize categoryAttributes.cameraResolution  \
count                               1                                   1   
unique                              1                                   1   
top                        6.1 inches                                12MP   
freq                                1         

In [3]:
#information
info = df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 28 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   productName                            10 non-null     object 
 1   productCategory                        10 non-null     object 
 2   productID                              10 non-null     object 
 3   categoryAttributes.displaySize         1 non-null      object 
 4   categoryAttributes.cameraResolution    1 non-null      object 
 5   categoryAttributes.storage             1 non-null      object 
 6   categoryAttributes.screenSize          1 non-null      object 
 7   categoryAttributes.resolution          1 non-null      object 
 8   categoryAttributes.smartTV             1 non-null      object 
 9   categoryAttributes.wirelessTechnology  1 non-null      object 
 10  categoryAttributes.batteryLife         1 non-null      object 
 11  categoryA

b) Using AI for Data Quality and Data Cleansing

In [4]:
import pandas as pd
import json

# Load JSON file
file_path = "/content/products.json"
df = pd.read_json(file_path)

# --- Data Quality Checks ---
print("Checking for data quality issues...\n")

# 1. Check for missing values
missing_values = df.isnull().sum()
print("Missing Values per column:\n", missing_values, "\n")

# 2. Check for duplicates (handle unhashable data)
df_for_dup = df.copy()

# Convert unhashable types to strings so duplicates() works
for col in df_for_dup.columns:
    df_for_dup[col] = df_for_dup[col].apply(lambda x: json.dumps(x, sort_keys=True) if isinstance(x, dict) else x)

duplicates = df_for_dup[df_for_dup.duplicated()]
print(f"Number of duplicate rows: {duplicates.shape[0]}\n")

# 3. Check data types
print("Data types:\n", df.dtypes, "\n")

# 4. Basic stats for numeric columns
print("Summary statistics for numeric columns:\n", df.describe(), "\n")

# --- Data Cleaning ---
print("Fixing data issues...\n")

# Remove duplicates
df_cleaned = df[~df_for_dup.duplicated()]

# Fill missing values (example: fill numeric with mean, text with 'Unknown')
for col in df_cleaned.columns:
    if df_cleaned[col].dtype == 'O':  # Object/string
        df_cleaned[col].fillna('Unknown', inplace=True)
    else:  # Numeric
        df_cleaned[col].fillna(df_cleaned[col].mean(), inplace=True)

# Save cleaned file
df_cleaned.to_json("cleaned_data.json", orient="records", indent=4)

print("Data cleaning complete. Cleaned data saved to 'cleaned_data.json'.")

Checking for data quality issues...

Missing Values per column:
 productName           0
productCategory       0
productID             0
categoryAttributes    0
dtype: int64 

Number of duplicate rows: 0

Data types:
 productName           object
productCategory       object
productID             object
categoryAttributes    object
dtype: object 

Summary statistics for numeric columns:
            productName productCategory productID  \
count               10              10        10   
unique              10               3        10   
top     myPhone 13 Pro        clothing   ELEC001   
freq                 1               4         1   

                                       categoryAttributes  
count                                                  10  
unique                                                 10  
top     {'displaySize': '6.1 inches', 'cameraResolutio...  
freq                                                    1   

Fixing data issues...

Data cleaning complete.

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned[col].fillna('Unknown', inplace=True)


**Challenge:** <br>
Use sensor dataset and check for missing values and replace the missing values with averages of that column.

In [6]:
import pandas as pd
import numpy as np
# Load CSV file
df = pd.read_csv("/content/10_challenge_missing_data.csv")

# Display first few rows
print("First 5 rows of the dataset:")
print(df.head())

# Check missing values per column
print("\nMissing values per column before fixing:")
print(df.isnull().sum())

# Fill missing values in numerical columns with column mean
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Verify missing values are fixed
print("\nMissing values per column after fixing:")
print(df.isnull().sum())

# Save the cleaned data (optional)
df.to_csv("sensor_data_cleaned.csv", index=False)
print("\nCleaned dataset saved to 'sensor_data_cleaned.csv'")

First 5 rows of the dataset:
  sensor_id                  timestamp  temperature_celsius  \
0  sensor_1  2025-07-01 01:00:00+00:00                20.62   
1  sensor_1  2025-07-01 01:01:00+00:00                23.52   
2  sensor_1  2025-07-01 01:02:00+00:00                22.32   
3  sensor_1  2025-07-01 01:03:00+00:00                20.30   
4  sensor_1  2025-07-01 01:04:00+00:00                21.84   

   relative_humidity  pressure_mbar  
0              68.70        1002.92  
1              49.48        1007.01  
2              63.27         993.90  
3              69.39        1008.47  
4              62.61         994.87  

Missing values per column before fixing:
sensor_id              0
timestamp              0
temperature_celsius    3
relative_humidity      4
pressure_mbar          2
dtype: int64

Missing values per column after fixing:
sensor_id              0
timestamp              0
temperature_celsius    0
relative_humidity      0
pressure_mbar          0
dtype: int64

Clea