In [None]:
##Dataset Name: Synthetic Earthquake Dataset
##Dataset Link: https://www.kaggle.com/datasets/synthetic-earthquake-data

## What One Row Represents---> Each row represents one recorded earthquake event.
It contains information about the time, location, magnitude, depth, and type of the earthquake.
## Explanation of Columns (Any 6)

1.time
Indicates the date and time when the earthquake occurred.

2.latitude
Shows the north–south position of the earthquake epicenter.

3.longitude
Shows the east–west position of the earthquake epicenter.

4.depth
Represents the depth of the earthquake in kilometers below the Earth’s surface.

5.mag (Magnitude)
Measures the intensity or strength of the earthquake on a numerical scale.

6.place
Describes the geographical location or region where the earthquake occurred.
##Expected Data Issues 

1.Missing or Null Values
Some columns like magnitude or depth may contain missing values, which can affect analysis and require proper handling.

2.Inconsistent Text Formatting
The place column may contain extra spaces or inconsistent letter casing (upper/lower case), which needs cleaning before analysis.

In [47]:
##CATEGORY 1--> IMPORTING & DATASET LOADING
import pandas as pd
import numpy as np
 
df=pd.read_csv(r"C:\Users\Administrator\Downloads\synthetic_online_retail_data.csv")
df

Unnamed: 0,customer_id,order_date,product_id,category_id,category_name,product_name,quantity,price,payment_method,city,review_score,gender,age
0,13542,12/17/2024,784,10,Electronics,Smartphone,2,373.36,Credit Card,New Oliviaberg,1.0,F,56
1,23188,6/1/2024,682,50,Sports & Outdoors,Soccer Ball,5,299.34,Credit Card,Port Matthew,,M,59
2,55098,2/4/2025,684,50,Sports & Outdoors,Tent,5,23.00,Credit Card,West Sarah,5.0,F,64
3,65208,10/28/2024,204,40,Books & Stationery,Story Book,2,230.11,Bank Transfer,Hernandezburgh,5.0,M,34
4,63872,5/10/2024,202,20,Fashion,Skirt,4,176.72,Credit Card,Jenkinshaven,1.0,F,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,67967,5/4/2024,965,40,Books & Stationery,Notebook,3,495.24,Cash on Delivery,Hodgemouth,,,30
996,99828,9/12/2024,510,40,Books & Stationery,Story Book,5,427.73,Credit Card,Douglastown,3.0,F,72
997,92290,11/6/2024,445,10,Electronics,Smartphone,5,354.64,Bank Transfer,New Amberville,,M,49
998,61427,9/17/2024,410,10,Electronics,Laptop,4,221.54,Cash on Delivery,New Sean,3.0,M,71


In [48]:
df.shape

(1000, 13)

In [49]:
df.head()


Unnamed: 0,customer_id,order_date,product_id,category_id,category_name,product_name,quantity,price,payment_method,city,review_score,gender,age
0,13542,12/17/2024,784,10,Electronics,Smartphone,2,373.36,Credit Card,New Oliviaberg,1.0,F,56
1,23188,6/1/2024,682,50,Sports & Outdoors,Soccer Ball,5,299.34,Credit Card,Port Matthew,,M,59
2,55098,2/4/2025,684,50,Sports & Outdoors,Tent,5,23.0,Credit Card,West Sarah,5.0,F,64
3,65208,10/28/2024,204,40,Books & Stationery,Story Book,2,230.11,Bank Transfer,Hernandezburgh,5.0,M,34
4,63872,5/10/2024,202,20,Fashion,Skirt,4,176.72,Credit Card,Jenkinshaven,1.0,F,33


In [50]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   customer_id     1000 non-null   int64  
 1   order_date      1000 non-null   object 
 2   product_id      1000 non-null   int64  
 3   category_id     1000 non-null   int64  
 4   category_name   1000 non-null   object 
 5   product_name    1000 non-null   object 
 6   quantity        1000 non-null   int64  
 7   price           1000 non-null   float64
 8   payment_method  1000 non-null   object 
 9   city            1000 non-null   object 
 10  review_score    799 non-null    float64
 11  gender          897 non-null    object 
 12  age             1000 non-null   int64  
dtypes: float64(2), int64(5), object(6)
memory usage: 101.7+ KB


In [51]:
#Missing Values
df.isna().sum()

customer_id         0
order_date          0
product_id          0
category_id         0
category_name       0
product_name        0
quantity            0
price               0
payment_method      0
city                0
review_score      201
gender            103
age                 0
dtype: int64

In [52]:
# Duplicate Rows
df.duplicated().sum()


np.int64(0)

In [53]:
print(df.dtypes)

customer_id         int64
order_date         object
product_id          int64
category_id         int64
category_name      object
product_name       object
quantity            int64
price             float64
payment_method     object
city               object
review_score      float64
gender             object
age                 int64
dtype: object


In [55]:
#Fix Datatype – Convert order_date
df['order_date'] = pd.to_datetime(df['order_date'], errors='coerce')
df['order_date'] 

0     2024-12-17
1     2024-06-01
2     2025-02-04
3     2024-10-28
4     2024-05-10
         ...    
995   2024-05-04
996   2024-09-12
997   2024-11-06
998   2024-09-17
999   2024-11-06
Name: order_date, Length: 1000, dtype: datetime64[ns]

In [24]:
df[['city', 'payment_method']].head()

Unnamed: 0,city,payment_method
0,New Oliviaberg,Credit Card
1,Port Matthew,Credit Card
2,West Sarah,Credit Card
3,Hernandezburgh,Bank Transfer
4,Jenkinshaven,Credit Card


In [56]:
#Fill Missing Values (Method 1 – Mean)
df['review_score'] = df['review_score'].fillna(df['review_score'].mean())
df['review_score'] 

0      1.000000
1      3.992491
2      5.000000
3      5.000000
4      1.000000
         ...   
995    3.992491
996    3.000000
997    3.992491
998    3.000000
999    1.000000
Name: review_score, Length: 1000, dtype: float64

In [57]:
#Text Cleaning – strip(
df['city'] = df['city'].str.strip()
df['city']

0      New Oliviaberg
1        Port Matthew
2          West Sarah
3      Hernandezburgh
4        Jenkinshaven
            ...      
995        Hodgemouth
996       Douglastown
997    New Amberville
998          New Sean
999      North Kelsey
Name: city, Length: 1000, dtype: object

In [58]:
#Text Cleaning – casing
df['city'] = df['city'].str.title()
df['city']

0      New Oliviaberg
1        Port Matthew
2          West Sarah
3      Hernandezburgh
4        Jenkinshaven
            ...      
995        Hodgemouth
996       Douglastown
997    New Amberville
998          New Sean
999      North Kelsey
Name: city, Length: 1000, dtype: object

In [60]:
#Fix Categorical Text Consistency
df['payment_method'] = df['payment_method'].str.lower()
df['payment_method']

0           credit card
1           credit card
2           credit card
3         bank transfer
4           credit card
             ...       
995    cash on delivery
996         credit card
997       bank transfer
998    cash on delivery
999    cash on delivery
Name: payment_method, Length: 1000, dtype: object

In [61]:
#Remove Invalid Quantity Values
df = df[df['quantity'] > 0]
df

Unnamed: 0,customer_id,order_date,product_id,category_id,category_name,product_name,quantity,price,payment_method,city,review_score,gender,age
0,13542,2024-12-17,784,10,Electronics,Smartphone,2,373.36,credit card,New Oliviaberg,1.000000,F,56
1,23188,2024-06-01,682,50,Sports & Outdoors,Soccer Ball,5,299.34,credit card,Port Matthew,3.992491,M,59
2,55098,2025-02-04,684,50,Sports & Outdoors,Tent,5,23.00,credit card,West Sarah,5.000000,F,64
3,65208,2024-10-28,204,40,Books & Stationery,Story Book,2,230.11,bank transfer,Hernandezburgh,5.000000,M,34
4,63872,2024-05-10,202,20,Fashion,Skirt,4,176.72,credit card,Jenkinshaven,1.000000,F,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,67967,2024-05-04,965,40,Books & Stationery,Notebook,3,495.24,cash on delivery,Hodgemouth,3.992491,,30
996,99828,2024-09-12,510,40,Books & Stationery,Story Book,5,427.73,credit card,Douglastown,3.000000,F,72
997,92290,2024-11-06,445,10,Electronics,Smartphone,5,354.64,bank transfer,New Amberville,3.992491,M,49
998,61427,2024-09-17,410,10,Electronics,Laptop,4,221.54,cash on delivery,New Sean,3.000000,M,71


In [63]:
#Rename Columns for Clarity
df.rename(columns={'price': 'unit_price'}, inplace=True)
df.rename

<bound method DataFrame.rename of      customer_id order_date  product_id  category_id       category_name  \
0          13542 2024-12-17         784           10         Electronics   
1          23188 2024-06-01         682           50   Sports & Outdoors   
2          55098 2025-02-04         684           50   Sports & Outdoors   
3          65208 2024-10-28         204           40  Books & Stationery   
4          63872 2024-05-10         202           20             Fashion   
..           ...        ...         ...          ...                 ...   
995        67967 2024-05-04         965           40  Books & Stationery   
996        99828 2024-09-12         510           40  Books & Stationery   
997        92290 2024-11-06         445           10         Electronics   
998        61427 2024-09-17         410           10         Electronics   
999        20658 2024-11-06         177           40  Books & Stationery   

    product_name  quantity  unit_price    payment_met

In [64]:
#Feature Engineering – Create 3 New Columns

#Column 1: Total Amount
df['total_amount'] = df['quantity'] * df['unit_price']
df['total_amount']

0       746.72
1      1496.70
2       115.00
3       460.22
4       706.88
        ...   
995    1485.72
996    2138.65
997    1773.20
998     886.16
999     590.91
Name: total_amount, Length: 1000, dtype: float64

In [65]:
#Column 2: Customer Age Group (Conditional)
df['age_group'] = np.where(
    df['age'] < 30, 'Young',
    np.where(df['age'] < 50, 'Middle-aged', 'Senior')
)
df['age_group']

0           Senior
1           Senior
2           Senior
3      Middle-aged
4      Middle-aged
          ...     
995    Middle-aged
996         Senior
997    Middle-aged
998         Senior
999    Middle-aged
Name: age_group, Length: 1000, dtype: object

In [66]:
df['order_date'].dtype


dtype('<M8[ns]')

In [34]:
object


object

In [67]:
df['order_date'] = pd.to_datetime(df['order_date'], errors='coerce')
df['order_date']

0     2024-12-17
1     2024-06-01
2     2025-02-04
3     2024-10-28
4     2024-05-10
         ...    
995   2024-05-04
996   2024-09-12
997   2024-11-06
998   2024-09-17
999   2024-11-06
Name: order_date, Length: 1000, dtype: datetime64[ns]

In [36]:
df['order_date'].dtype


dtype('<M8[ns]')

In [68]:
#Column 3 - Order year
df['order_year'] = df['order_date'].dt.year
df['order_year']

0      2024
1      2024
2      2025
3      2024
4      2024
       ... 
995    2024
996    2024
997    2024
998    2024
999    2024
Name: order_year, Length: 1000, dtype: int32

In [38]:
#Insights Generation
#Insight 1: Average Spending by Gender(groupby + agg)
df.groupby('gender')['total_amount'].agg(['mean', 'sum'])


Unnamed: 0_level_0,mean,sum
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
F,753.985568,331753.65
M,730.62186,333894.19


In [39]:
#Insight 2: Top 10 Highest Revenue Products
df.sort_values(by='total_amount', ascending=False).head(10)


Unnamed: 0,customer_id,order_date,product_id,category_id,category_name,product_name,quantity,unit_price,payment_method,city,review_score,gender,age,total_amount,age_group,order_year
523,41613,2025-02-21,159,30,Home & Living,Vase,5,487.53,credit card,Port Melissaborough,4.0,F,19,2437.65,Young,2025
841,42739,2024-09-29,149,10,Electronics,Laptop,5,486.9,credit card,Jenniferville,5.0,M,43,2434.5,Middle-aged,2024
532,61358,2025-02-26,439,20,Fashion,T-shirt,5,476.53,bank transfer,Port Paul,4.0,F,51,2382.65,Senior,2025
136,83687,2024-08-26,115,50,Sports & Outdoors,Tent,5,475.89,cash on delivery,Melanieborough,5.0,F,52,2379.45,Senior,2024
476,59486,2024-11-29,327,30,Home & Living,Vase,5,475.62,bank transfer,Brownbury,4.0,F,61,2378.1,Senior,2024
97,77823,2024-09-27,994,10,Electronics,Smartphone,5,475.31,credit card,South Roger,5.0,F,34,2376.55,Middle-aged,2024
792,24175,2025-03-19,122,40,Books & Stationery,Pen,5,470.43,credit card,Frazierbury,4.0,M,25,2352.15,Young,2025
734,99026,2024-08-18,665,20,Fashion,Skirt,5,470.2,credit card,West Antonio,1.0,F,34,2351.0,Middle-aged,2024
353,20393,2025-02-28,133,40,Books & Stationery,Notebook,5,468.55,credit card,South Elizabethport,2.0,M,68,2342.75,Senior,2025
785,27462,2024-09-08,864,10,Electronics,Smartwatch,5,466.38,bank transfer,Johnfort,3.0,M,67,2331.9,Senior,2024


In [40]:
#Insight 3: Most Used Payment Methods(value_counts)
df['payment_method'].value_counts()


payment_method
cash on delivery    374
bank transfer       322
credit card         304
Name: count, dtype: int64

In [41]:
#Insight 4: Category-wise Sales Summary(pivot_table)
pd.pivot_table(
    df,
    values='total_amount',
    index='category_name',
    aggfunc='sum'
)


Unnamed: 0_level_0,total_amount
category_name,Unnamed: 1_level_1
Books & Stationery,143215.52
Electronics,166510.34
Fashion,134714.61
Home & Living,138540.15
Sports & Outdoors,154346.26


In [42]:
#Insight 5: High-Value Orders Only(Boolean Filtering)
df[df['total_amount'] > 1000]


Unnamed: 0,customer_id,order_date,product_id,category_id,category_name,product_name,quantity,unit_price,payment_method,city,review_score,gender,age,total_amount,age_group,order_year
1,23188,2024-06-01,682,50,Sports & Outdoors,Soccer Ball,5,299.34,credit card,Port Matthew,3.992491,M,59,1496.70,Senior,2024
6,79809,2024-06-07,706,10,Electronics,Tablet,5,272.75,bank transfer,North Jessicabury,3.992491,M,57,1363.75,Senior,2024
8,41394,2024-12-17,549,30,Home & Living,Pillow,3,429.11,credit card,West Larrymouth,3.000000,F,69,1287.33,Senior,2024
11,74067,2024-06-27,773,40,Books & Stationery,Notebook,3,494.87,cash on delivery,Melanieberg,5.000000,M,37,1484.61,Middle-aged,2024
13,61360,2024-12-03,544,10,Electronics,Laptop,3,457.44,cash on delivery,Myershaven,5.000000,M,60,1372.32,Senior,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
988,33859,2025-01-26,712,20,Fashion,Pants,4,334.68,credit card,Laurietown,5.000000,F,25,1338.72,Young,2025
993,94141,2024-09-01,701,20,Fashion,Skirt,4,270.32,bank transfer,North Jeannemouth,5.000000,M,48,1081.28,Middle-aged,2024
995,67967,2024-05-04,965,40,Books & Stationery,Notebook,3,495.24,cash on delivery,Hodgemouth,3.992491,,30,1485.72,Middle-aged,2024
996,99828,2024-09-12,510,40,Books & Stationery,Story Book,5,427.73,credit card,Douglastown,3.000000,F,72,2138.65,Senior,2024


In [43]:
#Final Clean Dataset Preview
df.head()


Unnamed: 0,customer_id,order_date,product_id,category_id,category_name,product_name,quantity,unit_price,payment_method,city,review_score,gender,age,total_amount,age_group,order_year
0,13542,2024-12-17,784,10,Electronics,Smartphone,2,373.36,credit card,New Oliviaberg,1.0,F,56,746.72,Senior,2024
1,23188,2024-06-01,682,50,Sports & Outdoors,Soccer Ball,5,299.34,credit card,Port Matthew,3.992491,M,59,1496.7,Senior,2024
2,55098,2025-02-04,684,50,Sports & Outdoors,Tent,5,23.0,credit card,West Sarah,5.0,F,64,115.0,Senior,2025
3,65208,2024-10-28,204,40,Books & Stationery,Story Book,2,230.11,bank transfer,Hernandezburgh,5.0,M,34,460.22,Middle-aged,2024
4,63872,2024-05-10,202,20,Fashion,Skirt,4,176.72,credit card,Jenkinshaven,1.0,F,33,706.88,Middle-aged,2024
