# Create telecom_usage.csv

In [3]:
import pandas as pd

In [4]:
raw = """customer_id,data_used_gb,calls_made,revenue_inr,region,date
1001,5.2,25,180,Delhi,2025/09/25
1002,,40,280,Mumbai,2025-09-25
1003,7.8,32,210,chennai,25-09-2025
1004,15.6,55,,DELHI,2025-09-25
1005,3.4,18,120,Kolkata,2025-09-25
1005,3.4,18,120,Kolkata,2025-09-25
1006,8.7,52,150,Jabalpur,2025-09-25
1007,5.8,22,135,Bhopal,2025-09-25
1008,6.5,40,180,Gwailor,2025-09-25
1009,11.7,100,350,Indore,2025-09-25
1010,2.5,23,105,Chandigarh,2025-09-25
1011,9.5,50,190,Ambala,2025-09-25
1012,12.1,80,270,Bangalore,2025-09-25
1013,,37,205,Gurgaon,2025-09-25
1014,4.8,43,132,Hyderbad,2025-09-25
"""

In [5]:
open("telecom_usage.csv","w").write(raw)
pd.read_csv("telecom_usage.csv").head()

Unnamed: 0,customer_id,data_used_gb,calls_made,revenue_inr,region,date
0,1001,5.2,25,180.0,Delhi,2025/09/25
1,1002,,40,280.0,Mumbai,2025-09-25
2,1003,7.8,32,210.0,chennai,25-09-2025
3,1004,15.6,55,,DELHI,2025-09-25
4,1005,3.4,18,120.0,Kolkata,2025-09-25


# Load the CSV

In [6]:
df = pd.read_csv("telecom_usage.csv")
print("Original shape:", df.shape) # (rows, columns)
df.head()

Original shape: (15, 6)


Unnamed: 0,customer_id,data_used_gb,calls_made,revenue_inr,region,date
0,1001,5.2,25,180.0,Delhi,2025/09/25
1,1002,,40,280.0,Mumbai,2025-09-25
2,1003,7.8,32,210.0,chennai,25-09-2025
3,1004,15.6,55,,DELHI,2025-09-25
4,1005,3.4,18,120.0,Kolkata,2025-09-25


# Handle missing values

In [7]:
before = len(df)
df = df.dropna() # removes rows with any blank
after = len(df)
print(f"Removed {before - after} rows with missing values.")
# df = df.dropna(subset=["data_used_in_GB","date"])

Removed 3 rows with missing values.


In [8]:
df.to_csv("missing_telecom_usage.csv", index=False)
print("Saved missing_telecom_usage.csv with", len(df), "rows.")

Saved missing_telecom_usage.csv with 12 rows.


# Standardise the date format

In [9]:
df["date"] = pd.to_datetime(df["date"], errors="coerce") # coerce turns bad dates into NaT
bad_dates = df["date"].isna().sum()
print("Unparseable date rows:", bad_dates)
df = df.dropna(subset=["date"]) # simplest for class

Unparseable date rows: 11


# Remove duplicates

In [10]:
before = len(df)
# Keep the first occurrence of each duplicate and drop the rest
df = df.drop_duplicates(keep='first')
after = len(df)
print(f"Removed {before - after} duplicate rows (kept the first occurrence of each).")

Removed 0 duplicate rows (kept the first occurrence of each).


# Save the cleaned dataset

In [11]:
df.to_csv("cleaned_telecom_usage.csv", index=False)
print("Saved cleaned_telecom_usage.csv with", len(df), "rows.")

Saved cleaned_telecom_usage.csv with 1 rows.


# Verify the save

In [12]:
pd.set_option('display.max_rows', None) # show all rows in output
cleaned_df = pd.read_csv("cleaned_telecom_usage.csv")
display(cleaned_df)

Unnamed: 0,customer_id,data_used_gb,calls_made,revenue_inr,region,date
0,1001,5.2,25,180.0,Delhi,2025-09-25


# Create complaints.csv

In [13]:
complaints = pd.DataFrame([
{"complaint_id":"CMP-001", "customer_id":1002, "category":"Billing", "description": "Charged extra for data usage", "created_at":"2025/09/25 10:45","status":"Open"},

{"complaint_id":"CMP-002","customer_id":1004,"category":"Network","description":"Frequent call drops in Delhi","created_at":"2025-09-25 09:30","status":"Open"},

{"complaint_id":"CMP-003","customer_id":1005,"category":"Recharge","description":"Recharge failed; amount deducted","created_at":"25-09-2025 14:00","status":"Closed"},

{"complaint_id":"CMP-004","customer_id":1002,"category":"Network","description":"Slow 4G speed at night","created_at":"2025-09-26 20:40","status":"Open"},

{"complaint_id":"CMP-005","customer_id":1003,"category":"Support","description":"No response to complaint","created_at":"2025-09-26 11:10","status":"Open"}
])

In [15]:
complaints.to_csv("complaints.csv", index=False)
print(" complaints.csv saved.")

 complaints.csv saved.
