### Data Exploration and Cleanup (basic) script
Our aim here is to understand our dataset and do a basic cleanup removing NaNs & Duplicates. 


In [6]:
!pip install boto3
import boto3
import pandas as pd
import io
import pandas as pd
import numpy as np
# S3 location of the raw input file
bucket_name = 'demand-prediction-ola-rides'
file_key = 'raw_data.csv'

# boto3 S3 client; reused later in the notebook for the upload
s3 = boto3.client('s3')

# Download the whole object into memory
response = s3.get_object(Key=file_key, Bucket=bucket_name)
content = response['Body'].read()

# The object is read as a gzip-compressed CSV despite the plain `.csv` key
# name, so the compression is passed explicitly.
raw_stream = io.BytesIO(content)
df = pd.read_csv(raw_stream, compression='gzip')

# Preview the first five rows
df.head()

                    ts number   pick_lat   pick_lng   drop_lat   drop_lng
0  2020-03-26 07:07:17  14626  12.313621  76.658195  12.287301  76.602280
1  2020-03-26 07:32:27  85490  12.943947  77.560745  12.954014  77.543770
2  2020-03-26 07:36:44  05408  12.899603  77.587300  12.934780  77.569950
3  2020-03-26 07:38:00  58940  12.918229  77.607544  12.968971  77.636375
4  2020-03-26 07:39:29  05408  12.899490  77.587270  12.934780  77.569950
  exec(code, global_dict)


In [4]:
# List the contents of the notebook's working directory (shell escape)
!ls

'1. Data Cleaning (Basic).ipynb'		 '5. Prediction Pipeline.ipynb'
'2. Data Analysis and Cleaning (Advance).ipynb'   Untitled.ipynb
'3. Data Prep.ipynb'				  Untitled1.ipynb
'4. Model_Training.ipynb'			  requirements.txt


#### Reading DataSet

In [7]:
# Number of rows in the raw frame, before any cleanup
df.shape[0]

8381556


### A Customer_ID `number` at a particular timestamp can only have one entry
### Removing Duplicate Entries ['ts','number']

In [8]:
# Flag every row whose (ts, number) pair occurs more than once.
# keep=False marks all copies of a pair, not only the repeats.
is_duplicated = df.duplicated(subset=['ts', 'number'], keep=False)
df[is_duplicated]

                          ts number   pick_lat   pick_lng   drop_lat   drop_lng
235      2020-03-26 18:10:35  16795  12.967236  77.641594  13.014504  77.650856
236      2020-03-26 18:10:35  16795  12.967236  77.641594  13.014504  77.650856
407      2020-03-26 21:35:50  65856  12.917173  77.586400  12.913940  77.685280
408      2020-03-26 21:35:50  65856  12.917173  77.586400  12.913940  77.685280
443      2020-03-26 23:26:29  27554  12.933715  77.619300  12.938208  77.587520
...                      ...    ...        ...        ...        ...        ...
8381231  2021-03-26 22:23:12  61636  12.975229  77.620370  13.017285  77.618200
8381245  2021-03-26 22:25:13  61636  12.975229  77.620370  13.017285  77.618200
8381246  2021-03-26 22:25:13  61636  12.975229  77.620370  13.017285  77.618200
8381248  2021-03-26 22:25:27  61636  12.975229  77.620370  13.017285  77.618200
8381249  2021-03-26 22:25:27  61636  12.975229  77.620370  13.017285  77.618200

[113540 rows x 6 columns]


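### 113540 rows belong to a duplicated (`ts`, `number`) pair (every copy is counted, because `keep=False`)
#### Keeping one row per pair removes 66058 rows and leaves 8315498 unique timestamp, customer_id rows.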

In [9]:
# Keep the LAST occurrence of each (ts, number) pair, then renumber the index
# so it is contiguous again.
# NOTE(review): this comment previously said "first" while the code passes
# keep='last'. The comment now matches the code; confirm that 'last' is the
# intended choice.
df = (
    df
    .drop_duplicates(subset=['ts', 'number'], keep='last')
    .reset_index(drop=True)
)




In [10]:
# Summary of the de-duplicated frame: index range, column dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8315498 entries, 0 to 8315497
Data columns (total 6 columns):
 #   Column    Dtype  
---  ------    -----  
 0   ts        object 
 1   number    object 
 2   pick_lat  float64
 3   pick_lng  float64
 4   drop_lat  float64
 5   drop_lng  float64
dtypes: float64(4), object(2)
memory usage: 380.7+ MB


In [11]:
# Total number of missing (NaN/None) cells across the whole frame
int(df.isna().to_numpy().sum())

0


In [12]:
# Convert the customer id column to numbers; values that cannot be parsed
# become NaN (errors='coerce') instead of raising.
number_as_numeric = pd.to_numeric(df['number'], errors='coerce')
df['number'] = number_as_numeric

# Recount the missing cells to see how many ids failed to parse
int(df.isna().to_numpy().sum())

116


#### 116 rows have a `number` that is not numeric (coerced to NaN above); dropping those rows.

In [13]:
# Remove every row that contains at least one missing value (in place),
# then report how many rows remain.
df.dropna(axis=0, how='any', inplace=True)
df.shape[0]

8315382


In [14]:
# With the NaN rows gone, convert the ids again and let pandas downcast them
# to the smallest integer dtype that fits.
customer_numbers = pd.to_numeric(df['number'], errors='coerce', downcast='integer')
df['number'] = customer_numbers

# Parse the timestamp strings into datetime values
parsed_ts = pd.to_datetime(df['ts'])
df['ts'] = parsed_ts




In [15]:
# Summary of the frame after the dtype conversions (check the `ts` and `number` dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8315382 entries, 0 to 8315497
Data columns (total 6 columns):
 #   Column    Dtype         
---  ------    -----         
 0   ts        datetime64[ns]
 1   number    int32         
 2   pick_lat  float64       
 3   pick_lng  float64       
 4   drop_lat  float64       
 5   drop_lng  float64       
dtypes: datetime64[ns](1), float64(4), int32(1)
memory usage: 412.4 MB


### Breaking the Timestamp into Features

In [16]:
# Derive calendar / time-of-day columns from the parsed timestamp.
# Each pair is (new column name, attribute of the pandas `.dt` accessor);
# the list order is the order in which the columns are added.
time_parts = [
    ('hour', 'hour'),
    ('mins', 'minute'),
    ('day', 'day'),
    ('month', 'month'),
    ('year', 'year'),
    ('dayofweek', 'dayofweek'),  # pandas numbers Monday as 0
]
for column_name, dt_attribute in time_parts:
    df[column_name] = getattr(df['ts'].dt, dt_attribute)




In [None]:
import io
import boto3

# Relies on `df`, `s3` and `bucket_name` created in earlier cells.
output_key = 'processed/preprocessed_11.csv.gz'

# Serialise the frame to a gzip-compressed CSV held entirely in memory
buffer = io.BytesIO()
df.to_csv(buffer, compression='gzip', index=False)
buffer.seek(0)  # rewind the buffer to its start

# Send the compressed bytes to S3 under the output key
payload = buffer.getvalue()
s3.put_object(Body=payload, Key=output_key, Bucket=bucket_name)

print("In-memory gzip-compressed CSV uploaded successfully!")


In-memory gzip-compressed CSV uploaded successfully!


In [None]:
# NOTE(review): `response` appears to be the get_object result from the load
# cell, so this reports the stored size of the raw input object, not of the
# file uploaded above. Confirm that this is the intended figure.
bytes_per_mb = 1024 * 1024
size_mb = response['ContentLength'] / bytes_per_mb
print(f"Size in MB: {size_mb:.2f}")


Size in MB: 105.97


In [None]:
# Preview the first five rows of the cleaned frame, including the new time columns
df.head(5)

                   ts  number   pick_lat  ...  month  year  dayofweek
0 2020-03-26 07:07:17   14626  12.313621  ...      3  2020          3
1 2020-03-26 07:32:27   85490  12.943947  ...      3  2020          3
2 2020-03-26 07:36:44    5408  12.899603  ...      3  2020          3
3 2020-03-26 07:38:00   58940  12.918229  ...      3  2020          3
4 2020-03-26 07:39:29    5408  12.899490  ...      3  2020          3

[5 rows x 12 columns]
