##Example 1

In [21]:
import pandas as pd
import numpy as np


# Assuming the file path is correct and accessible
file_path = "/content/taxi-data-sorted-small.csv"

# Use pandas to read the CSV file
df = pd.read_csv(file_path)

columns = [
    "medallion", "hack_license", "pickup_datetime", "dropoff_datetime",
    "trip_time_in_secs", "trip_distance", "pickup_longitude", "pickup_latitude",
    "dropoff_longitude", "dropoff_latitude", "payment_type", "fare_amount",
    "surcharge", "mta_tax", "tip_amount", "tolls_amount", "total_amount"
]

df.columns = columns

# To display the first few rows of the DataFrame, use the .head() method
print(df.head())


                          medallion                      hack_license  \
0  22D70BF00EEB0ADC83BA8177BB861991  3FF2709163DE7036FCAA4E5A3324E4BF   
1  0EC22AAF491A8BD91F279350C2B010FD  778C92B26AE78A9EBDF96B49C67E4007   
2  1390FB380189DF6BBFDA4DC847CAD14F  BE317B986700F63C43438482792C8654   
3  3B4129883A1D05BE89F2C929DE136281  7077F9FD5AD649AEACA4746B2537E3FA   
4  5FAA7F69213D26A42FA435CA9511A4FF  00B7691D86D96AEBD21DD9E138F90840   

       pickup_datetime     dropoff_datetime  trip_time_in_secs  trip_distance  \
0  2013-01-01 00:02:00  2013-01-01 00:02:00                  0           0.00   
1  2013-01-01 00:01:00  2013-01-01 00:03:00                120           0.71   
2  2013-01-01 00:01:00  2013-01-01 00:03:00                120           0.48   
3  2013-01-01 00:01:00  2013-01-01 00:03:00                120           0.61   
4  2013-01-01 00:02:00  2013-01-01 00:03:00                 60           0.00   

   pickup_longitude  pickup_latitude  dropoff_longitude  dropoff_latitude 

Step 1: Setup RSA Encryption Environment
First, you'd generate RSA keys. In practice, these keys would be generated securely and stored with strict access controls.



In [19]:
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.primitives.asymmetric import rsa
from cryptography.hazmat.primitives.asymmetric import padding
from cryptography.hazmat.primitives import hashes

# Generate RSA key pair
def generate_rsa_key_pair():
    private_key = rsa.generate_private_key(
        public_exponent=65537,
        key_size=2048,
        backend=default_backend()
    )
    public_key = private_key.public_key()
    return private_key, public_key

# Encrypt data
def encrypt_data(public_key, data):
    encrypted_data = public_key.encrypt(
        data.encode(),
        padding.OAEP(
            mgf=padding.MGF1(algorithm=hashes.SHA256()),
            algorithm=hashes.SHA256(),
            label=None
        )
    )
    return encrypted_data

# Decrypt data
def decrypt_data(private_key, encrypted_data):
    decrypted_data = private_key.decrypt(
        encrypted_data,
        padding.OAEP(
            mgf=padding.MGF1(algorithm=hashes.SHA256()),
            algorithm=hashes.SHA256(),
            label=None
        )
    )
    return decrypted_data.decode()


Step 2: Encrypt Sensitive Data in the DataFrame
For demonstration, let's encrypt only the first few records to illustrate the process, as RSA encryption is computationally expensive for large datasets.

In [22]:
# Generate RSA keys
private_key, public_key = generate_rsa_key_pair()

# Define a function to encrypt a DataFrame column
def encrypt_column(df, column_name, public_key):
    # Apply encryption to each value in the column
    return df[column_name].apply(lambda x: encrypt_data(public_key, str(x)))

# Encrypt sensitive columns
for column in ['fare_amount', 'tip_amount', 'total_amount']:
    df[f'encrypted_{column}'] = encrypt_column(df.head(5), column, public_key)  # Encrypting only the first 5 for demonstration


Step 3: Decrypt an Encrypted Value
To decrypt, you would use the decrypt_data function with the private key.




In [23]:

decrypted_value = decrypt_data(private_key, df.iloc[0]['encrypted_fare_amount'])
print(f"Decrypted fare amount: {decrypted_value}")


Decrypted fare amount: 27.0


##Example 2

Step 1:
Loading of the dataset that we have used earlier

Step 2: Feature Selection
For simplicity, we'll focus on fare_amount and trip_distance as our features for detecting anomalies.



In [24]:
# Selecting features for anomaly detection
features = df[['fare_amount', 'trip_distance']]


Step 3: Anomaly Detection with Isolation Forest

In [25]:
from sklearn.ensemble import IsolationForest

# Initialize the Isolation Forest model
# The contamination parameter is an estimate of the proportion of outliers in the data set
model = IsolationForest(n_estimators=100, contamination=0.01, random_state=42)

# Fit the model
model.fit(features)

# Predict anomalies (-1 for outliers and 1 for inliers)
df['anomaly'] = model.predict(features)

# Count the number of detected anomalies
anomalies = df[df['anomaly'] == -1]
print(f"Detected {len(anomalies)} anomalies out of {len(df)} records.")




Detected 19911 anomalies out of 1999998 records.


Step 4: Analyzing the Detected Anomalies
After detecting anomalies, you might want to examine them to understand their nature.

In [26]:
# Display a few anomalies
print(anomalies[['fare_amount', 'trip_distance', 'pickup_datetime', 'dropoff_datetime']].head())


      fare_amount  trip_distance      pickup_datetime     dropoff_datetime
187          63.0           0.00  2013-01-01 00:11:00  2013-01-01 00:11:00
570          52.0           0.00  2013-01-01 00:16:00  2013-01-01 00:16:00
951         259.0           0.00  2013-01-01 00:19:00  2013-01-01 00:19:00
1171         52.0           0.02  2013-01-01 00:20:00  2013-01-01 00:20:00
1309         75.0           0.00  2013-01-01 00:21:00  2013-01-01 00:21:00
