In [1]:
# outlier
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

# Load your dataset (replace 'your_dataset.csv' with your actual dataset file)
data = pd.read_csv('your_dataset.csv')

# Columns fed to the Isolation Forest; both must exist in the loaded frame.
features = ['trip_distance', 'total_amount']
X = data[features]

# Train the Isolation Forest. `contamination` is the expected share of
# outliers (adjust as needed). The algorithm is randomized, so the seed is
# pinned to keep scores and labels identical across Restart & Run All.
isolation_forest = IsolationForest(contamination=0.01, random_state=42)
isolation_forest.fit(X)

# decision_function -> continuous score (negative values are flagged as outliers)
# predict           -> hard label: -1 for outliers, 1 for inliers
anomaly_scores = isolation_forest.decision_function(X)
outliers = isolation_forest.predict(X)

# Attach the scores and the -1/1 labels to the original frame.
data['anomaly_score'] = anomaly_scores
data['is_outlier'] = outliers

# Histogram of the decision-function values, drawn on an explicit Axes
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
hist_ax.hist(anomaly_scores, bins=50, alpha=0.5, color='blue', edgecolor='black')
hist_ax.set_xlabel('Anomaly Score')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Anomaly Scores')
plt.show()

# Scatter of the two model features, coloured by the predicted label.
# IsolationForest.predict labels inliers as 1 and outliers as -1 (the same
# convention the `is_outlier == -1` filter relies on), so the colorbar label
# must read "Inlier (1) / Outlier (-1)".
outlier_fig, outlier_ax = plt.subplots(figsize=(10, 6))
outlier_points = outlier_ax.scatter(
    data['trip_distance'],
    data['total_amount'],
    c=data['is_outlier'],
    cmap='viridis',
)
outlier_ax.set_xlabel('Trip Distance')
outlier_ax.set_ylabel('Total Amount')
outlier_ax.set_title('Outlier Detection with Isolation Forest')
outlier_fig.colorbar(outlier_points, ax=outlier_ax, label='Inlier (1) / Outlier (-1)')
plt.show()

# Keep only the rows the model labelled -1 (outliers) and print them
detected_outliers = data.loc[data['is_outlier'].eq(-1)]
print("Detected outliers:")
print(detected_outliers)

In [None]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [None]:
# Read the trip-data parquet file with Dask, then materialize it as an
# in-memory pandas DataFrame (downstream cells index and assign columns on it).
# Forward slashes are used so the relative path resolves on Windows, Linux and
# macOS alike; the backslash form only worked on Windows.
data = dd.read_parquet('City_Cab_Insights/Data/yellow_tripdata_2023-02.parquet')
# Convert Dask DataFrame to Pandas DataFrame
data = data.compute()

In [None]:
# Feature matrix for the Isolation Forest: every row, two selected columns
features = ['trip_distance', 'fare_amount']
X = data.loc[:, features]

# Define a custom scoring function
def custom_score(estimator, X, y=None):
    """Return the negated mean anomaly score of ``estimator`` on ``X``.

    Parameters
    ----------
    estimator : object
        Fitted estimator exposing ``decision_function``.
    X : array-like
        Samples to score.
    y : ignored
        Accepted only so the signature matches the ``(estimator, X, y)``
        scorer-callable convention; never read.

    Returns
    -------
    float
        ``-mean(estimator.decision_function(X))``.
    """
    # Get the anomaly scores
    anomaly_scores = estimator.decision_function(X)
    # Calculate the negative of the average anomaly score
    return -np.mean(anomaly_scores)


# Create the scorer using the custom scoring function.
# `make_scorer` is meant for metrics of the form score_func(y_true, y_pred): it
# would call custom_score(y, estimator.predict(X)), so `estimator` would be `y`
# (None in an unsupervised search) and every score would fail. A plain callable
# with signature (estimator, X, y) can be passed to `scoring=` directly.
def scorer(estimator, X, y=None):
    """Scorer for model selection: the mean decision-function value (higher is better).

    Negates ``custom_score``, which is the same sign flip that
    ``greater_is_better=False`` was intended to apply.
    """
    # NOTE(review): decision_function is shifted by a contamination-dependent
    # offset, so this metric may simply favour the smallest contamination
    # candidate -- confirm it is a meaningful selection criterion.
    return -custom_score(estimator, X, y)


scorer

In [None]:
# Define the range of contamination values to search over
contamination_values = [0.01, 0.05, 0.1, 0.15, 0.2]

# Create a dictionary of parameters to search
param_grid = {'contamination': contamination_values}

# Initialize Isolation Forest. The estimator is randomized; pinning the seed
# makes every candidate/fold fit (and therefore the selected parameter)
# reproducible between runs.
isolation_forest = IsolationForest(random_state=42)

# Initialize GridSearchCV with the custom scorer (5-fold split of X; no labels
# are involved, the scorer only looks at the fitted estimator and X).
grid_search = GridSearchCV(estimator=isolation_forest, param_grid=param_grid, cv=5, scoring=scorer)

In [None]:
# Run the grid search (fit returns the search object itself) and read off the
# contamination value of the best-ranked candidate
best_contamination = grid_search.fit(X).best_params_['contamination']
print("Best contamination parameter:", best_contamination)  # an earlier run reported 0.01

In [None]:
# Initialize Isolation Forest with the best contamination parameter.
# The seed is pinned because the algorithm is randomized; without it the
# flagged rows change from one run to the next.
best_isolation_forest = IsolationForest(contamination=best_contamination, random_state=42)

# Train the Isolation Forest model with the best contamination parameter
best_isolation_forest.fit(X)

# Predicting anomalies (outliers) with the best model:
# decision_function -> continuous score, predict -> -1 (outlier) / 1 (inlier)
best_anomaly_scores = best_isolation_forest.decision_function(X)
best_outliers = best_isolation_forest.predict(X)

# Adding anomaly scores and outlier predictions to the original dataset
data['best_anomaly_score'] = best_anomaly_scores
data['best_is_outlier'] = best_outliers

# Displaying the detected outliers (rows labelled -1)
best_detected_outliers = data[data['best_is_outlier'] == -1]
print("Detected outliers with the best model:")
print(best_detected_outliers)
