<a href="https://colab.research.google.com/github/kemboikaroney/WK7-Telecommunications-Fraud-Detection-Using-MongoDB-and-Python/blob/main/Telecommunications_Fraud_Detection_Using_MongoDB_and_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
import pymongo
import logging

# Extraction function
def extract_data(call_log_path, billing_data_path):
    # Load call log data from CSV file
    call_logs = pd.read_csv(call_log_path)

    # Load billing data from CSV file
    billing_data = pd.read_csv(billing_data_path)

    # create common column name
    call_logs = call_logs.rename(columns={"call_date": "date"})
    billing_data = billing_data.rename(columns={"transaction_date": "date"})

    # Merge the two datasets based on common columns
    merged_data = pd.merge(call_logs, billing_data, on=['date'])

    # Convert call duration to minutes for easier analysis
    merged_data['duration_minutes'] = merged_data['call_duration'] / 60

    # Use Python logging module to log errors and activities
    logger = logging.getLogger(__name__)
    logger.info("Data extraction completed.")

    return merged_data

# Transformation function
def transform_data(df):
    # Data cleaning and handling missing values
    df = df.dropna()
    df = df.drop_duplicates()

    # Group and aggregate the data
    grouped_data = df.groupby(['customer_id']).agg(
        total_duration=('duration_minutes', 'sum'),
        total_calls=('call_type', 'count')
    ).reset_index()

    # Identify patterns in the data
    grouped_data['is_fraudulent'] = (grouped_data['total_duration'] > 300) & (grouped_data['total_calls'] > 100)

    # Use data compression techniques to optimize performance
    grouped_data = grouped_data.astype({'customer_id': 'int32', 'is_fraudulent': 'bool'})

    # Use Python logging module to log errors and activities
    logger = logging.getLogger(__name__)
    logger.info("Data transformation completed.")

    return grouped_data

# Loading function
def load_data(data, db_uri, db_name, collection_name):
    # Connect to MongoDB
    client = pymongo.MongoClient(db_uri, ssl=True, ssl_cert_reqs='CERT_NONE')
    db = client[db_name]
    collection = db[collection_name]

    # Create indexes on the collection
    collection.create_index([('customer_id', pymongo.ASCENDING)])
    collection.create_index([('is_fraudulent', pymongo.ASCENDING)])

    # Use bulk inserts to optimize performance
    bulk_data = data.to_dict(orient='records')
    collection.insert_many(bulk_data)

    # Use the write concern option to ensure that data is written to disk
    collection.acknowledge_writes(w=1, j=True)

    # Use Python logging module to log errors and activities
    logger = logging.getLogger(__name__)
    logger.info("Data loading completed.")

# Example usage
if __name__ == '__main__':
    # Set up logging
    logging.basicConfig(level=logging.INFO)

    # Define file paths
    call_log_path = 'call_logs.csv'
    billing_data_path = 'billing_systems.csv'

    # Extract data
    data = extract_data(call_log_path, billing_data_path)

    # Transform data
    transformed_data = transform_data(data)

    # Load data into MongoDB
    db_uri = "mongodb+srv://kemboi:VknWvgRF4C5NG91S@cluster0.9fdgixi.mongodb.net/?retryWrites=true&w=majority"
    db_name = 'mydatabase'
    collection_name = 'mycollection'
    load_data(transformed_data, db_uri, db_name, collection_name)


ConfigurationError: ignored