<a href="https://colab.research.google.com/github/jesuisaditya10/Predicting-Network-Flow-Duration/blob/main/Predicting_Network_Flow_Duration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from datetime import datetime

plt.style.use('ggplot')
sns.set_palette("husl")

# --- Function Definitions ---

def load_and_validate_data(file_path):
    """Read a CSV into a DataFrame and check that the flow columns exist.

    On success the function prints a confirmation and the dataset shape and
    returns the DataFrame. A missing file, or any other failure while
    reading, is reported on stdout and the exception is re-raised. If one or
    more of the expected columns are absent, the missing names are printed
    and a ValueError is raised.
    """
    expected = (
        'Flow.ID', 'Source.IP', 'Source.Port', 'Destination.IP',
        'Destination.Port', 'Protocol', 'Timestamp', 'Flow.Duration',
        'Total.Fwd.Packets', 'Total.Backward.Packets',
    )

    try:
        frame = pd.read_csv(file_path)
        print("Data loaded successfully!")
        print(f"Dataset shape: {frame.shape}")
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        raise
    except Exception as exc:
        print(f"Error loading the CSV file: {exc}")
        raise

    # Preserve the declaration order so the report is stable.
    absent = [name for name in expected if name not in frame.columns]
    if not absent:
        return frame

    print("Error: The CSV file does not contain all required columns.")
    print(f"Missing columns: {absent}")
    raise ValueError("Missing required columns")

def explore_data(df):
    """Print descriptive statistics and save two exploratory figures.

    Two PNG files are written to the current working directory:
    'flow_duration_distribution.png' (histogram with a KDE overlay of the
    Flow.Duration column) and 'correlation_heatmap.png' (annotated pairwise
    correlations of the numeric columns).
    """
    print("\n--- Data Exploration ---")
    summary = df.describe()
    print(summary)

    # Figure 1: distribution of the target variable.
    plt.figure(figsize=(8, 6))
    target = df['Flow.Duration']
    sns.histplot(target, kde=True)
    plt.title('Distribution of Flow Duration')
    plt.xlabel('Flow Duration')
    plt.ylabel('Frequency')
    plt.savefig('flow_duration_distribution.png')
    plt.close()

    # Figure 2: correlations; non-numeric columns are excluded first because
    # they cannot take part in a correlation matrix.
    plt.figure(figsize=(12, 10))
    correlations = df.select_dtypes(include=[np.number]).corr()
    sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Heatmap')
    plt.savefig('correlation_heatmap.png')
    plt.close()

def _ip_to_numeric(ip):
    """Convert a dotted-quad IPv4 string to its integer value.

    Returns 0 for anything that is not exactly four dot-separated integers,
    each in the range 0-255 (missing values, malformed strings, other
    address families), so one bad address does not abort preprocessing.
    """
    try:
        octets = [int(part) for part in str(ip).split('.')]
    except ValueError:
        return 0
    if len(octets) != 4 or any(not 0 <= octet <= 255 for octet in octets):
        return 0
    value = 0
    for octet in octets:
        value = value * 256 + octet
    return value


def preprocess_data(df):
    """Clean the raw flow table and derive model features.

    Steps, in order:
      1. Coerce the numeric columns with ``pd.to_numeric`` and drop rows in
         which any of them could not be parsed.
      2. Remove rows whose Flow.Duration lies outside the 1.5 * IQR fences.
      3. Encode Source.IP / Destination.IP as integers (0 when invalid).
      4. Parse Timestamp with the first matching known format and extract
         Hour, Minute and Second.
      5. Add Total.Packets, Packet.Ratio and Duration.Per.Packet.
      6. Drop Flow.ID, Source.IP, Destination.IP and Timestamp.

    The input DataFrame is not modified; a new DataFrame is returned.

    NOTE: Duration.Per.Packet is computed from Flow.Duration. It is kept in
    the returned frame for analysis, but it must not be used as a predictor
    when Flow.Duration is the target, because it leaks the target.

    Raises:
        ValueError: if Timestamp matches none of the supported formats.
    """
    numeric_columns = ['Source.Port', 'Destination.Port', 'Protocol', 'Flow.Duration', 'Total.Fwd.Packets', 'Total.Backward.Packets']

    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()
    for col in numeric_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        if df[col].isna().any():
            print(f"Warning: Column '{col}' contains non-numeric values. Rows with invalid values will be dropped.")
    df = df.dropna(subset=numeric_columns)

    # Tukey fences on the target; `between` is inclusive on both ends.
    q1 = df['Flow.Duration'].quantile(0.25)
    q3 = df['Flow.Duration'].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    within_fences = df['Flow.Duration'].between(lower_bound, upper_bound)
    n_outliers = int((~within_fences).sum())
    if n_outliers:
        print(f"Detected {n_outliers} outliers in Flow.Duration. Removing them...")
        # Explicit copy: the column assignments below must not target a
        # view of the unfiltered frame.
        df = df[within_fences].copy()

    df['Source.IP.Num'] = df['Source.IP'].apply(_ip_to_numeric)
    df['Destination.IP.Num'] = df['Destination.IP'].apply(_ip_to_numeric)

    # Try each known layout in turn; the first that parses the whole column
    # wins. Day-first is tried before month-first.
    possible_formats = ['%d/%m/%Y %H:%M:%S', '%Y-%m-%d %H:%M:%S', '%m/%d/%Y %H:%M:%S']
    for fmt in possible_formats:
        try:
            df['Timestamp'] = pd.to_datetime(df['Timestamp'], format=fmt)
        except ValueError:
            continue
        break
    else:
        print("Error: Unable to parse Timestamp column.")
        raise ValueError("Invalid Timestamp format")

    df['Hour'] = df['Timestamp'].dt.hour
    df['Minute'] = df['Timestamp'].dt.minute
    df['Second'] = df['Timestamp'].dt.second

    # The 1e-5 term avoids division by zero for flows with no packets.
    df['Total.Packets'] = df['Total.Fwd.Packets'] + df['Total.Backward.Packets']
    df['Packet.Ratio'] = df['Total.Fwd.Packets'] / (df['Total.Backward.Packets'] + 1e-5)
    df['Duration.Per.Packet'] = df['Flow.Duration'] / (df['Total.Packets'] + 1e-5)

    df = df.drop(['Flow.ID', 'Source.IP', 'Destination.IP', 'Timestamp'], axis=1)
    return df

def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, model_name, scaled=False):
    """Fit a model, score it on the test split and cross-validate it.

    Args:
        model: An estimator, or a GridSearchCV wrapping one. For a
            GridSearchCV the search is run and its best estimator is used.
        X_train, X_test: Feature matrices, already in the representation the
            model should see (i.e. scaled by the caller if scaling is wanted).
        y_train, y_test: Target vectors matching the feature matrices.
        model_name: Label used in the printed report.
        scaled: Kept for backward compatibility and ignored. The function
            never rescales its inputs.

    Returns:
        A tuple ``(y_pred, metrics, best_model)`` where ``metrics`` is a dict
        with keys 'MSE', 'RMSE', 'MAE', 'R2' and 'CV_R2'.
    """
    if isinstance(model, GridSearchCV):
        model.fit(X_train, y_train)
        best_model = model.best_estimator_
        print(f"Best parameters for {model_name}: {model.best_params_}")
    else:
        best_model = model
        best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Cross-validate on exactly the matrix the model was fitted on. Applying
    # a scaler here would transform already-scaled data a second time, so
    # the CV score would describe a different feature space than the test
    # score reported above.
    cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='r2')
    cv_mean = cv_scores.mean()

    print(f"\n--- {model_name} ---")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"R-squared Score: {r2:.2f}")
    print(f"Cross-Validation R-squared: {cv_mean:.2f}")

    return y_pred, {'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'R2': r2, 'CV_R2': cv_mean}, best_model

def plot_actual_vs_predicted(y_test, predictions, model_names):
    """Save a scatter plot of predicted against actual flow duration.

    One scatter series is drawn per (prediction, name) pair, together with a
    dashed diagonal spanning the range of ``y_test`` that marks a perfect
    prediction. The figure is written to 'actual_vs_predicted.png'.
    """
    plt.figure(figsize=(12, 8))

    for series, label in zip(predictions, model_names):
        plt.scatter(y_test, series, label=label, alpha=0.6)

    # Reference line y = x across the observed range of the target.
    diagonal = [y_test.min(), y_test.max()]
    plt.plot(diagonal, [y_test.min(), y_test.max()], 'k--', lw=2)

    plt.xlabel('Actual Flow Duration')
    plt.ylabel('Predicted Flow Duration')
    plt.title('Actual vs Predicted Flow Duration')
    plt.legend()
    plt.grid(True)

    plt.savefig('actual_vs_predicted.png')
    plt.close()

def plot_feature_importance(model, model_name, feature_names):
    """Save a bar chart of a model's feature importances, largest first.

    Models that do not expose ``feature_importances_`` are skipped silently.
    The output file name is 'feature_importance_<model name>.png', with the
    model name lower-cased and spaces replaced by underscores.
    """
    if not hasattr(model, 'feature_importances_'):
        return

    importances = model.feature_importances_
    # Indices of the features sorted by descending importance.
    order = np.argsort(importances)[::-1]
    positions = range(len(importances))
    labels = [feature_names[idx] for idx in order]
    slug = model_name.lower().replace(" ", "_")

    plt.figure(figsize=(10, 6))
    plt.bar(positions, importances[order], align='center')
    plt.xticks(positions, labels, rotation=45, ha='right')
    plt.title(f'Feature Importance - {model_name}')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.tight_layout()
    plt.savefig(f'feature_importance_{slug}.png')
    plt.close()

def plot_model_comparison(metrics_dict):
    """Save a per-metric bar-chart comparison of the models.

    Args:
        metrics_dict: Mapping of model name to a dict of metric name to
            value (for example {'Random Forest': {'MSE': ..., 'R2': ...}}).

    Each metric is drawn in its own subplot with an independent y-axis.
    Error metrics such as MSE and bounded scores such as R2 differ by
    several orders of magnitude, so on a single shared axis the smaller
    ones would be flattened to invisibility. The figure is written to
    'model_comparison.png'.

    Raises:
        ValueError: if ``metrics_dict`` holds no metrics to plot.
    """
    metrics_df = pd.DataFrame(metrics_dict).T
    if metrics_df.empty:
        raise ValueError("metrics_dict contains no metrics to plot")

    metric_names = list(metrics_df.columns)
    # squeeze=False keeps `axes` two-dimensional even for a single metric.
    fig, axes = plt.subplots(
        1, len(metric_names),
        figsize=(max(12, 3 * len(metric_names)), 8),
        squeeze=False,
    )
    for ax, metric in zip(axes[0], metric_names):
        metrics_df[metric].plot(kind='bar', ax=ax, rot=45)
        ax.set_title(metric)
        ax.set_xlabel('Model')
        ax.set_ylabel('Score')

    fig.suptitle('Model Performance Comparison')
    # Leave headroom at the top so the figure title does not overlap the
    # subplot titles.
    fig.tight_layout(rect=[0, 0, 1, 0.95])
    fig.savefig('model_comparison.png')
    plt.close(fig)

# --- Main Execution ---

# Pipeline: load -> explore -> preprocess -> split/scale -> train four
# regressors -> plot and export metrics -> report the best model.
file_path = '/content/Network_rate_data.csv'
df = load_and_validate_data(file_path)
explore_data(df)

df = preprocess_data(df)

# 'Duration.Per.Packet' is Flow.Duration divided by Total.Packets. Together
# with Total.Packets it lets a model reconstruct the target exactly, so it
# must be excluded from the predictors to avoid target leakage.
target_derived_features = ['Duration.Per.Packet']
X = df.drop(columns='Flow.Duration').drop(columns=target_derived_features, errors='ignore')
y = df['Flow.Duration']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The scaler is fitted on the training split only and reused for the test
# split, so no test-set statistics reach the models.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr_model = LinearRegression()
rf_params = {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]}
rf_model = GridSearchCV(RandomForestRegressor(random_state=42), rf_params, cv=5, scoring='r2')
xgb_params = {'n_estimators': [100, 200], 'max_depth': [3, 5, 7]}
xgb_model = GridSearchCV(xgb.XGBRegressor(random_state=42, objective='reg:squarederror'), xgb_params, cv=5, scoring='r2')
svr_model = GridSearchCV(SVR(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}, cv=5, scoring='r2')

# (estimator, display name, whether it is trained on the scaled features)
models = [
    (lr_model, "Linear Regression", True),
    (rf_model, "Random Forest", False),
    (xgb_model, "XGBoost", True),
    (svr_model, "Support Vector Regressor", True)
]

predictions = []
metrics_dict = {}
best_models = {}
for model, name, scaled in models:
    X_t = X_train_scaled if scaled else X_train
    X_te = X_test_scaled if scaled else X_test
    # X_t is already scaled where required, so no further scaling may be
    # applied during cross-validation: `scaled` is left at its default.
    y_pred, metrics, fitted_model = train_and_evaluate_model(model, X_t, X_te, y_train, y_test, name)
    predictions.append(y_pred)
    metrics_dict[name] = metrics
    best_models[name] = fitted_model

    if name in ["Random Forest", "XGBoost"]:
        plot_feature_importance(fitted_model, name, X.columns)

plot_actual_vs_predicted(y_test, predictions, [name for _, name, _ in models])
plot_model_comparison(metrics_dict)

metrics_df = pd.DataFrame(metrics_dict).T
metrics_df.to_csv('model_metrics.csv')

# Pick the winner by test-set R-squared.
best_model_name = max(metrics_dict, key=lambda model_name: metrics_dict[model_name]['R2'])
best_model_instance = best_models[best_model_name]
print(f"\nBest Model (based on R-squared): {best_model_name}")

# Feed the winner a sample in the same representation it was trained on.
uses_scaled_features = {name: flag for _, name, flag in models}[best_model_name]
sample = (X_test_scaled[-1] if uses_scaled_features else X_test.iloc[-1].values).reshape(1, -1)
predicted_duration = best_model_instance.predict(sample)
print(f"Predicted Flow Duration for the last test sample: {predicted_duration[0]:.2f}")

print("\n--- Summary of Findings ---")
print(f"The best-performing model was {best_model_name} with an R-squared of {metrics_dict[best_model_name]['R2']:.2f}.")
print("Key findings:")
# Report what this run actually measured rather than fixed conclusions.
ranking = sorted(metrics_dict, key=lambda model_name: metrics_dict[model_name]['R2'], reverse=True)
ranking_text = ", ".join(f"{model_name} ({metrics_dict[model_name]['R2']:.2f})" for model_name in ranking)
print(f"- Models ranked by test R-squared: {ranking_text}")
for tree_model_name in ("Random Forest", "XGBoost"):
    tree_model = best_models.get(tree_model_name)
    if tree_model is not None and hasattr(tree_model, 'feature_importances_'):
        top_indices = np.argsort(tree_model.feature_importances_)[::-1][:3]
        top_features = ", ".join(str(X.columns[i]) for i in top_indices)
        print(f"- Most important features for {tree_model_name}: {top_features}")

Data loaded successfully!
Dataset shape: (1500, 10)

--- Data Exploration ---
        Source.Port  Destination.Port     Protocol  Flow.Duration  \
count   1500.000000       1500.000000  1500.000000     1500.00000   
mean   33215.566000       4666.183333    11.500000     4174.94000   
std    18774.521131      12602.944308     5.501834     2806.24163   
min     1041.000000         21.000000     6.000000      -82.00000   
25%    16409.500000         22.000000     6.000000     1951.50000   
50%    33594.500000         53.000000    11.500000     3464.00000   
75%    49425.250000        443.000000    17.000000     5741.50000   
max    65501.000000      64536.000000    17.000000    14618.00000   

       Total.Fwd.Packets  Total.Backward.Packets  
count        1500.000000             1500.000000  
mean           76.363333               50.160667  
std            43.347203               28.827826  
min             1.000000                1.000000  
25%            39.000000               24.000




--- Linear Regression ---
Mean Squared Error (MSE): 699769.68
Root Mean Squared Error (RMSE): 836.52
Mean Absolute Error (MAE): 616.83
R-squared Score: 0.91
Cross-Validation R-squared: 0.90
Best parameters for Random Forest: {'max_depth': 10, 'n_estimators': 200}

--- Random Forest ---
Mean Squared Error (MSE): 66902.13
Root Mean Squared Error (RMSE): 258.65
Mean Absolute Error (MAE): 138.32
R-squared Score: 0.99
Cross-Validation R-squared: 0.99
Best parameters for XGBoost: {'max_depth': 5, 'n_estimators': 200}





--- XGBoost ---
Mean Squared Error (MSE): 40489.67
Root Mean Squared Error (RMSE): 201.22
Mean Absolute Error (MAE): 138.59
R-squared Score: 0.99
Cross-Validation R-squared: 0.99
Best parameters for Support Vector Regressor: {'C': 10, 'kernel': 'linear'}





--- Support Vector Regressor ---
Mean Squared Error (MSE): 792991.42
Root Mean Squared Error (RMSE): 890.50
Mean Absolute Error (MAE): 623.89
R-squared Score: 0.90
Cross-Validation R-squared: 0.02

Best Model (based on R-squared): XGBoost
Predicted Flow Duration for the last test sample: 3240.51

--- Summary of Findings ---
The best-performing model was XGBoost with an R-squared of 0.99.
Key findings:
- Random Forest and XGBoost generally outperformed Linear Regression and SVR.
- Feature importance plots highlight key predictors like Total.Fwd.Packets and Protocol.
