# TCP Classification - Model Training and Evaluation

This notebook assumes the `CapstoneProject_eda` notebook has already been run to generate the cleaned and labeled data set (`data/cleaned_data.csv`).

# 1. Data Loading and Prep

## Load and Split Data

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the cleaned, labeled data set written by the EDA notebook
data = pd.read_csv('data/cleaned_data.csv')

# Separate the target column (y) from the predictor columns (X)
y = data['label']
X = data.drop(columns='label')

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 707534 entries, 0 to 707533
Data columns (total 36 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   SourceIP                                707534 non-null  int64  
 1   DestinationIP                           707534 non-null  int64  
 2   SourcePort                              707534 non-null  int64  
 3   DestinationPort                         707534 non-null  int64  
 4   Duration                                707534 non-null  float64
 5   FlowBytesSent                           707534 non-null  int64  
 6   FlowSentRate                            707534 non-null  float64
 7   FlowBytesReceived                       707534 non-null  int64  
 8   FlowReceivedRate                        707534 non-null  float64
 9   PacketLengthVariance                    707534 non-null  float64
 10  PacketLengthStandardDeviation           7075

## Data Scaling

In [30]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean/variance from the training split only, then
# apply those same statistics to both splits so no test-set information
# leaks into the scaling step.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## 2. Modeling

### Logistic Regression

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# n_jobs is set on cross_val_score below rather than on the estimator:
# a single binary saga fit is not parallelized by the estimator-level option,
# whereas the five CV folds can run concurrently.
logistic_model = LogisticRegression(random_state=42, max_iter=500, solver='saga')

# Standardize inside the pipeline so every CV fold fits its scaler on that
# fold's training rows only. The saga solver is only guaranteed to converge
# quickly when features share roughly the same scale. Re-standardizing is
# harmless if X_train was already standardized by the scaling cell.
logistic_pipeline = make_pipeline(StandardScaler(), logistic_model)

# Cross-validate on the training split only, so the held-out test rows
# (X_test / y_test) never influence model selection.
logistic_cv_scores = cross_val_score(
    logistic_pipeline, X_train, y_train, cv=5, scoring='f1', n_jobs=-1
)



### Random Forest

In [32]:
from sklearn.ensemble import RandomForestClassifier
# Imported here as well so this cell runs on a fresh kernel without relying
# on another modeling cell having been executed first.
from sklearn.model_selection import cross_val_score

# Initialize random forest (trees are built in parallel via n_jobs=-1)
random_forest_model = RandomForestClassifier(random_state=42, n_estimators=50, n_jobs=-1)

# Cross-validate on the training split only, so the held-out test rows
# (X_test / y_test) never influence model selection. No scaling step is
# added because tree splits do not depend on feature scale.
rf_cv_scores = cross_val_score(random_forest_model, X_train, y_train, cv=5, scoring='f1')

Accuracy: 1.0


### SVM

In [None]:
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Initialize a linear support vector machine.
# SVC(kernel='linear') has fit time that grows at least quadratically with
# the number of samples, which is impractical for the ~700k rows loaded above.
# LinearSVC is the scikit-learn estimator intended for large linear problems.
# dual=False solves the primal problem, the recommended setting when there
# are more samples than features.
svm_model = LinearSVC(random_state=42, dual=False)

# Standardize inside the pipeline so every CV fold fits its scaler on that
# fold's training rows only; linear SVMs are sensitive to feature scale.
svm_pipeline = make_pipeline(StandardScaler(), svm_model)

# Cross-validate on the training split only, so the held-out test rows
# (X_test / y_test) never influence model selection.
svm_cv_scores = cross_val_score(
    svm_pipeline, X_train, y_train, cv=5, scoring='f1', n_jobs=-1
)

## 3. Comparison of Models

In [None]:
# Report the mean cross-validated F1 score of each model, one per line
print(
    f'Logistic Regression Mean F1-Score: {logistic_cv_scores.mean()}',
    f'Random Forest Mean F1-Score: {rf_cv_scores.mean()}',
    f'SVM Mean F1-Score: {svm_cv_scores.mean()}',
    sep='\n',
)