In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

In [5]:
# Load the processed data
data = pd.read_csv('Processed.csv')

# Separate features and target labels
X = data.drop('app', axis=1)  # Replace 'label' with the actual target column name
y = data['app']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [6]:
pca = PCA().fit(X_scaled)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(cumulative_variance >= 0.95) + 1  # Choose components that reach 95% variance
print("Optimal number of components:", n_components)

# Applying PCA with optimal components
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

Optimal number of components: 15


In [7]:
# Define autoencoder structure
input_dim = X_pca.shape[1]
encoding_dim = 25  # Latent space dimension, adjust as needed

# Encoder
input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation="relu")(input_layer)

# Decoder
decoder = Dense(input_dim, activation="sigmoid")(encoder)

# Autoencoder Model
autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mse')

# Train autoencoder
autoencoder.fit(X_pca, X_pca, epochs=50, batch_size=32, shuffle=True, validation_split=0.2)


Epoch 1/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - loss: 2.0401 - val_loss: 1.6460
Epoch 2/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.9319 - val_loss: 1.5844
Epoch 3/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 1.8022 - val_loss: 1.5315
Epoch 4/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.9527 - val_loss: 1.4885
Epoch 5/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 2.1284 - val_loss: 1.4532
Epoch 6/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.7735 - val_loss: 1.4216
Epoch 7/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.5900 - val_loss: 1.3961
Epoch 8/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.5451 - val_loss: 1.3732
Epoch 9/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x1e086220f90>

In [8]:
encoder_model = Model(inputs=input_layer, outputs=encoder)
X_latent = encoder_model.predict(X_pca)
# Split the data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_latent, y, test_size=0.3, random_state=42)

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [9]:
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

In [10]:
# Predict and evaluate on test data
y_pred = classifier.predict(X_test)

# Calculate accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Accuracy: 0.7203389830508474
Classification Report:
               precision    recall  f1-score   support

           1       0.50      0.46      0.48        13
           2       0.67      0.50      0.57        16
           3       0.89      0.93      0.91        44
           4       0.71      0.68      0.70        22
           5       0.87      0.89      0.88        46
           6       0.73      0.90      0.81        63
           7       0.58      0.70      0.64        10
           8       0.67      0.24      0.35        17
           9       0.50      0.46      0.48        13
          10       0.54      0.35      0.42        20
          11       0.43      0.55      0.48        22
          12       0.82      0.72      0.77        39
          13       0.62      0.89      0.73         9
          14       0.75      0.75      0.75        20

    accuracy                           0.72       354
   macro avg       0.66      0.64      0.64       354
weighted avg       0.72    

In [11]:
unique_classes = data['app'].unique()
for class_label in unique_classes:
  y_true_binary = (y_test == class_label)
  y_pred_binary = (y_pred == class_label)
  print(f"\nClassification Report for Class {class_label}:")
  print(classification_report(y_true_binary, y_pred_binary))


Classification Report for Class 1:
              precision    recall  f1-score   support

       False       0.98      0.98      0.98       341
        True       0.50      0.46      0.48        13

    accuracy                           0.96       354
   macro avg       0.74      0.72      0.73       354
weighted avg       0.96      0.96      0.96       354


Classification Report for Class 2:
              precision    recall  f1-score   support

       False       0.98      0.99      0.98       338
        True       0.67      0.50      0.57        16

    accuracy                           0.97       354
   macro avg       0.82      0.74      0.78       354
weighted avg       0.96      0.97      0.96       354


Classification Report for Class 3:
              precision    recall  f1-score   support

       False       0.99      0.98      0.99       310
        True       0.89      0.93      0.91        44

    accuracy                           0.98       354
   macro avg       0

In [12]:
#without scaling technique
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
data = pd.read_csv('Processed.csv')

# Separate features and target
X = data.drop(columns=['app'])
y = data['app']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set 
y_pred = rf_model.predict(X_test)

# Calculate accuracy and display classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.8813559322033898
Classification Report:
              precision    recall  f1-score   support

           1       0.50      0.75      0.60         4
           2       0.75      0.60      0.67        10
           3       1.00      0.97      0.98        32
           4       0.87      1.00      0.93        13
           5       1.00      0.93      0.97        30
           6       0.93      1.00      0.96        40
           7       0.80      0.89      0.84         9
           8       0.43      0.33      0.38         9
           9       0.56      0.62      0.59         8
          10       0.90      0.64      0.75        14
          11       0.81      0.81      0.81        16
          12       0.90      0.93      0.91        28
          13       1.00      1.00      1.00         6
          14       0.94      1.00      0.97        17

    accuracy                           0.88       236
   macro avg       0.81      0.82      0.81       236
weighted avg       0.88     

In [13]:
#with standradization = z-score normalisation
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
data = pd.read_csv('Processed.csv')

# Separate features and target
X = data.drop(columns=['app'])
y = data['app']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Calculate accuracy and display classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


Accuracy: 0.885593220338983
Classification Report:
              precision    recall  f1-score   support

           1       0.50      0.75      0.60         4
           2       0.75      0.60      0.67        10
           3       1.00      0.97      0.98        32
           4       0.87      1.00      0.93        13
           5       1.00      0.93      0.97        30
           6       0.93      1.00      0.96        40
           7       0.80      0.89      0.84         9
           8       0.44      0.44      0.44         9
           9       0.56      0.62      0.59         8
          10       0.90      0.64      0.75        14
          11       0.81      0.81      0.81        16
          12       0.93      0.93      0.93        28
          13       1.00      1.00      1.00         6
          14       1.00      1.00      1.00        17

    accuracy                           0.89       236
   macro avg       0.82      0.83      0.82       236
weighted avg       0.89      

In [14]:
unique_classes = data['app'].unique()
for class_label in unique_classes:
  y_true_binary = (y_test == class_label)
  y_pred_binary = (y_pred == class_label)
  print(f"\nClassification Report for Class {class_label}:")
  print(classification_report(y_true_binary, y_pred_binary))


Classification Report for Class 1:
              precision    recall  f1-score   support

       False       1.00      0.99      0.99       232
        True       0.50      0.75      0.60         4

    accuracy                           0.98       236
   macro avg       0.75      0.87      0.80       236
weighted avg       0.99      0.98      0.98       236


Classification Report for Class 2:
              precision    recall  f1-score   support

       False       0.98      0.99      0.99       226
        True       0.75      0.60      0.67        10

    accuracy                           0.97       236
   macro avg       0.87      0.80      0.83       236
weighted avg       0.97      0.97      0.97       236


Classification Report for Class 3:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       204
        True       1.00      0.97      0.98        32

    accuracy                           1.00       236
   macro avg       1

In [15]:
# Save trained models
classifier_filename = 'traffic_classifier.pkl'

import joblib
joblib.dump(rf_model, classifier_filename)


['traffic_classifier.pkl']

In [16]:
from sklearn.preprocessing import MinMaxScaler

# Normalize the features using MinMaxScaler
normalizer = MinMaxScaler()
X_normalized = normalizer.fit_transform(X)

# Split the data into training and testing sets with normalized data
X_train_norm, X_test_norm, y_train_norm, y_test_norm = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

# Train the Random Forest model on normalized data
rf_model_norm = RandomForestClassifier(random_state=42)
rf_model_norm.fit(X_train_norm, y_train_norm)

# Make predictions on the test set with normalized data
y_pred_norm = rf_model_norm.predict(X_test_norm)

# Calculate accuracy and classification report for the normalized data
accuracy_norm = accuracy_score(y_test_norm, y_pred_norm)
report_norm = classification_report(y_test_norm, y_pred_norm)

print(f"Accuracy: {accuracy_norm}")
print("Classification Report:")
print(report_norm)


Accuracy: 0.8728813559322034
Classification Report:
              precision    recall  f1-score   support

           1       0.43      0.75      0.55         4
           2       0.75      0.60      0.67        10
           3       1.00      0.97      0.98        32
           4       0.75      0.92      0.83        13
           5       0.96      0.87      0.91        30
           6       0.93      1.00      0.96        40
           7       0.80      0.89      0.84         9
           8       0.50      0.44      0.47         9
           9       0.60      0.75      0.67         8
          10       0.89      0.57      0.70        14
          11       0.81      0.81      0.81        16
          12       0.96      0.93      0.95        28
          13       1.00      1.00      1.00         6
          14       0.94      1.00      0.97        17

    accuracy                           0.87       236
   macro avg       0.81      0.82      0.81       236
weighted avg       0.88     

In [20]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
data = pd.read_csv('Processed.csv')

# Separate features and target
X = data.drop(columns=['app'])
y = data['app']

# List of scalers to test
scalers = {
    "Z-Score Standardization": StandardScaler(),
    "Min-Max Scaling": MinMaxScaler(),
    "Max Absolute Scaling": MaxAbsScaler(),
    "Robust Scaling": RobustScaler(),
    "L2 Normalization": Normalizer(norm='l2')
}

results = {}

# Iterate over each scaling method
for name, scaler in scalers.items():
    print(f"Using {name}")
    
    # Scale features
    X_scaled = scaler.fit_transform(X)
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
    
    # Initialize and train the model
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = rf_model.predict(X_test)
    
    # Calculate accuracy and save results
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    results[name] = {
        "Accuracy": accuracy,
        "Classification Report": report
    }

# Summary of results
print("\nSummary of Scaling Method Accuracies:")
for name, metrics in results.items():
    print(f"{name}: {metrics['Accuracy']:.4f}")


Using Z-Score Standardization
Accuracy: 0.885593220338983
Classification Report:
              precision    recall  f1-score   support

           1       0.50      0.75      0.60         4
           2       0.75      0.60      0.67        10
           3       1.00      0.97      0.98        32
           4       0.87      1.00      0.93        13
           5       1.00      0.93      0.97        30
           6       0.93      1.00      0.96        40
           7       0.80      0.89      0.84         9
           8       0.44      0.44      0.44         9
           9       0.56      0.62      0.59         8
          10       0.90      0.64      0.75        14
          11       0.81      0.81      0.81        16
          12       0.93      0.93      0.93        28
          13       1.00      1.00      1.00         6
          14       1.00      1.00      1.00        17

    accuracy                           0.89       236
   macro avg       0.82      0.83      0.82       236

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
#with standradization = z-score normalisation
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
data = pd.read_csv('Processed.csv')

# Separate features and target
X = data.drop(columns=['app'])
y = data['app']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Calculate accuracy and display classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


Accuracy: 0.885593220338983
Classification Report:
              precision    recall  f1-score   support

           1       0.50      0.75      0.60         4
           2       0.75      0.60      0.67        10
           3       1.00      0.97      0.98        32
           4       0.87      1.00      0.93        13
           5       1.00      0.93      0.97        30
           6       0.93      1.00      0.96        40
           7       0.80      0.89      0.84         9
           8       0.44      0.44      0.44         9
           9       0.56      0.62      0.59         8
          10       0.90      0.64      0.75        14
          11       0.81      0.81      0.81        16
          12       0.93      0.93      0.93        28
          13       1.00      1.00      1.00         6
          14       1.00      1.00      1.00        17

    accuracy                           0.89       236
   macro avg       0.82      0.83      0.82       236
weighted avg       0.89      

In [5]:
#testing model
import pandas as pd

# Load the uploaded file
file_path = 'C:/Users/Lenovo/Documents/mini-project/Processed.csv'
data = pd.read_csv(file_path)

# Sample 30 rows from the data
sample_data = data.sample(n=30, random_state=1)

# Save the sampled data to a new CSV file
output_path = 'test_set.csv'
sample_data.to_csv(output_path, index=False)

output_path


'test_set.csv'

In [23]:

# Load the dataset
data = pd.read_csv('test_set.csv')

# Separate features and target
X = data.drop(columns=['app'])
y = data['app']

# Standardize the features
scaler = StandardScaler()
X_test = scaler.fit_transform(X)

y_pred = rf_model.predict(X_test)

results_df = pd.DataFrame({'Actual': y, 'Predicted': y_pred,'result':y==y_pred})

print(results_df)

    Actual  Predicted  result
0        7          6   False
1        6          6    True
2       13         13    True
3        5          5    True
4       13         13    True
5        3          3    True
6        7          6   False
7       14         13   False
8       10          8   False
9        6          6    True
10      10         10    True
11      14         14    True
12       9          9    True
13       4          5   False
14      14         13   False
15      14         14    True
16       3          3    True
17       5          5    True
18       6          6    True
19       3          3    True
20      12         12    True
21       4          4    True
22       6          6    True
23       3          3    True
24       6          6    True
25      12         12    True
26       6          6    True
27       2          1   False
28      14         13   False
29       7          6   False


In [25]:
accuracy = (results_df['result'].sum() / len(results_df)) * 100

print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 70.00%


In [27]:
# Count the number of samples for each class in the 'app' column
cdata=pd.read_csv('Processed.csv')
class_counts = cdata['app'].value_counts()

print("Number of samples for each class:")
print(class_counts)


Number of samples for each class:
app
6     212
3     157
5     131
12    112
14     86
11     72
4      64
10     62
9      59
7      56
1      45
13     45
8      42
2      37
Name: count, dtype: int64


In [32]:
total_count = cdata['app'].value_counts().sum()
print(total_count)

1180


In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter

# Load the dataset
data = pd.read_csv('Processed.csv')

# Separate features and target
X = data.drop(columns=['app'])
y = data['app']

# Print the initial class distribution
print("Original class distribution:")
print(y.value_counts())

# Step 1: Resampling - Use SMOTE for oversampling the minority class
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Print the resampled class distribution
print("\nResampled class distribution:")
print(Counter(y_resampled))

# Step 2: Hyperparameter Tuning with GridSearchCV for Random Forest
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for RandomForest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Get the best model after tuning
best_rf_model = grid_search.best_estimator_

# Step 3: Evaluate the model performance on the test set
y_pred = best_rf_model.predict(X_test_scaled)

# Calculate accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
print(f"\nOptimized Model Accuracy: {accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Original class distribution:
app
6     212
3     157
5     131
12    112
14     86
11     72
4      64
10     62
9      59
7      56
1      45
13     45
8      42
2      37
Name: count, dtype: int64

Resampled class distribution:
Counter({1: 212, 2: 212, 3: 212, 4: 212, 5: 212, 6: 212, 7: 212, 8: 212, 9: 212, 10: 212, 11: 212, 12: 212, 13: 212, 14: 212})

Optimized Model Accuracy: 96.97%

Classification Report:
              precision    recall  f1-score   support

           1       1.00      0.98      0.99        45
           2       0.98      0.98      0.98        47
           3       0.98      0.94      0.96        52
           4       0.95      0.97      0.96        40
           5       0.98      0.94      0.96        47
           6       0.92      0.98      0.95        46
           7       0.96      0.98      0.97        45
           8       0.97      0.94      0.95        32
           9       0.98      1.00      0.99        43
          10       0.98      0.91      0.94 

In [36]:
# After performing GridSearchCV and getting the best model
print("Best hyperparameters found by GridSearchCV:")
print(grid_search.best_params_)

# To see all the combinations of hyperparameters and their corresponding results:
results = grid_search.cv_results_

# Convert the results to a DataFrame for better visualization
results_df = pd.DataFrame(results)

# Displaying the top few rows of the DataFrame to see the results of different hyperparameter combinations
print("\nGrid Search Results (Top 5 combinations):")
print(results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].head())


Best hyperparameters found by GridSearchCV:
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}

Grid Search Results (Top 5 combinations):
                                              params  mean_test_score  \
0  {'max_depth': 10, 'min_samples_leaf': 1, 'min_...         0.925441   
1  {'max_depth': 10, 'min_samples_leaf': 1, 'min_...         0.927124   
2  {'max_depth': 10, 'min_samples_leaf': 1, 'min_...         0.932600   
3  {'max_depth': 10, 'min_samples_leaf': 1, 'min_...         0.920806   
4  {'max_depth': 10, 'min_samples_leaf': 1, 'min_...         0.923752   

   std_test_score  rank_test_score  
0        0.018675               59  
1        0.011900               58  
2        0.013196               55  
3        0.011290               67  
4        0.014520               62  


In [46]:
# Load the dataset
test_data = pd.read_csv('test_set.csv')

# Separate features and target
X = test_data.drop(columns=['app'])
y = test_data['app']

# Standardize the features
scaler = StandardScaler()
X_test = scaler.fit_transform(X)

y_pred = best_rf_model.predict(X_test)

results_df = pd.DataFrame({'Actual': y, 'Predicted': y_pred,'result':y==y_pred})

print(results_df)

    Actual  Predicted  result
0        7          6   False
1        6          6    True
2       13         13    True
3        5          5    True
4       13         13    True
5        3          3    True
6        7          6   False
7       14         14    True
8       10         10    True
9        6          6    True
10      10         10    True
11      14         14    True
12       9          9    True
13       4          4    True
14      14         14    True
15      14         14    True
16       3          3    True
17       5          5    True
18       6          6    True
19       3          3    True
20      12         12    True
21       4          4    True
22       6          6    True
23       3          3    True
24       6          6    True
25      12         12    True
26       6          6    True
27       2          1   False
28      14         13   False
29       7          6   False


In [47]:
accuracy = (results_df['result'].sum() / len(results_df)) * 100

print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 83.33%


In [40]:
# Resampling using SMOTE
from imblearn.over_sampling import SMOTE
import pandas as pd

# Load the dataset
data = pd.read_csv('Processed.csv')

# Separate features and target
X = data.drop(columns=['app'])
y = data['app']

# Print the initial class distribution
print("Original class distribution:")
print(y.value_counts())

# Step 1: Resampling - Use SMOTE for oversampling the minority class
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine the resampled features and target into a new DataFrame
resampled_data = pd.DataFrame(X_resampled, columns=X.columns)
resampled_data['app'] = y_resampled

# Save the resampled data to 'balanced_smote.csv'
resampled_data.to_csv('balanced_smote.csv', index=False)

print("Resampled dataset saved to 'balanced.csv'")


Original class distribution:
app
6     212
3     157
5     131
12    112
14     86
11     72
4      64
10     62
9      59
7      56
1      45
13     45
8      42
2      37
Name: count, dtype: int64
Resampled dataset saved to 'balanced.csv'


In [41]:
# Resampling using smote-enn
from imblearn.combine import SMOTEENN
import pandas as pd

# Load the dataset
data = pd.read_csv('Processed.csv')

# Separate features and target
X = data.drop(columns=['app'])
y = data['app']

# Print the initial class distribution
print("Original class distribution:")
print(y.value_counts())

# Step 1: Resampling - Use SMOTEENN for oversampling and noise removal
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

# Combine the resampled features and target into a new DataFrame
resampled_data = pd.DataFrame(X_resampled, columns=X.columns)
resampled_data['app'] = y_resampled

# Save the resampled data to 'balanced_smoteenn.csv'
resampled_data.to_csv('balanced_smoteenn.csv', index=False)

print("Resampled dataset saved to 'balanced_smoteenn.csv'")


Original class distribution:
app
6     212
3     157
5     131
12    112
14     86
11     72
4      64
10     62
9      59
7      56
1      45
13     45
8      42
2      37
Name: count, dtype: int64
Resampled dataset saved to 'balanced_smoteenn.csv'


In [42]:
print("\nResampled class distribution:")
print(Counter(y_resampled))


Resampled class distribution:
Counter({3: 167, 5: 165, 4: 134, 2: 116, 12: 116, 13: 111, 14: 96, 6: 94, 11: 91, 9: 81, 10: 80, 1: 75, 8: 59, 7: 58})
