In [None]:
# Mount Google Drive at /content/drive; the dataset loaded below is read from
# a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw dataset from the Excel workbook stored on the mounted Drive.
# The 'actual Data' column of this frame is used as the class label further down.
data = pd.read_excel(r"/content/drive/MyDrive/Actual.xlsx")


In [None]:
# Show the loaded frame as the cell output for a quick visual sanity check.
data

In [None]:
# Count the rows per distinct value of the 'actual Data' column (the class
# label) to see how balanced the raw data is.
class_counts = data['actual Data'].value_counts()
print(class_counts)

In [None]:
from sklearn.utils import resample

# Balance the classes by upsampling every minority class (sampling rows with
# replacement) until it has as many rows as the largest class.
#
# The largest class is derived from the data instead of being hard-coded: if
# the majority label ever differs from the one assumed when this notebook was
# written, a hard-coded label would leave that class under-represented while
# needlessly bootstrapping the real majority class.
#
# NOTE(review): upsampling happens before the train/test split performed later
# in the notebook, so copies of the same row can end up in both splits and
# inflate held-out metrics. Consider splitting first and upsampling only the
# training part.
value_counts = data['actual Data'].value_counts()

if value_counts.empty:
    raise ValueError("Column 'actual Data' has no labelled rows; nothing to upsample.")

# Target size for every class
max_count = value_counts.max()

# Classes that already have the target size are kept untouched
majority_labels = value_counts.index[value_counts == max_count]

upsampled_dfs = {}

for class_label, count in value_counts.items():
    # Already at the target size: no resampling needed
    if count == max_count:
        continue

    # Rows belonging to the current class
    class_df = data[data['actual Data'] == class_label]

    # Draw max_count rows with replacement; the fixed seed keeps it reproducible
    upsampled_dfs[class_label] = resample(class_df,
                                          replace=True,
                                          n_samples=max_count,
                                          random_state=42)

# Upsampled minority classes first, then the untouched majority rows
upsampled_df_list = list(upsampled_dfs.values()) + [data[data['actual Data'].isin(majority_labels)]]

# Concatenate everything into one balanced frame with a fresh 0..n-1 index
upsampled_df = pd.concat(upsampled_df_list, ignore_index=True)

In [None]:
# Per-class row counts after upsampling, to confirm the classes are balanced.
print(upsampled_df['actual Data'].value_counts())

In [None]:
# Summary statistics of the upsampled frame, transposed so each column of the
# data becomes one row of the summary.
upsampled_df.describe().T

In [None]:
# Print the structural summary (columns, dtypes, non-null counts) of the
# upsampled frame.
upsampled_df.info()

In [None]:
# Columns stored as float64 / int64 -- these are the ones plotted here and
# processed by the outlier step that follows.
numeric_cols = upsampled_df.select_dtypes(include=['float64', 'int64']).columns

# One horizontal box plot per numeric column, stacked vertically
n_numeric = len(numeric_cols)
fig, axes = plt.subplots(nrows=n_numeric, ncols=1, figsize=(10, n_numeric * 4))

# plt.subplots returns a bare Axes (not an array) when there is a single row
axes_list = axes if n_numeric > 1 else [axes]

for ax, col in zip(axes_list, numeric_cols):
    sns.boxplot(data=upsampled_df, x=upsampled_df[col], ax=ax)
    ax.set(title=f'Box Plot for {col}', ylabel=col)

plt.tight_layout()
plt.show()

# Handle outliers and skewed distributions
#
# For every numeric feature: clip values to the 1.5*IQR whiskers, then apply
# log1p if the clipped column is still noticeably skewed (|skew| > 0.5).
#
# NOTE: this cell modifies upsampled_df in place, so running it twice applies
# the clipping/log transform twice. Re-run the upsampling cell first if this
# cell needs to be executed again.
from scipy import stats

# numeric_cols also contains the class label when it is stored as a number.
# The label must never be clipped or log-scaled: that would silently merge or
# distort classes, so it is excluded here.
feature_cols = [col for col in numeric_cols if col != 'actual Data']

# Per-column record of what was applied, kept so the exact same preprocessing
# can be replayed on new, raw inputs.
preprocessing_params = {}

for col in feature_cols:
    # IQR-based whisker bounds
    q1 = upsampled_df[col].quantile(0.25)
    q3 = upsampled_df[col].quantile(0.75)
    iqr = q3 - q1
    outlier_lower = q1 - 1.5 * iqr
    outlier_upper = q3 + 1.5 * iqr
    clipped = upsampled_df[col].clip(outlier_lower, outlier_upper)

    # log1p is only defined for values > -1; applying it to anything else
    # would inject NaN / -inf into the training data, so such columns are
    # left on their original (clipped) scale.
    skewness = stats.skew(clipped)
    use_log = bool(abs(skewness) > 0.5 and clipped.min() > -1)

    upsampled_df[col] = np.log1p(clipped) if use_log else clipped
    preprocessing_params[col] = {
        'lower': outlier_lower,
        'upper': outlier_upper,
        'log1p': use_log,
    }

In [None]:
# Redraw the per-column box plots on the current state of upsampled_df so they
# can be compared with the plots drawn before the outlier/skew step.
plot_count = len(numeric_cols)
fig, axes = plt.subplots(nrows=plot_count, ncols=1, figsize=(10, plot_count * 4))

# With a single subplot, plt.subplots hands back one Axes instead of an array
subplot_axes = [axes] if plot_count <= 1 else axes

for col, ax in zip(numeric_cols, subplot_axes):
    sns.boxplot(data=upsampled_df, x=upsampled_df[col], ax=ax)
    ax.set(title=f'Box Plot for {col}', ylabel=col)

plt.tight_layout()
plt.show()

In [None]:
# Re-check the per-class counts now that the clip / log1p cell has modified
# upsampled_df.
print(upsampled_df['actual Data'].value_counts())

In [None]:
# Summary statistics again, this time reflecting the clipped / log-transformed
# values (one row per column of the data).
upsampled_df.describe().T

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Separate the class label ('actual Data') from the feature columns.
y = upsampled_df['actual Data']
X = upsampled_df.drop(columns='actual Data')

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
# NOTE(review): upsampled_df was built by sampling rows with replacement, so
# identical rows can appear on both sides of this split -- test metrics are
# likely optimistic. Confirm whether the split should happen before upsampling.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics on the training rows only, then apply them to
# both splits so no test information enters the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Map the class labels to consecutive integers 0..n_classes-1, using the label
# set seen in the training split.
encoder = LabelEncoder().fit(y_train)
y_train_encoded = encoder.transform(y_train)
y_test_encoded = encoder.transform(y_test)

In [None]:
import tensorflow as tf
from tensorflow import keras

# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation and batch shuffling are repeatable across "Restart & Run All"
# (the rest of the notebook already pins random_state=42).
keras.utils.set_random_seed(42)

# Number of input features (columns of the scaled training matrix)
num_features = X_train_scaled.shape[1]

# Number of distinct class labels learned by the label encoder
num_classes = len(encoder.classes_)

# Small fully-connected classifier: two ReLU hidden layers and a softmax
# output with one unit per class. The input shape is declared with an explicit
# keras.Input layer rather than the `input_shape=` argument on the first Dense
# layer, which current Keras versions warn about for Sequential models.
model = keras.Sequential([
    keras.Input(shape=(num_features,)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(num_classes, activation='softmax')
])

# The sparse categorical cross-entropy loss matches the integer-encoded labels
# produced by LabelEncoder (no one-hot encoding required).
optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model; 20% of the training rows are held back by Keras as a
# validation set, and the per-epoch metrics are kept in `history`.
history = model.fit(X_train_scaled, y_train_encoded,
                    batch_size=32,
                    epochs=100,
                    validation_split=0.2,
                    verbose=1)

In [None]:
def _plot_history_curves(x_values, train_values, val_values,
                         train_label, val_label, title, y_label):
    """Draw one training-vs-validation curve pair on a fresh figure and show it."""
    plt.plot(x_values, train_values, 'y', label=train_label)
    plt.plot(x_values, val_values, 'r', label=val_label)
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()


# Loss per epoch (x axis is 1-based epoch numbers)
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
_plot_history_curves(epochs, loss, val_loss,
                     'Training loss', 'Validation loss',
                     'Training and validation loss', 'Loss')

# Accuracy per epoch
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
_plot_history_curves(epochs, acc, val_acc,
                     'Training acc', 'Validation acc',
                     'Training and validation accuracy', 'Accuracy')

In [None]:
from sklearn.pipeline import Pipeline
# Define the transformers
scaler = StandardScaler()
# Not used by the pipeline below; the name is kept so the notebook namespace
# stays unchanged.
label_encoder = LabelEncoder()

# Create the pipeline (feature scaling only)
pipeline = Pipeline([
    ('scaler', scaler),
])

# Fit the pipeline on the same training features the model was trained on
pipeline.fit(X_train)

# For inference, use the pipeline to transform the input data
def predict(input_data):
    """Return the model's class probabilities for one or more samples.

    Parameters
    ----------
    input_data : array-like
        Either a single sample given as a flat sequence of feature values, or
        a 2-D array-like of shape (n_samples, n_features). Values must be in
        the same column order as ``X_train``.

    Returns
    -------
    numpy.ndarray
        Output of ``model.predict``: one row per sample, one column per class
        (columns follow the label encoder's class order).

    Raises
    ------
    ValueError
        If the input is not 1-D/2-D or its feature count differs from the
        number of columns in ``X_train``.

    Notes
    -----
    NOTE(review): the training features were clipped to IQR bounds and, for
    skewed columns, log1p-transformed before scaling. This function only
    applies the scaler, so raw inputs are not on the same scale as the
    training data. Confirm and replay that preprocessing here before trusting
    predictions on raw values.
    """
    # Accept a flat list/array for a single sample as well as a 2-D batch
    features = np.atleast_2d(np.asarray(input_data, dtype=float))

    expected_features = X_train.shape[1]
    if features.ndim != 2 or features.shape[1] != expected_features:
        raise ValueError(
            f"Expected input with {expected_features} features per sample, "
            f"got array of shape {features.shape}"
        )

    # Re-attach the training column names: the scaler was fitted on a
    # DataFrame, and a bare array would trigger sklearn's feature-name warning.
    features_df = pd.DataFrame(features, columns=X_train.columns)

    transformed_data = pipeline.transform(features_df)
    prediction = model.predict(transformed_data)
    return prediction

In [None]:
# Edit the feature values below to run inference on a different sample
# (a single row, i.e. shape (1, n_features)).
x_sample = np.array([[376, 575, 146, 1092, 146]])

# Class probabilities for the sample, one row per input sample
y_pred = predict(x_sample)

# Index of the most probable class for the first (only) sample
predicted_class = y_pred.argmax(axis=1)[0]

# Map the encoded index back to the original label and report it
predicted_label = encoder.classes_[predicted_class]
print(f"Predicted label: {predicted_label}")

# Report the probability assigned to every class, in encoder order
print("\nPredicted probabilities:")
for class_name, prob in zip(encoder.classes_, y_pred[0]):
    print(f"Class {class_name}: {prob * 100:.2f}%")