In [10]:
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Train a random-forest classifier on the table stored in data.csv and persist
# the fitted model together with its label encoder in model.p.

# Fixed seed shared by the split and the forest so that the reported accuracy
# and the saved model are reproducible from one run to the next.
RANDOM_STATE = 42

# Load data from CSV
csv_file_path = 'data.csv'
df = pd.read_csv(csv_file_path)

# Drop rows with NaN values so that only complete samples are used
df = df.dropna()

# Extract data and labels
data = df.iloc[:, :-1].values  # Features (all columns except the last one)
labels = df.iloc[:, -1].values  # Labels (last column)

# Convert labels to numeric values; the fitted encoder is saved with the model
# so predictions can be mapped back to the original label values.
label_encoder = LabelEncoder()
labels_numeric = label_encoder.fit_transform(labels)

# Split data: 80% train / 20% test, stratified so both parts keep the class
# proportions of the full data set.
# NOTE: stratified splitting needs at least two samples of every class.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    labels_numeric,
    test_size=0.2,
    shuffle=True,
    stratify=labels_numeric,
    random_state=RANDOM_STATE,
)

# Debugging: Print shapes and data types
print("x_train shape:", x_train.shape, "  x_train dtype:", x_train.dtype)
print("y_train shape:", y_train.shape, "  y_train dtype:", y_train.dtype)

# Initialize and train the model
model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(x_train, y_train)

# Make predictions on the held-out test split
y_predict = model.predict(x_test)

# Evaluate the model; accuracy_score expects (y_true, y_pred) in that order
score = accuracy_score(y_test, y_predict)
print(f'{score * 100}% of samples were classified correctly!')

# Save the trained model along with the encoder and the raw label column.
# NOTE: only load model.p from a trusted location; unpickling can execute
# arbitrary code.
with open('model.p', 'wb') as f:
    pickle.dump({'model': model, 'label_encoder': label_encoder, 'labels': labels}, f)


x_train shape: (237, 42)   x_train dtype: float64
y_train shape: (237,)   y_train dtype: int64
98.33333333333333% of samples were classified correctly!
