# Random Forest

In [None]:
#import required libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the Iris dataset from a CSV file located in the working directory
iris=pd.read_csv("Iris.csv")
# Preview the first rows to confirm the columns were parsed as expected
iris.head()

In [None]:
# List the distinct class labels found in the Species column
iris["Species"].unique()

#### Data Visualization

In [None]:
# Scatter plot of sepal length against sepal width, one colour per species.
# Each entry is (value in the Species column, marker colour, legend label).
sepal_series = [
    ('Iris-setosa', 'orange', 'Setosa'),
    ('Iris-versicolor', 'blue', 'versicolor'),
    ('Iris-virginica', 'green', 'virginica'),
]

# The first call (ax=None) creates the axes; the later calls draw onto it.
sepal_ax = None
for species_name, marker_colour, legend_label in sepal_series:
    species_rows = iris[iris.Species == species_name]
    sepal_ax = species_rows.plot(
        kind='scatter',
        x='SepalLengthCm',
        y='SepalWidthCm',
        color=marker_colour,
        label=legend_label,
        ax=sepal_ax,
    )

sepal_ax.set_xlabel("Sepal Length")
sepal_ax.set_ylabel("Sepal Width")
sepal_ax.set_title("Sepal Length VS Width")

# Resize the figure that owns the axes, display it, then write it to disk.
fig = plt.gcf()
fig.set_size_inches(10, 6)
plt.show()
fig.savefig("Sepal Length VS Width.png")


In [None]:
# Scatter plot of petal length against petal width, one colour per species.
# Each entry is (value in the Species column, marker colour, legend label).
petal_series = [
    ('Iris-setosa', 'orange', 'Setosa'),
    ('Iris-versicolor', 'blue', 'versicolor'),
    ('Iris-virginica', 'green', 'virginica'),
]

# The first call (ax=None) creates the axes; the later calls draw onto it.
petal_ax = None
for species_name, marker_colour, legend_label in petal_series:
    species_rows = iris[iris.Species == species_name]
    petal_ax = species_rows.plot.scatter(
        x='PetalLengthCm',
        y='PetalWidthCm',
        color=marker_colour,
        label=legend_label,
        ax=petal_ax,
    )

petal_ax.set_xlabel("Petal Length")
petal_ax.set_ylabel("Petal Width")
petal_ax.set_title(" Petal Length VS Width")

# Resize the figure that owns the axes, display it, then write it to disk.
fig = plt.gcf()
fig.set_size_inches(10, 6)
plt.show()
fig.savefig("Petal Length VS Width.png")

In [None]:
# 2x2 grid of violin plots: the distribution of each measurement per species.
# Bind the new figure to `fig` so that savefig() below writes THIS figure;
# without the binding, `fig` would still refer to whatever figure an earlier
# cell created and the wrong plot would be saved under this file name.
fig = plt.figure(figsize=(15, 10))

measurement_columns = ['PetalLengthCm', 'PetalWidthCm', 'SepalLengthCm', 'SepalWidthCm']
for panel_number, column in enumerate(measurement_columns, start=1):
    # Subplot positions are 1-based: 1..4 fill the grid row by row.
    plt.subplot(2, 2, panel_number)
    sns.violinplot(x='Species', y=column, data=iris)

fig.savefig("variable with species.png")

#### Train-Test Dataset split
#### Here, X will consist of all the features and y will consist of the target variable.
#### The model will look for patterns using the training data; the testing data is then used to check how well the model works on unseen samples.

In [None]:
# Extracting the features in the dataset.
# "Id" is a row counter, not a measurement of the flower. Keeping it as a
# feature lets the model key on row position (a label leak if the file is
# ordered by species -- NOTE(review): confirm the ordering of Iris.csv).
# Drop it from the frame itself so later column-name lookups stay aligned
# with X; errors="ignore" keeps the cell safe to re-run.
iris = iris.drop(columns=["Id"], errors="ignore")

# X is a 2D array: one row per sample, one column per remaining feature
# (every column except the trailing Species label).
X = iris.iloc[:, :-1].values
X

In [None]:
# Extracting the target variable: the last column of the frame.
species_column = iris.iloc[:, -1]
# y is a 1D array holding one class label per sample.
y = species_column.values
y

In [None]:
# Split the samples into a training set and a test set.
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed makes the split
# reproducible from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=123)
X_train, X_test, y_train, y_test = split_parts

#### Feature Scaling
#### StandardScaler: standardizes the features (mean = 0, variance = 1).
#### fit_transform(X_train): calculates the mean and standard deviation of every feature and converts the training data into scaled values.
#### transform(X_test): applies the mean and standard deviation already learned from the training data to the test data, to maintain consistency.

In [None]:
from sklearn.preprocessing import StandardScaler

# Initialise the scaler and learn each feature's mean and standard deviation
# from the training split only, so nothing about the test split influences
# the scaling statistics.
sc = StandardScaler()
sc.fit(X_train)

# Apply the same learned statistics to both splits.
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

In [None]:
# Show the first five rows of each array to compare raw and scaled values.
preview_rows = slice(None, 5)
scaling_previews = (
    ("X_train before scaling (first 5 rows):\n", X_train),
    ("X_train after scaling (first 5 rows):\n", X_train_scaled),
    ("X_test after scaling (first 5 rows):\n", X_test_scaled),
)
for heading, matrix in scaling_previews:
    print(heading, matrix[preview_rows])

#### Model Training

In [None]:
# Fit a Random Forest classifier on the scaled training set.
from sklearn.ensemble import RandomForestClassifier

# 50 trees, entropy as the split criterion, fixed seed for reproducibility.
forest_settings = {
    "n_estimators": 50,
    "criterion": "entropy",
    "random_state": 0,
}
classifier = RandomForestClassifier(**forest_settings)
classifier.fit(X_train_scaled, y_train)

In [None]:
# Predict the species of every held-out sample from its scaled features
y_pred = classifier.predict(X_test_scaled)

In [None]:
# Evaluate the fitted model on the held-out test set.
from sklearn.metrics import accuracy_score, confusion_matrix

# Per-class breakdown of true labels versus predicted labels.
cm = confusion_matrix(y_test, y_pred)
# Fraction of test samples whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion matrix:\n", cm)

In [None]:
from sklearn.metrics import classification_report

# Text summary of the per-class metrics for the test-set predictions.
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)

In [None]:
import numpy as np

# Names of the feature columns: every column of the frame except the
# trailing Species label, in the same order used to build X.
feature_names = iris.columns[:-1]
importances = classifier.feature_importances_

# argsort is ascending, so reverse it to list the most important feature first.
indices = np.argsort(importances)[::-1]

print("Feature Importances:")
for name, score in zip(feature_names[indices], importances[indices]):
    print(f"{name}: {score}")

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values to try; every combination is evaluated.
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}

# 5-fold cross-validated search over the grid, run on the training split only.
base_forest = RandomForestClassifier(random_state=0)
grid_search = GridSearchCV(estimator=base_forest, param_grid=param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)
print("Best Parameters:", grid_search.best_params_)

# Score the best estimator found by the search on the held-out test split.
best_classifier = grid_search.best_estimator_
y_pred_best = best_classifier.predict(X_test_scaled)
best_accuracy = accuracy_score(y_test, y_pred_best)
print("Accuracy with Best Model:", best_accuracy)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the confusion matrix for the test-set predictions.
# Use the classifier's own class list for BOTH the matrix layout and the tick
# labels. Taking the labels from iris['Species'].unique() relies on the
# order of appearance in the file matching the sorted order that
# confusion_matrix uses, and breaks if a class is absent from the test split.
class_labels = classifier.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # counts are integers
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()