## Step 1: Load and Explore the Dataset

In [24]:
import pandas as pd

# Load the Iris dataset
iris_data = pd.read_csv('https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv')

# Display the first few rows of the dataset
print(iris_data.head())


   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


## Step 2: Data Exploration

In [25]:
# Get summary statistics for numerical features
print(iris_data.describe())

# Get class distribution
print(iris_data['species'].value_counts())


       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000
setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64


## Step 3: Data Preprocessing

In [26]:
from sklearn.preprocessing import LabelEncoder

# Encode the 'species' column
label_encoder = LabelEncoder()
iris_data['species'] = label_encoder.fit_transform(iris_data['species'])

# Split data into features (X) and target (y)
X = iris_data.drop('species', axis=1)
y = iris_data['species']


## Step 4: Data Splitting

In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Step 5: Building the Predictive Mode#

In [28]:
from sklearn.linear_model import LogisticRegression

# Initialize and train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)


LogisticRegression()

## Step 6: Model Evaluation

In [29]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))

confusion = confusion_matrix(y_test, y_pred)
print('\nConfusion Matrix:')
print(confusion)


Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30


Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


## Step 7: Model Deployment

In [30]:
import joblib

# Save the trained model to a file
joblib.dump(model, 'iris_trained_model.pkl')


['iris_trained_model.pkl']

In [31]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

def preprocess_data(data):
    # Create a copy of the input DataFrame to avoid modifying the original data
    processed_data = data.copy()

    # Identify categorical and numerical columns
    categorical_columns = processed_data.select_dtypes(include=['object']).columns.tolist()
    numerical_columns = processed_data.select_dtypes(exclude=['object']).columns.tolist()

    # Encode categorical variables using label encoding
    label_encoders = {}
    for column in categorical_columns:
        label_encoders[column] = LabelEncoder()
        processed_data[column] = label_encoders[column].fit_transform(processed_data[column])

    # Scale numerical features using StandardScaler
    scaler = StandardScaler()
    processed_data[numerical_columns] = scaler.fit_transform(processed_data[numerical_columns])

    return processed_data


In [32]:
import pandas as pd

# Read the new data from a CSV file
new_data_df = pd.read_csv('irisInput.csv')  # Replace 'new_data.csv' with your new data file

# Preprocess the new data using the same preprocessing functions
new_data_processed = preprocess_data(new_data_df)


In [33]:
new_data_processed

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-0.65314,1.008104,-1.048773,-1.044551
1,-0.860343,-0.309305,-1.048773,-1.044551
2,-1.067546,0.217659,-1.096825,-1.044551
3,-1.171147,-0.045823,-1.000722,-1.044551
4,-0.756741,1.271585,-1.048773,-1.044551
5,-0.342335,2.06203,-0.904619,-0.831944
6,-1.171147,0.744622,-1.048773,-0.938248
7,-0.756741,0.744622,-1.000722,-1.044551
8,-1.37835,-0.572786,-1.048773,-1.044551
9,-0.860343,-0.045823,-1.000722,-1.150855


In [34]:
import joblib

# Load the trained model
model = joblib.load('iris_trained_model.pkl')  # Replace with your model file path


In [35]:
model

LogisticRegression()

In [36]:
# Use the loaded model to make predictions on the preprocessed new data
predictions = model.predict(new_data_processed)

# The 'predictions' variable now contains the predicted labels (e.g., Churn_Yes or Churn_No) for the new data.


In [37]:
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0])