In [21]:
# Logistic Regression Model using scikit-learn
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Load dataset
# Why header and names are important in read_csv:
# - header=None tells pandas there is no header row in the CSV file, so it does not treat the first row as column names.
# - names=columns lets you assign custom column names to the DataFrame, making it easier to reference columns by name instead of index.
# - This improves code readability and prevents errors when selecting or dropping columns.
# If you pass names without specifying header, pandas (default header='infer') behaves as if header=None, so the first row is kept as data.
# A row is lost only when names is combined with an explicit header=0: pandas then treats the first row as the header and replaces it with your custom names.
# Passing header=None explicitly, as done below, makes it obvious to the reader that the CSV file does not have a header row.
# Column labels for the header-less file: four measurements followed by the class label.
columns = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'species',
]
# header=None keeps the first CSV row as data; names= supplies the labels above.
iris_data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    header=None,
    names=columns,
)
# Last expression of the cell, so the first five rows are rendered as the cell output.
iris_data.head()


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [None]:

# Separate the label column from the measurement columns.
# drop(columns=[...]) is the keyword spelling of drop([...], axis=1):
# axis=0 would look for a row labelled 'species', axis=1 targets columns.
# drop() returns a new DataFrame, so iris_data itself keeps its 'species' column.
y = iris_data['species']
x = iris_data.drop(columns=['species'])


In [13]:

# Preview the first five rows of the feature frame.
x.head(5)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [14]:
# Preview the first five labels.
y.head(5)


0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
Name: species, dtype: object

In [17]:

# Fit a logistic-regression classifier (default hyperparameters) on the raw features.
# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression().fit(x, y)

# Classify three hand-written samples.
# The rows are wrapped in a DataFrame that reuses x.columns so the feature names
# match the ones seen during fit(); this avoids scikit-learn's feature-name warning.
test_data = pd.DataFrame(
    [
        [5.1, 3.5, 1.4, 0.2],
        [6.7, 3.1, 4.7, 1.5],
        [7.2, 3.6, 6.1, 2.5],
    ],
    columns=x.columns,
)
predictions = model.predict(test_data)
print(predictions)
# This code is part of the Oracle AI Foundation Course,
# which teaches how to build a logistic regression model using the scikit-learn library in Python.
# The dataset used is the Iris dataset, a classic dataset for classification tasks.
# The code loads the dataset, prepares the features and labels, creates and trains a logistic regression model, and makes predictions on new data points.
# The predictions are printed as the cell output.


['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


In [34]:
# Logistic Regression Model using scikit-learn
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Load dataset
# The column names are declared here as well, so this cell does not rely on a
# variable left behind by an earlier cell (which would raise NameError if this
# part of the notebook were run on a fresh kernel).
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
# header=None: the file has no header row; names= assigns the labels above.
iris_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None, names=columns)

# Prepare features and labels
x = iris_data.drop(columns=['species'])  # every column except the label
y = iris_data['species']                 # the label column

# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features.
# The scaler learns each feature's mean and standard deviation from x_train only,
# and those same statistics are then applied to both splits. Fitting on the training
# rows alone keeps information about the test rows out of preprocessing (no data
# leakage), which keeps the evaluation fair.
# fit(...) followed by transform(...) is what fit_transform(...) does in a single call.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Show the first 5 rows of the scaled training data as the cell output.
x_train_scaled[:5]

array([[-1.47393679,  1.22037928, -1.5639872 , -1.30948358],
       [-0.13307079,  3.02001693, -1.27728011, -1.04292204],
       [ 1.08589829,  0.09560575,  0.38562104,  0.28988568],
       [-1.23014297,  0.77046987, -1.21993869, -1.30948358],
       [-1.7177306 ,  0.32056046, -1.39196294, -1.30948358]])

In [36]:

# Train a fresh classifier on the standardized training split. fit() is the step
# where the model learns the relationship between the features and the labels;
# it returns the fitted estimator, so construction and training are chained.
model = LogisticRegression().fit(x_train_scaled, y_train)

# Predict labels for the scaled test rows.
y_pred = model.predict(x_test_scaled)

# Show the first 5 predictions as the cell output.
y_pred[:5]

array(['Iris-versicolor', 'Iris-setosa', 'Iris-virginica',
       'Iris-versicolor', 'Iris-versicolor'], dtype=object)

In [None]:
# Fraction of test samples whose predicted label equals the true label.
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to be
# symmetric, but keeping the documented order avoids silently wrong numbers if this
# line is later adapted to an asymmetric metric such as precision or recall.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy on test set: {accuracy*100:.2f}%")

In [None]:
# Three new samples to classify. They are placed in a DataFrame that reuses
# x.columns (rather than a bare numpy array) so the feature names are preserved
# and scikit-learn does not emit a feature-name warning.
new_data = pd.DataFrame(
    [
        [5.1, 3.5, 1.4, 0.2],
        [6.7, 3.1, 4.7, 1.5],
        [7.2, 3.6, 6.1, 2.5],
    ],
    columns=x.columns,
)

# Scale the new rows with the same, already-fitted scaler.
new_data_scaled = scaler.transform(new_data)

# Predict on the scaled rows and print the labels.
new_predictions = model.predict(new_data_scaled)
print("Predictions on new data:", new_predictions)

Predictions on new data: ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']
