# Logistic Regression Model - Identifying Currently Automated Tasks - NLP Approach

#### Loading The Data

In [17]:
import pandas as pd

data = pd.read_csv("software_engineering_tasks.csv")
data.head()

Unnamed: 0,Role,Task,Currently Automated by AI
0,Frontend Developer,Designing responsive UI layouts,No
1,Frontend Developer,"Writing modular and reusable HTML, CSS, and Ja...",No
2,Frontend Developer,Debugging browser compatibility issues,Partially
3,Frontend Developer,Ensuring accessibility compliance,No
4,Frontend Developer,Integrating UI designs from tools like Figma,No


#### Check The Data

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540 entries, 0 to 539
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Role                       540 non-null    object
 1   Task                       540 non-null    object
 2   Currently Automated by AI  540 non-null    object
dtypes: object(3)
memory usage: 12.8+ KB


#### Rename And Refactor

In [19]:
# drop the role column
data.drop(columns=['Role'], inplace=True)

data.head()

Unnamed: 0,Task,Currently Automated by AI
0,Designing responsive UI layouts,No
1,"Writing modular and reusable HTML, CSS, and Ja...",No
2,Debugging browser compatibility issues,Partially
3,Ensuring accessibility compliance,No
4,Integrating UI designs from tools like Figma,No


In [20]:
# rename the columns
data.rename(columns={'Task':'task_description'}, inplace=True)
data.rename(columns={'Currently Automated by AI':'is_automated'}, inplace=True)

data.head()

Unnamed: 0,task_description,is_automated
0,Designing responsive UI layouts,No
1,"Writing modular and reusable HTML, CSS, and Ja...",No
2,Debugging browser compatibility issues,Partially
3,Ensuring accessibility compliance,No
4,Integrating UI designs from tools like Figma,No


In [21]:
# converting task description to lowercase
data['task_description'] = data['task_description'].str.lower()

# convert the is_automated column to 1 and 0 s
data['is_automated'] = data['is_automated'].map({'Partially':1, 'No':0})

data.head()

Unnamed: 0,task_description,is_automated
0,designing responsive ui layouts,0
1,"writing modular and reusable html, css, and ja...",0
2,debugging browser compatibility issues,1
3,ensuring accessibility compliance,0
4,integrating ui designs from tools like figma,0


In [22]:
# dropping the duplicates
data.drop_duplicates(inplace=True)

data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 539 entries, 0 to 539
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   task_description  539 non-null    object
 1   is_automated      539 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 12.6+ KB


In [23]:
# export cleaned dataset
data.to_csv('software_engineering_tasks_cleaned.csv', index=False)

#### Training The Model

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['task_description'])
y = data['is_automated']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

report = classification_report(y_test, predictions)
print(report)

Accuracy: 0.7685185185185185
              precision    recall  f1-score   support

           0       0.70      0.86      0.77        49
           1       0.85      0.69      0.77        59

    accuracy                           0.77       108
   macro avg       0.78      0.78      0.77       108
weighted avg       0.78      0.77      0.77       108



#### Test With New Data

In [32]:
new_task = ["integration tests"]

new_task_vectorized = vectorizer.transform(new_task)

prediction = model.predict(new_task_vectorized)

print(f"Prediction: {'Automated' if prediction[0] == 1 else 'Not Automated'}")

Prediction: Automated
