# Logistic Regression Model - Identifying Currently Automated Tasks - NLP Approach - Word2Vec

#### Loading The Data

In [None]:
import pandas as pd

# Read the raw CSV into `data` and preview its first five rows.
raw_csv_path = "software_engineering_tasks.csv"
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,Role,Task,Currently Automated by AI
0,Frontend Developer,Designing responsive UI layouts,No
1,Frontend Developer,"Writing modular and reusable HTML, CSS, and Ja...",No
2,Frontend Developer,Debugging browser compatibility issues,Partially
3,Frontend Developer,Ensuring accessibility compliance,No
4,Frontend Developer,Integrating UI designs from tools like Figma,No


#### Check The Data

In [None]:
# Print column names, non-null counts and dtypes to spot missing values early.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540 entries, 0 to 539
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Role                       540 non-null    object
 1   Task                       540 non-null    object
 2   Currently Automated by AI  540 non-null    object
dtypes: object(3)
memory usage: 12.8+ KB


#### Rename And Refactor

In [None]:
# Drop the role column: only the task text and its label are used further down.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run once 'Role' has already been removed.
data = data.drop(columns=['Role'], errors='ignore')

data.head()

Unnamed: 0,Task,Currently Automated by AI
0,Designing responsive UI layouts,No
1,"Writing modular and reusable HTML, CSS, and Ja...",No
2,Debugging browser compatibility issues,Partially
3,Ensuring accessibility compliance,No
4,Integrating UI designs from tools like Figma,No


In [None]:
# Give both remaining columns short snake_case names in a single call.
column_names = {
    'Task': 'task_description',
    'Currently Automated by AI': 'is_automated',
}
data.rename(columns=column_names, inplace=True)

data.head()

Unnamed: 0,task_description,is_automated
0,Designing responsive UI layouts,No
1,"Writing modular and reusable HTML, CSS, and Ja...",No
2,Debugging browser compatibility issues,Partially
3,Ensuring accessibility compliance,No
4,Integrating UI designs from tools like Figma,No


In [None]:
# Normalise the task text to lowercase (idempotent, safe to re-run).
data['task_description'] = data['task_description'].str.lower()

# Encode the label column as integers: 'Partially' -> 1, 'No' -> 0.
label_codes = {'Partially': 1, 'No': 0}

# Only map while the column still holds raw labels; mapping an already encoded
# column would turn every value into NaN when this cell is run a second time.
if not data['is_automated'].isin(list(label_codes.values())).all():
    encoded = data['is_automated'].map(label_codes)
    # Any label missing from label_codes maps to NaN; fail loudly here instead
    # of letting NaN targets reach the classifier.
    unmapped = data.loc[encoded.isna(), 'is_automated'].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected is_automated labels: {list(unmapped)}")
    data['is_automated'] = encoded.astype('int64')

data.head()

Unnamed: 0,task_description,is_automated
0,designing responsive ui layouts,0
1,"writing modular and reusable html, css, and ja...",0
2,debugging browser compatibility issues,1
3,ensuring accessibility compliance,0
4,integrating ui designs from tools like figma,0


In [None]:
# Drop exact duplicate (task, label) rows.
# - subset: compares only the two cleaned columns, which is every column on a
#   first run, and keeps the cell re-runnable after list/array-valued columns
#   (unhashable, so drop_duplicates would raise) are added further down.
# - ignore_index: renumbers the rows 0..n-1 so no gap is left in the index.
data = data.drop_duplicates(
    subset=['task_description', 'is_automated'],
    ignore_index=True,
)

data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 539 entries, 0 to 539
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   task_description  539 non-null    object
 1   is_automated      539 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 12.6+ KB


In [None]:
# Write the cleaned frame to disk, leaving out the row index.
data.to_csv(path_or_buf='software_engineering_tasks_cleaned.csv', index=False)

#### Word2Vec Training

In [None]:
from gensim.models import Word2Vec
import numpy as np

# Whitespace-tokenise each (already lower-cased) task description.
data['tokenized'] = data['task_description'].str.split()

# NOTE(review): the embeddings are learned from every row, including rows that
# the later train/test split assigns to the test set.
# seed + workers=1 make training repeatable: with several worker threads the
# update order depends on OS scheduling. Full reproducibility across
# interpreter launches additionally needs PYTHONHASHSEED to be fixed.
word2vec_model = Word2Vec(
    sentences=data['tokenized'].tolist(),  # plain list of token lists
    vector_size=100,
    window=5,
    min_count=1,  # keep every word: the corpus is tiny
    workers=1,
    epochs=20,
    seed=42,
)

def get_average_word_vector(tokens, model):
    """Return the mean embedding of the tokens known to ``model``.

    Parameters
    ----------
    tokens : iterable of str
        Words of one document. Words missing from ``model.wv`` are skipped.
    model : object
        Must expose ``wv`` (supports ``word in wv`` and ``wv[word]``) and
        ``vector_size``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``: the element-wise mean of
        the known word vectors, or all zeros when no token is known.
    """
    known_vectors = [model.wv[word] for word in tokens if word in model.wv]
    if not known_vectors:
        # float32 matches the dtype gensim uses for word vectors, so stacking
        # this fallback with real averages does not upcast the feature matrix.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(known_vectors, axis=0)

# One averaged embedding per row; the trained model is forwarded to the helper
# as its second positional argument.
data['word2vec'] = data['tokenized'].apply(get_average_word_vector, args=(word2vec_model,))

#### Training The Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Feature matrix: one averaged Word2Vec vector per task. Target: the 0/1 label.
X = np.array(data['word2vec'].tolist())
y = data['is_automated']

# stratify=y keeps the class ratio identical in the train and test parts, so
# the evaluation is not skewed by an unlucky split of this small dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardise the embedding dimensions before the logistic regression, whose
# default L2 penalty is sensitive to feature scale. Inside the pipeline the
# scaler is fitted on the training rows only and reused for every predict().
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

predictions = model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

# zero_division=0 reports 0 (without a warning) for a class never predicted.
report = classification_report(y_test, predictions, zero_division=0)
print(report)

Accuracy: 0.4351851851851852
              precision    recall  f1-score   support

           0       0.44      0.96      0.61        49
           1       0.00      0.00      0.00        59

    accuracy                           0.44       108
   macro avg       0.22      0.48      0.30       108
weighted avg       0.20      0.44      0.28       108



#### Test With New Data

In [None]:
new_task = "debugging rendering issues"

# Mirror the training preprocessing: lower-case first, then whitespace split.
new_task_tokens = new_task.lower().split()

# Words missing from the Word2Vec vocabulary are skipped when averaging; if
# none is known the feature vector is all zeros and the prediction carries no
# information about the text, so say so explicitly.
known_tokens = [token for token in new_task_tokens if token in word2vec_model.wv]
if not known_tokens:
    print("Warning: no word of the new task is in the Word2Vec vocabulary; "
          "the prediction below is based on an all-zero vector.")

new_task_vector = get_average_word_vector(new_task_tokens, word2vec_model)

# predict() expects a 2-D input: wrap the single vector in a list.
prediction = model.predict([new_task_vector])

print(f"Prediction: {'Automated' if prediction[0] == 1 else 'Not Automated'}")

Prediction: Not Automated
