In [8]:
# Data handling
import pandas as pd
import re
import string

# Text preprocessing (NLTK)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Label encoding
from sklearn.preprocessing import LabelEncoder

# Model training and evaluation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jilln\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jilln\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# Load the dataset
df = pd.read_csv('data/ict_subfields_dataset.csv')

# Display the first few rows to verify
df.head()

Unnamed: 0,Text,Subfield,Job Title
0,I have experience in full-stack development an...,Computer Science,Software Engineer
1,I focus on building scalable software solution...,Computer Science,Software Engineer
2,"I specialize in data science, working with Ten...",Computer Science,Machine Learning Engineer
3,I enjoy using statistical methods and algorith...,Computer Science,Machine Learning Engineer
4,"I have experience in writing clean, efficient ...",Computer Science,Programmer


In [13]:
# Data Preprocessing
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    """
    This function preprocesses the input text by performing the following operations:
    
    1. **Lowercasing**: Converts all characters in the text to lowercase to ensure uniformity. 
       This step is crucial for text normalization, as words like "Apple" and "apple" should be treated the same.
    
    2. **Removing Punctuation**: Eliminates all punctuation characters (e.g., commas, periods, question marks) 
       to avoid them interfering with the processing of words. This is done using string translation and the `string.punctuation` module.

    3. **Removing Numbers**: Removes any numerical digits that appear in the text using a regular expression (`\d+`), 
       which matches any sequence of digits. This is helpful if numbers aren't relevant for the task (e.g., in sentiment analysis).

    4. **Removing Extra Whitespace**: Consolidates any consecutive spaces into a single space and strips leading/trailing spaces.
       This is done with a regular expression (`\s+`) to ensure the text is properly spaced and doesn't contain unwanted gaps.

    5. **Removing Stopwords**: Eliminates common words that don't provide much value in analysis, such as "the", "is", "and", etc.
       This is accomplished using a predefined list of stopwords from the `nltk.corpus.stopwords` module.

    6. **Lemmatization**: Reduces words to their base form or root word. For example, "running" becomes "run", and "better" becomes "good". 
       This step ensures that similar words are treated as the same, which is particularly useful for tasks like text classification and sentiment analysis.
    
    The function returns the cleaned and preprocessed text as output, which is ready for further analysis or modeling.
    """
    
    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    text = ' '.join(words)

    # Lemmatize the text
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words]
    text = ' '.join(words)

    return text

# Apply the preprocessing function to the 'Text' column and create a new column 'Processed_Text'
df['Processed_Text'] = df['Text'].apply(preprocess_text)

# Preview the result
df[['Text', 'Processed_Text']].head(10)


Unnamed: 0,Text,Processed_Text
0,I have experience in full-stack development an...,experience fullstack development work javascri...
1,I focus on building scalable software solution...,focus building scalable software solution opti...
2,"I specialize in data science, working with Ten...",specialize data science working tensorflow ker...
3,I enjoy using statistical methods and algorith...,enjoy using statistical method algorithm solve...
4,"I have experience in writing clean, efficient ...",experience writing clean efficient code langua...
5,I love solving complex problems through progra...,love solving complex problem programming optim...
6,I enjoy developing backend services and APIs u...,enjoy developing backend service apis using py...
7,Building scalable and maintainable application...,building scalable maintainable application som...
8,I like implementing machine learning algorithm...,like implementing machine learning algorithm f...
9,Experimenting with deep learning frameworks li...,experimenting deep learning framework like pyt...


In [14]:
# Initialize CountVectorizer with bigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # unigrams and bigrams

# Transform the processed text into bigram features
X = vectorizer.fit_transform(df['Processed_Text'])

# Encode the target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Subfield']
)  # Replace 'Label' with your actual label column name

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

print(vectorizer.get_feature_names_out()[:20])  # First 20 bigram features


# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))

['10' '10 usc' '103' '103 volunteer' '15277' '15277 may' '1599f' '15k'
 '15k hiring' '1832150' '1832150 may' '25218b' '25218b temp' '256th'
 '256th intelligence' '2d3d' '2d3d animation' '32' '32 excepted'
 '32 position']
Accuracy: 0.6521739130434783
Classification Report:
                                         precision    recall  f1-score   support

                      Computer Science       0.67      0.22      0.33         9
Entertainment and Multimedia Computing       0.60      1.00      0.75         3
                                    IT       0.55      0.86      0.67         7
                   Information Systems       1.00      1.00      1.00         3
       Library and Information Science       1.00      1.00      1.00         1

                              accuracy                           0.65        23
                             macro avg       0.76      0.82      0.75        23
                          weighted avg       0.68      0.65      0.61        23



In [6]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')  # Optional but recommended for WordNet's multilingual data

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jilln\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jilln\AppData\Roaming\nltk_data...


True

In [22]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Assuming df is your dataset and user_input is the string provided by the user

# Function to preprocess user input (same as your preprocessing function)
def preprocess_user_input(user_input):
    return preprocess_text(user_input)  # Assuming preprocess_text is defined as before

# Preprocess the user input
user_input = "I like animation and digital art."  # Example user input
processed_user_input = preprocess_user_input(user_input)

# Vectorize the user input using the same vectorizer used for the dataset
user_input_vector = vectorizer.transform([processed_user_input])  # Transform into TF-IDF vector

# Vectorize the entire dataset (assuming 'Processed_Text' column is preprocessed)
dataset_vectors = vectorizer.transform(df['Processed_Text'])

# Compute the cosine similarity between the user input vector and the dataset vectors
cosine_similarities = cosine_similarity(user_input_vector, dataset_vectors)

# Find the index of the most similar document in the dataset
most_similar_index = np.argmax(cosine_similarities)

# Retrieve the corresponding label (subfield) from the dataset
most_similar_subfield = df['Subfield'].iloc[most_similar_index]

# Retrieve the recommended job for the matched subfield
subfield_jobs = {
    'Computer Science': 'Software Engineer',
    'Computer Science': 'Machine Learning Engineer',
    'Computer Science': 'Programmer',
    'Information Technology': 'Network Administrator',
    'Information Technology': 'System Analyst',
    'Information Technology': 'Cybersecurity Expert',
    'Information Systems': 'Business Analyst',
    'Information Systems': 'IT Project Manager',
    'Information Systems': 'ERP Specialist',
    'Entertainment and Multimedia Computing': 'Game Developer',
    'Entertainment and Multimedia Computing': 'Multimedia Artist',
    'Entertainment and Multimedia Computing': 'Technical Animator',
    'Library and Information Science': 'Digital Librarian',
    'Library and Information Science': 'Information Architect',
    'Library and Information Science': 'Records Manager'
}

recommended_job = subfield_jobs.get(most_similar_subfield, "Job recommendation not found.")

# Output the most similar subfield and the recommended job
print(f"The most similar subfield is: {most_similar_subfield}")
print(f"Recommended job: {recommended_job}")


The most similar subfield is: Entertainment and Multimedia Computing
Recommended job: Technical Animator
