In [4]:
# Data Preprocessing

import pandas as pd

# Read the ICT subfields CSV into a DataFrame
dataset_path = 'data/ict_subfields_dataset.csv'
df = pd.read_csv(dataset_path)

# Sanity check: print the leading rows to confirm the file loaded as expected
preview = df.head()
print(preview)

                                                Text          Subfield  \
0  I have experience in full-stack development an...  Computer Science   
1  I focus on building scalable software solution...  Computer Science   
2  I specialize in data science, working with Ten...  Computer Science   
3  I enjoy using statistical methods and algorith...  Computer Science   
4  I have experience in writing clean, efficient ...  Computer Science   

                   Job Title  
0          Software Engineer  
1          Software Engineer  
2  Machine Learning Engineer  
3  Machine Learning Engineer  
4                 Programmer  


In [11]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared WordNet lemmatizer, created once at module level so that
# preprocess_text can reuse the same instance for every row it processes.
# NOTE(review): presumably requires the NLTK 'wordnet' corpus to be downloaded
# beforehand -- confirm against the environment setup.
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def _default_stop_words():
    """Return NLTK's English stopwords as a frozenset, loading them only once.

    The corpus is read on the first call and cached on the function object, so
    applying `preprocess_text` to a whole column does not rebuild the set per row.
    """
    cached = getattr(_default_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _default_stop_words._cache = cached
    return cached


def preprocess_text(text, stop_words=None, lemmatize=None):
    r"""
    Normalize a piece of free text for downstream analysis or modeling.

    The following steps are applied in order:

    1. **Lowercasing**: converts the text to lowercase so that "Apple" and "apple"
       are treated as the same token.

    2. **Removing Punctuation**: deletes every character in `string.punctuation`
       (ASCII punctuation only) via `str.translate`.

    3. **Removing Numbers**: deletes every run of digits using the regular
       expression `\d+` (so "python3" becomes "python").

    4. **Removing Extra Whitespace**: collapses consecutive whitespace (`\s+`) into a
       single space and strips leading/trailing whitespace.

    5. **Removing Stopwords**: drops tokens found in `stop_words`.

    6. **Lemmatization**: maps each remaining token through `lemmatize`. The default,
       `lemmatizer.lemmatize(word)`, uses WordNet's default noun part of speech, so it
       reduces plural nouns (e.g. "solutions" -> "solution") but generally leaves verb
       and adjective forms such as "running" or "better" unchanged.

    NOTE(review): punctuation is stripped before stopword filtering, so contractions
    such as "don't" become "dont" and presumably no longer match the stopword list --
    confirm whether that is acceptable for this dataset.

    Args:
        text: The raw text. Any non-string value (e.g. `None` or the NaN that pandas
            uses for missing cells) is treated as missing and yields an empty string.
        stop_words: Optional collection of lowercase tokens to drop. Defaults to
            NLTK's English stopword list (loaded once and cached).
        lemmatize: Optional callable mapping one token to its lemma. Defaults to the
            module-level `lemmatizer.lemmatize`.

    Returns:
        The cleaned text as a single space-separated string (possibly empty).
    """
    # Missing values are not strings; return empty text instead of raising on .lower()
    if not isinstance(text, str):
        return ''

    # Resolve the defaults lazily so NLTK resources are only touched when needed
    if stop_words is None:
        stop_words = _default_stop_words()
    if lemmatize is None:
        lemmatize = lemmatizer.lemmatize

    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers (before whitespace cleanup, so removed digits leave no double spaces)
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Drop stopwords and lemmatize the remaining tokens in a single pass
    return ' '.join(lemmatize(word) for word in text.split() if word not in stop_words)

# Clean every entry of the 'Text' column and store the result next to the
# original in a new 'Processed_Text' column
processed_text = df['Text'].apply(preprocess_text)
df['Processed_Text'] = processed_text

# Print the first ten raw/cleaned pairs side by side for a quick visual check
comparison = df[['Text', 'Processed_Text']]
print(comparison.head(10))


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject