# <center> <span style="color:Blue">Programacion 2</span> </center>
<center> <span style="color:Gray">  Challenge 2: Analizing Comments on Glassdoor </span>  </center>
<center> <span style="color:Gray"> Ilse Arredondo Reyes. No. Alumno 323019078</span>  </center>

### Project Stages  

**Stage 1: Web Scrapping**  

Will create a pipeline to extract Information of website Glassdoor
○ Web Scraping
This endpoint will receive as input Web scrapping:
- Open web page
- Read the content
- Extract content
- Save all in a data frame

----

**Stage 2: Model Creation**

🔹 **Text Preprocessing** 

This endpoint will receive as input Web scrapping result in a data frame
- Create English-Spanish data frame
- Analyze Separately Spanish and English data frame (Corpus)
- Data cleaning
- Stop words.
- Lemmatization
- N-grmas Distributions

🔹 **Classification proposed**

This endpoint will receive as input parameters of text preprocessing:

- Construction model
- End Date (date time)
- Taring model
- Calculation of grammatical probabilities

🔹 **Extraction of main features**
- Classification
- Sentimental Analysis (pysentimiento vs vader)

---

**Stage 3: Create a pipeline to MLOps**

🔹 **The MLOps part will be done with mlflow performing the following tasks**

- **Log metrics**
- **Model signatures**
- **Save the plot and log it as an artifact**
- **Tracking url (localhost)**
- **Run MLOps**



In [1]:
# Import data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn

## Web Scrapping

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time

def scrape_glassdoor(url):
    # Setup Selenium options (headless)
    options = Options()
    options.headless = True
    driver = webdriver.Chrome(options=options)
    
    try:
        # Open the Glassdoor page
        driver.get(url)
        time.sleep(5)  # wait for page to load completely
        
        # Read page content
        html = driver.page_source
        
        # Parse with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')
        
        # Example: Extract job titles (customize selectors based on Glassdoor structure)
        job_titles = [tag.get_text(strip=True) for tag in soup.select('a.jobLink span')]
        companies = [tag.get_text(strip=True) for tag in soup.select('div.jobHeader span')]
        locations = [tag.get_text(strip=True) for tag in soup.select('span.subtle.loc')]
        
        # Combine data into a DataFrame
        data = pd.DataFrame({
            'Job Title': job_titles,
            'Company': companies,
            'Location': locations
        })
        
        return data
    
    finally:
        driver.quit()

# Example usage
url = 'https://www.glassdoor.com/Job/software-engineer-jobs-SRCH_KO0,17.htm'
df = scrape_glassdoor(url)
print(df.head())


Empty DataFrame
Columns: [Job Title, Company, Location]
Index: []


## Text Proccessing

In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Assume `df` is your DataFrame from Stage 1
def preprocess_text(df, text_column='Job Title'):
    df = df.copy()
    df.dropna(subset=[text_column], inplace=True)
    
    lemmatizer = WordNetLemmatizer()
    stop_words_en = set(stopwords.words('english'))
    
    def clean_text(text):
        text = text.lower()
        text = re.sub(r'[^a-záéíóúñü ]', '', text)  # keep letters and Spanish accents
        tokens = text.split()
        tokens = [t for t in tokens if t not in stop_words_en]
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
        return ' '.join(tokens)
    
    df['cleaned_text'] = df[text_column].apply(clean_text)
    
    return df

# Example
df_cleaned = preprocess_text(df)
print(df_cleaned.head())


Empty DataFrame
Columns: [Job Title, Company, Location, cleaned_text]
Index: []


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ilse-\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ilse-\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ilse-\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## English-Spanish Corpus Separation

In [4]:
from langdetect import detect

def detect_language(text):
    try:
        return detect(text)
    except:
        return 'unknown'

df_cleaned['lang'] = df_cleaned['cleaned_text'].apply(detect_language)
df_en = df_cleaned[df_cleaned['lang'] == 'en']
df_es = df_cleaned[df_cleaned['lang'] == 'es']


## N-grams Distribution

In [5]:
def get_top_ngrams(corpus, n=None, ngram_range=(1, 2)):
    vec = CountVectorizer(ngram_range=ngram_range).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

# Example for English unigrams and bigrams
print(get_top_ngrams(df_en['cleaned_text'], n=10, ngram_range=(1,1)))
print(get_top_ngrams(df_en['cleaned_text'], n=10, ngram_range=(2,2)))


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Example: create labels based on job titles
df_en['label'] = df_en['Job Title'].apply(lambda x: 'engineer' if 'engineer' in x.lower() else 'other')

# Text Classification Pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

X_train, X_test, y_train, y_test = train_test_split(
    df_en['cleaned_text'], df_en['label'], test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred))


In [None]:
# English: Vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

df_en['sentiment_vader'] = df_en['cleaned_text'].apply(lambda x: sid.polarity_scores(x)['compound'])

# Spanish: pysentimiento
from pysentimiento import create_analyzer

analyzer = create_analyzer(task="sentiment", lang="es")
df_es['sentiment'] = df_es['cleaned_text'].apply(lambda x: analyzer.predict(x).output)
