# <center> <span style="color:Blue">Programacion 2</span> </center>
<center> <span style="color:Gray">  Challenge 2: Analyzing Comments on Glassdoor </span>  </center>
<center> <span style="color:Gray"> Ilse Arredondo Reyes. No. Alumno 323019078</span>  </center>

### Project Stages  

**Stage 1: Web Scraping**  

This stage creates a pipeline to extract information from the Glassdoor website.
○ Web Scraping
The web-scraping endpoint performs the following steps:
- Open web page
- Read the content
- Extract content
- Save all in a data frame

----

**Stage 2: Model Creation**

🔹 **Text Preprocessing** 

This endpoint receives the web-scraping result as a data frame and performs the following steps:
- Create English-Spanish data frame
- Analyze Separately Spanish and English data frame (Corpus)
- Data cleaning
- Stop words.
- Lemmatization
- N-gram distributions

🔹 **Classification proposed**

This endpoint will receive as input parameters of text preprocessing:

- Construction model
- End Date (date time)
- Training model
- Calculation of grammatical probabilities

🔹 **Extraction of main features**
- Classification
- Sentiment Analysis (pysentimiento vs vader)

---

**Stage 3: Create an MLOps pipeline**

🔹 **The MLOps part will be done with mlflow performing the following tasks**

- **Log metrics**
- **Model signatures**
- **Save the plot and log it as an artifact**
- **Tracking URL (localhost)**
- **Run MLOps**



In [1]:
# Import data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time

def scrape_glassdoor(url, wait_seconds=5):
    """Scrape job listings from a Glassdoor search-results page.

    Opens ``url`` in headless Chrome, waits a fixed delay for the page to
    render, then parses the resulting HTML with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the Glassdoor page to open.
    wait_seconds : float, default 5
        Seconds to sleep after navigation before reading the page source.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Job Title'``, ``'Company'`` and ``'Location'``, one entry per
        element matched by the corresponding CSS selector. The frame is empty
        when none of the selectors match anything.

    Raises
    ------
    ValueError
        Raised by ``pd.DataFrame`` when the three selectors match different
        numbers of elements (the columns are built from independent lists).
    """
    # Setup Selenium options (headless).
    options = Options()
    # The `options.headless = True` setter is deprecated in Selenium 4 and
    # dropped in later 4.x releases; passing the Chrome flag directly works
    # regardless of the installed Selenium version.
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)

    try:
        # Open the page and give client-side rendering time to finish.
        driver.get(url)
        time.sleep(wait_seconds)
        html = driver.page_source
    finally:
        # Always release the browser, even if navigation failed.
        driver.quit()

    # Parsing only needs the captured HTML, so the browser is already closed.
    soup = BeautifulSoup(html, 'html.parser')

    def _texts(selector):
        """Return the stripped text of every element matching ``selector``."""
        return [tag.get_text(strip=True) for tag in soup.select(selector)]

    # Example selectors; customize them to the current Glassdoor page structure.
    return pd.DataFrame({
        'Job Title': _texts('a.jobLink span'),
        'Company': _texts('div.jobHeader span'),
        'Location': _texts('span.subtle.loc'),
    })

# Demo run: scrape a single search-results page and preview the first rows.
url = 'https://www.glassdoor.com/Job/software-engineer-jobs-SRCH_KO0,17.htm'
df = scrape_glassdoor(url=url)
print(df.head(n=5))


Empty DataFrame
Columns: [Job Title, Company, Location]
Index: []


In [23]:
import pandas as pd
import re
from langdetect import detect
import spacy
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter
import nltk

# Download the NLTK stop-word corpus. This has to run before the
# stopwords.words(...) calls below, which read from that corpus.
nltk.download("stopwords")

# Load the small English and Spanish spaCy pipelines used by preprocess_text.
# NOTE(review): both model packages must already be installed in the kernel's
# environment; this cell does not install them.
nlp_en = spacy.load("en_core_web_sm")
nlp_es = spacy.load("es_core_news_sm")

# Keep the stop words as sets so the membership tests in preprocess_text are cheap.
stopwords_en = set(stopwords.words('english'))
stopwords_es = set(stopwords.words('spanish'))

# --- Utility functions ---

def detect_language(text):
    """Return the language code detected for ``text``, or ``"unknown"``.

    Parameters
    ----------
    text : str
        Text to classify. Non-string or blank values are accepted and
        reported as ``"unknown"``.

    Returns
    -------
    str
        Whatever ``langdetect.detect`` returns for ``text``, or ``"unknown"``
        when there is nothing to detect or detection fails.
    """
    # Nothing to classify: skip the detector call for non-strings and blanks.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except Exception:
        # Best-effort fallback. Unlike a bare `except:`, this still lets
        # KeyboardInterrupt and SystemExit propagate, so a long-running
        # `.apply(detect_language)` can be interrupted.
        return "unknown"

def clean_text(text):
    """Normalize ``text`` for downstream tokenization.

    Every run of non-word characters is collapsed into a single space, then
    the result is lower-cased and stripped of leading/trailing whitespace.
    """
    collapsed = re.sub(r'\W+', ' ', text)
    lowered = collapsed.lower()
    return lowered.strip()

def preprocess_text(text, lang):
    """Tokenize and lemmatize ``text``, dropping stop words, punctuation and spaces.

    Parameters
    ----------
    text : str
        Text to process (the pipeline passes already-cleaned, lower-cased text).
    lang : str
        Language code. ``"en"`` uses ``nlp_en``/``stopwords_en`` and ``"es"``
        uses ``nlp_es``/``stopwords_es``.

    Returns
    -------
    list of str
        Lemmas of the kept tokens, in document order. Any other language code
        yields an empty list.
    """
    if lang == "en":
        nlp, stop_words = nlp_en, stopwords_en
    elif lang == "es":
        nlp, stop_words = nlp_es, stopwords_es
    else:
        # Unsupported or undetected language: nothing to process.
        return []

    lemmas = []
    for token in nlp(text):
        if token.is_punct or token.is_space:
            continue
        # Test both the surface form and the lemma, lower-cased. The stop-word
        # lists are lower-case and largely made of inflected forms, so checking
        # only the raw lemma lets through (a) inflected stop words whose lemma
        # is not itself listed and (b) lemmas the model returns capitalized.
        if token.text.lower() in stop_words or token.lemma_.lower() in stop_words:
            continue
        lemmas.append(token.lemma_)
    return lemmas

def extract_ngrams(tokens, n=2):
    """Return every n-gram of ``tokens`` as a list of tuples (bigrams by default)."""
    windows = ngrams(tokens, n)
    return list(windows)

def get_ngram_distribution(token_lists, n=2, top_n=10):
    """Count n-grams across several token lists and return the most frequent.

    Parameters
    ----------
    token_lists : iterable of list of str
        One token list per document.
    n : int, default 2
        Size of the n-grams to count.
    top_n : int, default 10
        Number of most frequent n-grams to return.

    Returns
    -------
    list of (tuple, int)
        ``(ngram, count)`` pairs as produced by ``Counter.most_common``.
    """
    counts = Counter(
        gram
        for tokens in token_lists
        for gram in extract_ngrams(tokens, n)
    )
    return counts.most_common(top_n)

# --- Main processing function ---

def text_preprocessing_pipeline(df, text_column="Job Title"):
    """Clean, language-tag, tokenize and summarize a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, so the caller's frame is not modified.
    text_column : str, default "Job Title"
        Name of the column holding the raw text.

    Returns
    -------
    dict
        ``"full_df"``: the copy with added ``Clean Text``, ``Lang``, ``Tokens``
        and ``Bigrams`` columns; ``"english_df"`` / ``"spanish_df"``: the rows
        tagged ``"en"`` / ``"es"`` with a fresh index;
        ``"english_bigrams_distribution"`` / ``"spanish_bigrams_distribution"``:
        the most common bigrams per language as ``(bigram, count)`` pairs.

    Raises
    ------
    KeyError
        If ``text_column`` is not a column of ``df``.
    """
    df = df.copy()

    # Clean text and detect language. Missing values become "" and anything
    # that is not already a string is stringified so clean_text can handle it.
    df["Clean Text"] = df[text_column].fillna("").astype(str).apply(clean_text)
    df["Lang"] = df["Clean Text"].apply(detect_language)

    # Process text: tokenize, lemmatize, remove stopwords.
    # Built with zip rather than `df.apply(..., axis=1)`: on an empty frame the
    # row-wise apply hands back a DataFrame instead of a Series, which cannot
    # be assigned to a single column. Wrapping in an object Series keeps each
    # token list as one cell.
    token_lists = [
        preprocess_text(text, lang)
        for text, lang in zip(df["Clean Text"], df["Lang"])
    ]
    df["Tokens"] = pd.Series(token_lists, index=df.index, dtype=object)

    # Extract bigrams
    df["Bigrams"] = df["Tokens"].apply(lambda tokens: extract_ngrams(tokens, n=2))

    # Split by language
    df_en = df[df["Lang"] == "en"].reset_index(drop=True)
    df_es = df[df["Lang"] == "es"].reset_index(drop=True)

    # N-gram distributions
    bigrams_en = get_ngram_distribution(df_en["Tokens"], n=2)
    bigrams_es = get_ngram_distribution(df_es["Tokens"], n=2)

    return {
        "full_df": df,
        "english_df": df_en,
        "spanish_df": df_es,
        "english_bigrams_distribution": bigrams_en,
        "spanish_bigrams_distribution": bigrams_es,
    }


ModuleNotFoundError: No module named 'spacy'

In [16]:
pip install spacy langdetect nltk

Collecting spacy
  Using cached spacy-3.8.3-cp39-cp39-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Using cached murmurhash-1.0.12-cp39-cp39-win_amd64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Using cached cymem-2.0.11-cp39-cp39-win_amd64.whl.metadata (8.8 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Using cached preshed-3.0.9-cp39-cp39-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Using cached thinc-8.3.6-cp39-cp39-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Using cached wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Using cached srsly-2.5.1-cp39-cp39-win_amd64.wh

  error: subprocess-exited-with-error
  
  × Building wheel for blis (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [32 lines of output]
      BLIS_COMPILER? None
      !!
      
              ********************************************************************************
              Please consider removing the following classifiers in favor of a SPDX license expression:
      
              License :: OSI Approved :: BSD License
      
              See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license for details.
              ********************************************************************************
      
      !!
        self._finalize_license_expression()
      running bdist_wheel
      running build
      running build_py
      creating build\lib.win-amd64-cpython-39\blis
      copying blis\about.py -> build\lib.win-amd64-cpython-39\blis
      copying blis\benchmark.py -> build\lib.win-amd64-cpython-39\blis
      copying 