# <center> <span style="color:Blue">Programacion 2</span> </center>
<center> <span style="color:Gray">  Challenge 2: Analyzing Comments on Glassdoor </span>  </center>
<center> <span style="color:Gray"> Ilse Arredondo Reyes. No. Alumno 323019078</span>  </center>

### Project Stages  

**Stage 1: Web Scraping**  

This stage creates a pipeline to extract information from the Glassdoor website.
○ Web Scraping
This endpoint performs the following web scraping steps:
- Open web page
- Read the content
- Extract content
- Save all in a data frame

----

**Stage 2: Model Creation**

🔹 **Text Preprocessing** 

This endpoint receives the web scraping result as a data frame and performs the following steps:
- Create English-Spanish data frame
- Analyze Separately Spanish and English data frame (Corpus)
- Data cleaning
- Stop words.
- Lemmatization
- N-grams Distributions

🔹 **Classification proposed**

This endpoint will receive as input parameters of text preprocessing:

- Construction model
- End Date (date time)
- Training model
- Calculation of grammatical probabilities

🔹 **Extraction of main features**
- Classification
- Sentiment Analysis (pysentimiento vs vader)

---

**Stage 3: Create an MLOps pipeline**

🔹 **The MLOps part will be done with mlflow performing the following tasks**

- **Log metrics**
- **Model signatures**
- **Save the plot and log it as an artifact**
- **Tracking url (localhost)**
- **Run MLOps**



In [16]:
# Import data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn

## Web Scraping

In [22]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time

def scrape_glassdoor(url):
    """Scrape job listings from a Glassdoor search page into a DataFrame.

    The page is opened in a headless Chrome session, its rendered HTML is
    parsed with BeautifulSoup, and the text of every element matching the
    job-title, company and location CSS selectors is collected.

    Parameters
    ----------
    url : str
        Address of the Glassdoor page to load.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Job Title'``, ``'Company'`` and ``'Location'``. The frame
        is empty when none of the selectors match anything on the page.
    """
    # Setup Selenium options (headless).
    # The ``Options.headless`` attribute was deprecated and then removed in
    # Selenium 4, so the command-line switch is used instead.
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)

    try:
        # Open the Glassdoor page
        driver.get(url)
        time.sleep(5)  # wait for page to load completely

        # Read page content
        html = driver.page_source
    finally:
        # The browser is only needed to obtain the HTML; release it before parsing.
        driver.quit()

    # Parse with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Example: Extract job titles (customize selectors based on Glassdoor structure)
    job_titles = [tag.get_text(strip=True) for tag in soup.select('a.jobLink span')]
    companies = [tag.get_text(strip=True) for tag in soup.select('div.jobHeader span')]
    locations = [tag.get_text(strip=True) for tag in soup.select('span.subtle.loc')]

    if not (job_titles or companies or locations):
        import warnings

        warnings.warn(
            "No elements matched the job-title, company or location selectors; "
            "returning an empty DataFrame.",
            stacklevel=2,
        )

    # Combine data into a DataFrame. Wrapping each list in a Series lets pandas
    # pad the shorter columns with NaN instead of raising when the selectors
    # match a different number of elements.
    # NOTE(review): values are paired by position, so rows are only meaningful
    # when the three selectors match one element per listing - confirm against
    # the live page structure.
    data = pd.DataFrame({
        'Job Title': pd.Series(job_titles, dtype='object'),
        'Company': pd.Series(companies, dtype='object'),
        'Location': pd.Series(locations, dtype='object')
    })

    return data

# Example usage: scrape one search-results page and preview the first rows.
url = 'https://www.glassdoor.com/Job/software-engineer-jobs-SRCH_KO0,17.htm'
# `df` is the frame that later cells pass to mlflow_train_and_log.
df = scrape_glassdoor(url)
# NOTE(review): the output recorded below is an empty DataFrame, i.e. none of
# the CSS selectors used in scrape_glassdoor matched anything on the fetched page.
print(df.head())


Empty DataFrame
Columns: [Job Title, Company, Location]
Index: []


## Text Processing

In [18]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK resources used by the preprocessing helpers below.
for _nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

# Module-level objects built once and reused by preprocess_text.
lemmatizer = WordNetLemmatizer()
stop_words_en = set(stopwords.words('english'))

# Stop-word sets for languages other than English, loaded on first use and
# cached by NLTK language name.
_stop_words_by_lang = {}


def _stop_words_for(lang):
    """Return the NLTK stop-word set for ``lang``, loading it at most once."""
    if lang == 'english':
        return stop_words_en
    if lang not in _stop_words_by_lang:
        _stop_words_by_lang[lang] = set(stopwords.words(lang))
    return _stop_words_by_lang[lang]


def preprocess_text(text, lang='english'):
    """Normalize one text: lowercase, keep letters only, drop stop words, lemmatize.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or a NaN cell coming
        from a DataFrame) is treated as missing and yields ``''``.
    lang : str, default 'english'
        NLTK stop-word language name (for example ``'english'`` or
        ``'spanish'``). It selects the stop-word list; WordNet lemmatization
        is applied only for ``'english'`` because the lemmatizer used here is
        the English WordNet one.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    # Lowercase and remove everything that is not a letter or whitespace.
    # ``\w`` is Unicode-aware for str patterns, so accented letters such as
    # "ñ" or "é" survive; digits and underscores are removed explicitly.
    text = re.sub(r'[^\w\s]|[\d_]', '', text.lower())

    # Tokenize
    tokens = text.split()

    # Remove stopwords for the requested language
    stop_words = _stop_words_for(lang)
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize (English only)
    if lang == 'english':
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

def process_dataframe(df):
    """Clean the ``'Job Title'`` column and compute bigram frequencies.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Job Title'`` column (a ``KeyError`` is raised
        otherwise). A ``'Cleaned'`` column is added to this same object.

    Returns
    -------
    tuple
        ``(df, ngram_freq)`` where ``df`` is the input frame with the new
        ``'Cleaned'`` column and ``ngram_freq`` is a Series of bigram counts
        indexed by bigram, sorted in descending order. ``ngram_freq`` is an
        empty Series when the cleaned texts contain no bigram vocabulary
        (for example when ``df`` has no rows).
    """
    # Assume 'Job Title' is the text to be analyzed
    df['Cleaned'] = df['Job Title'].apply(preprocess_text)

    # N-gram distribution (example: bigrams)
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    try:
        X = vectorizer.fit_transform(df['Cleaned'])
    except ValueError as exc:
        # CountVectorizer reports an empty corpus as "empty vocabulary";
        # any other ValueError is a genuine problem and is re-raised.
        if 'empty vocabulary' not in str(exc):
            raise
        return df, pd.Series(dtype='int64')

    # Sum the sparse matrix column-wise instead of materializing a dense
    # documents x bigrams array.
    counts = np.asarray(X.sum(axis=0)).ravel()
    ngram_freq = pd.Series(counts, index=vectorizer.get_feature_names_out()).sort_values(ascending=False)

    return df, ngram_freq


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ilse-\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ilse-\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ilse-\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## N-grams Function

In [19]:
def get_ngrams(texts, n=2):
    """Count the word n-grams of a collection of texts.

    Parameters
    ----------
    texts : iterable of str
        Documents to analyze.
    n : int, default 2
        Size of the n-grams; only n-grams of exactly this length are counted.

    Returns
    -------
    list of tuple
        ``(ngram, count)`` pairs sorted by count in descending order. An empty
        list is returned when the texts yield no vocabulary (no documents, or
        documents too short to contain an n-gram of the requested size).
    """
    vec = CountVectorizer(ngram_range=(n, n))
    try:
        ngram_matrix = vec.fit_transform(texts)
    except ValueError as exc:
        # Only the "empty vocabulary" case is treated as "no n-grams";
        # every other ValueError is propagated unchanged.
        if 'empty vocabulary' not in str(exc):
            raise
        return []

    # Flatten the column sums to a plain list regardless of whether the sparse
    # sum comes back as a 1 x k matrix or as a 1-D array.
    counts = np.asarray(ngram_matrix.sum(axis=0)).ravel().tolist()
    ngram_freq = zip(vec.get_feature_names_out(), counts)
    return sorted(ngram_freq, key=lambda x: x[1], reverse=True)



## Classification (Multinomial Naive Bayes)

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

def train_text_classifier(df, text_column, label_column):
    """Fit a TF-IDF + Multinomial Naive Bayes pipeline and print its test report.

    The rows of ``df`` are split 80/20 (``random_state=42``); the pipeline is
    fitted on the larger part and a classification report for the held-out
    part is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts and their labels.
    text_column : str
        Name of the column with the input texts.
    label_column : str
        Name of the column with the target labels.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline with steps ``'tfidf'`` and ``'clf'``.
    """
    texts = df[text_column]
    labels = df[label_column]
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    steps = [
        ('tfidf', TfidfVectorizer()),
        ('clf', MultinomialNB()),
    ]
    model = Pipeline(steps)
    model.fit(train_texts, train_labels)

    predicted_labels = model.predict(test_texts)
    report = classification_report(test_labels, predicted_labels)
    print(report)
    return model


## Sentiment Analysis (VADER)

In [21]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# One analyzer instance shared by every call to vader_sentiment.
vader = SentimentIntensityAnalyzer()


def vader_sentiment(text):
    """Return the result of ``vader.polarity_scores`` for ``text``."""
    return vader.polarity_scores(text)


## MLOps with MLflow

In [65]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_true, y_pred, output_path="conf_matrix.png"):
    """Save a heatmap of the confusion matrix of ``y_true`` vs ``y_pred``.

    Parameters
    ----------
    y_true, y_pred
        Actual and predicted labels, passed to ``confusion_matrix``.
    output_path : str, default "conf_matrix.png"
        Where the figure is written.
    """
    matrix = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

    fig.tight_layout()
    fig.savefig(output_path)
    # Close the figure so it is neither displayed inline nor kept in memory.
    plt.close(fig)


In [66]:
from wordcloud import WordCloud

def plot_wordcloud(ngram_freq, output_path="wordcloud.png"):
    """Save a word-cloud image built from n-gram frequencies.

    Parameters
    ----------
    ngram_freq
        Anything ``dict()`` accepts that maps each n-gram to its frequency,
        e.g. a list of ``(ngram, count)`` pairs.
    output_path : str, default "wordcloud.png"
        Where the figure is written.
    """
    cloud = WordCloud(width=800, height=400, background_color='white')
    frequencies = dict(ngram_freq)
    cloud.generate_from_frequencies(frequencies)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis("off")

    fig.tight_layout()
    fig.savefig(output_path)
    # Close the figure so it is neither displayed inline nor kept in memory.
    plt.close(fig)


In [67]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from wordcloud import WordCloud

def plot_confusion_matrix(y_true, y_pred, output_path="conf_matrix.png"):
    """Write the confusion matrix of ``y_true`` vs ``y_pred`` to an image file.

    The matrix is drawn as an annotated blue heatmap titled
    'Confusion Matrix', with 'Predicted' on the x axis and 'Actual' on the
    y axis, and saved to ``output_path``.
    """
    # NOTE(review): plot_confusion_matrix is already defined in an earlier cell
    # with the same behavior; this definition shadows it.
    counts = confusion_matrix(y_true, y_pred)

    figure = plt.figure(figsize=(6, 5))
    heatmap_axes = sns.heatmap(counts, annot=True, fmt='d', cmap='Blues')
    heatmap_axes.set_title('Confusion Matrix')
    heatmap_axes.set_xlabel('Predicted')
    heatmap_axes.set_ylabel('Actual')

    figure.tight_layout()
    figure.savefig(output_path)
    plt.close(figure)

def plot_wordcloud(ngram_freq, output_path="wordcloud.png"):
    """Write a word cloud of the given n-gram frequencies to an image file.

    ``ngram_freq`` is converted with ``dict()`` and rendered as an 800x400
    cloud on a white background, then saved to ``output_path``.
    """
    # NOTE(review): plot_wordcloud is already defined in an earlier cell with
    # the same behavior; this definition shadows it.
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud.generate_from_frequencies(dict(ngram_freq))

    figure = plt.figure(figsize=(10, 5))
    axes = figure.gca()
    axes.imshow(cloud, interpolation='bilinear')
    axes.axis("off")

    figure.tight_layout()
    figure.savefig(output_path)
    plt.close(figure)


In [68]:
def mlflow_train_and_log(df, text_column='Cleaned', label_column='Label', experiment_name='Glassdoor NLP'):
    """Train a TF-IDF + Naive Bayes classifier and log the run to MLflow.

    Inside a single MLflow run this function splits ``df`` 80/20, fits the
    pipeline, logs accuracy and weighted precision/recall/F1, logs the model
    (with and without a signature), and logs three artifacts written to the
    working directory: ``processed_data.csv``, ``wordcloud.png`` and
    ``conf_matrix.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``text_column`` and ``label_column``.
    text_column : str, default 'Cleaned'
        Column holding the preprocessed text.
    label_column : str, default 'Label'
        Column holding the target labels.
    experiment_name : str, default 'Glassdoor NLP'
        MLflow experiment under which the run is recorded.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline.

    Raises
    ------
    KeyError
        If ``text_column`` or ``label_column`` is not a column of ``df``.
    ValueError
        If ``df`` has no rows.
    """
    # Validate before touching MLflow so that an unusable frame does not
    # create an experiment or leave a failed run behind.
    missing = [col for col in (text_column, label_column) if col not in df.columns]
    if missing:
        raise KeyError(
            f"DataFrame is missing required column(s) {missing}; "
            f"available columns: {list(df.columns)}"
        )
    if df.empty:
        raise ValueError("Cannot train a classifier on an empty DataFrame")

    import mlflow
    import mlflow.sklearn
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    from mlflow.models.signature import infer_signature

    mlflow.set_experiment(experiment_name)

    with mlflow.start_run():
        # Training
        X_train, X_test, y_train, y_test = train_test_split(df[text_column], df[label_column], test_size=0.2, random_state=42)

        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', MultinomialNB())
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        # Metrics
        mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))
        mlflow.log_metric("precision", precision_score(y_test, y_pred, average='weighted', zero_division=0))
        mlflow.log_metric("recall", recall_score(y_test, y_pred, average='weighted', zero_division=0))
        mlflow.log_metric("f1_score", f1_score(y_test, y_pred, average='weighted', zero_division=0))

        # Log the model, once plain and once with an inferred signature
        mlflow.sklearn.log_model(pipeline, "model")
        mlflow.sklearn.log_model(pipeline, "model_with_signature", signature=infer_signature(X_test, y_pred))

        # Save CSV: export only the columns that exist, so a frame without
        # 'Job Title' or 'Sentiment' no longer aborts the run; dict.fromkeys
        # drops a repeated name while keeping the order.
        export_columns = [
            col for col in dict.fromkeys(['Job Title', text_column, 'Sentiment'])
            if col in df.columns
        ]
        df[export_columns].to_csv("processed_data.csv", index=False)
        mlflow.log_artifact("processed_data.csv")

        # Visuals: word cloud and confusion matrix
        top_ngrams = get_ngrams(df[text_column])
        plot_wordcloud(top_ngrams[:50], "wordcloud.png")
        mlflow.log_artifact("wordcloud.png")

        plot_confusion_matrix(y_test, y_pred, "conf_matrix.png")
        mlflow.log_artifact("conf_matrix.png")

        print("✔️ MLflow run completed successfully.")
        return pipeline


In [72]:
# NOTE(review): this call fails with the KeyError recorded below. In the visible
# cells `df` is the raw scrape result: process_dataframe is never called on it,
# so there is no 'Cleaned' column, and no cell creates a 'Label' column either.
pipeline = mlflow_train_and_log(df, text_column='Cleaned', label_column='Label')

KeyError: 'Cleaned'

In [69]:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.metrics import confusion_matrix
import os

def plot_wordcloud(ngram_freq, output_path="wordcloud.png"):
    """Save a word-cloud image built from n-gram frequencies.

    Parameters
    ----------
    ngram_freq
        Anything ``dict()`` accepts that maps each n-gram to its frequency,
        e.g. a list of ``(ngram, count)`` pairs.
    output_path : str or path-like, default "wordcloud.png"
        Where the figure is written. Missing parent directories are created.

    Raises
    ------
    ValueError
        If ``ngram_freq`` contains no entries.
    """
    frequencies = dict(ngram_freq)
    if not frequencies:
        # Fail with an explicit message before any figure or file is created.
        raise ValueError("plot_wordcloud needs at least one n-gram; got an empty frequency table")

    # Create the destination folder when the path names one.
    if isinstance(output_path, (str, os.PathLike)):
        target_dir = os.path.dirname(output_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

    wordcloud = WordCloud(width=800, height=400, background_color='white')
    wordcloud.generate_from_frequencies(frequencies)

    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis("off")
        fig.tight_layout()
        fig.savefig(output_path)
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)

def plot_confusion_matrix(y_true, y_pred, output_path="conf_matrix.png"):
    """Save a labelled heatmap of the confusion matrix of ``y_true`` vs ``y_pred``.

    Parameters
    ----------
    y_true, y_pred
        Actual and predicted labels, passed to ``confusion_matrix``.
    output_path : str or path-like, default "conf_matrix.png"
        Where the figure is written. Missing parent directories are created.
    """
    cm = confusion_matrix(y_true, y_pred)

    # Label the axes with the class values instead of positional indices.
    # The labels are only used when their number matches the matrix size;
    # otherwise seaborn's default tick labels are kept.
    class_labels = np.unique(
        np.concatenate([np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()])
    )
    tick_labels = class_labels if len(class_labels) == cm.shape[0] else 'auto'

    # Create the destination folder when the path names one.
    if isinstance(output_path, (str, os.PathLike)):
        target_dir = os.path.dirname(output_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

    fig, ax = plt.subplots(figsize=(6, 5))
    try:
        sns.heatmap(
            cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_labels, yticklabels=tick_labels, ax=ax,
        )
        ax.set_title('Confusion Matrix')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        fig.tight_layout()
        fig.savefig(output_path)
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)


In [70]:
        # NOTE(review): this cell is a fragment that repeats the artifact-logging
        # statements found inside mlflow_train_and_log. At cell level
        # `text_column` is not defined, which is the NameError recorded below;
        # `y_test` and `y_pred` are likewise only assigned inside that function
        # in the visible cells.
        # Visual: word cloud of n-grams
        top_ngrams = get_ngrams(df[text_column])
        plot_wordcloud(top_ngrams[:50], "wordcloud.png")
        mlflow.log_artifact("wordcloud.png")

        # Visual: Confusion Matrix
        plot_confusion_matrix(y_test, y_pred, "conf_matrix.png")
        mlflow.log_artifact("conf_matrix.png")


NameError: name 'text_column' is not defined