In [11]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
import re
import string
import gc

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [12]:
#data ingestion
df = pd.read_csv("https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv")

In [13]:
df = df[df['sentiment'].isin(["happiness", "sadness"])]

In [14]:
#data preprocessing
def lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

def remove_stop_words(text):
    stop_words = set(stopwords.words("english"))
    return " ".join([word for word in text.split() if word not in stop_words])

def removing_numbers(text):
    return ''.join([char for char in text if not char.isdigit()])

def lower_case(text):
    return " ".join([word.lower() for word in text.split()])

def removing_punctuations(text):
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = text.replace('؛', "")
    return re.sub('\s+', ' ', text).strip()

def removing_urls(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_small_sentences(df):
    df['content'] = df['content'].apply(lambda x: np.nan if len(str(x).split()) < 3 else x)
    return df

def normalize_text(df):
    try:
        df['content'] = df['content'].apply(lower_case)
        df['content'] = df['content'].apply(remove_stop_words)
        df['content'] = df['content'].apply(removing_numbers)
        df['content'] = df['content'].apply(removing_punctuations)
        df['content'] = df['content'].apply(removing_urls)
        df['content'] = df['content'].apply(lemmatization)
        df = remove_small_sentences(df)
        return df.dropna(subset=['content'])
    except Exception as e:
        print(e)

In [15]:
df = normalize_text(df)

In [16]:
df['sentiment'].replace({
    'sadness':0,
    'happiness':1
}, inplace=True)

In [17]:
#log experiment on dagshub mlflow
import dagshub

dagshub.init(repo_owner='iamprashantjain', repo_name='mini_project', mlflow=True)
mlflow.set_tracking_uri("https://dagshub.com/iamprashantjain/mini_project.mlflow")
mlflow.set_experiment("BOW vs TFIDF")

<Experiment: artifact_location='mlflow-artifacts:/15b17dc73b2848e5acac2737b0800840', creation_time=1748408326228, experiment_id='1', last_update_time=1748408326228, lifecycle_stage='active', name='BOW vs TFIDF', tags={}>

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizers = {
    'bow':CountVectorizer(),
    'tfidf':TfidfVectorizer()
}


from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier


algorithms = {
    'logisticregression': LogisticRegression(solver='saga'),
    'multinomialnb': MultinomialNB(),
    'randomforest': RandomForestClassifier(),
    'xgboost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'gradientboosting': GradientBoostingClassifier()
}

In [19]:
for algo, algorithm in algorithms.items():
    for vec, vectorizer in vectorizers.items():
        print(f"{algo} - {vec}")

logisticregression - bow
logisticregression - tfidf
multinomialnb - bow
multinomialnb - tfidf
randomforest - bow
randomforest - tfidf
xgboost - bow
xgboost - tfidf
gradientboosting - bow
gradientboosting - tfidf


In [20]:
#start parent run
with mlflow.start_run(run_name='all_experiments') as parent_run:
    # loop through all algorithms for each vectorization method
    for algo, algorithm in algorithms.items():
        for vec, vectorizer in vectorizers.items():
            with mlflow.start_run(run_name=f"{algo} with {vec}", nested=True) as child_run:
                X = vectorizer.fit_transform(df['content'])
                y = df['sentiment']
                
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                
                #log vectorizer & params
                mlflow.log_param("vectorizer", vec)
                mlflow.log_param("algorithm", algo)
                mlflow.log_param("test_size", 0.2)
                
                #model training
                model = algorithm
                model.fit(X_train, y_train)
                
                #log model params
                if algo == "LogisticRegression":
                    mlflow.log_param("C", model.C)
                    
                elif algo == "MultinomialNB":
                    mlflow.log_param("alpha", model.alpha)
                    
                elif algo == "XGBoost":
                    mlflow.log_param("n_estimators", model.n_estimators)
                    mlflow.log_param("learnining_rate", model.learnining_rate)
                
                elif algo == "RandomForest":
                    mlflow.log_param("n_estimators", model.n_estimators)
                    mlflow.log_param("max_depth", model.max_depth)
                    
                elif algo == "GradientBoosting":
                    mlflow.log_param("n_estimators", model.n_estimators)
                    mlflow.log_param("learnining_rate", model.learnining_rate)
                    mlflow.log_param("max_depth", model.max_depth)
                    
                    
                #model evaluation
                y_pred = model.predict(X_test)
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred)
                recall = recall_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred)
                
                
                #log evaluation metrics
                mlflow.log_metric("accuracy", accuracy)
                mlflow.log_metric("precision", precision)
                mlflow.log_metric("recall", recall)
                mlflow.log_metric("f1", f1)
                
                
                #log model
                mlflow.sklearn.log_model(model, "model")
                
                
                #log notebook
                # import os
                # notebook_path = "exp1.ipynb"
                # os.system(f"jupyter nbconvert --to notebook --execute --inplace {notebook_path}")
                # mlflow.log_artifact(notebook_path)
                
                
                print("accuracy", accuracy)
                print("precision", precision)
                print("recall", recall)
                print("f1", f1)
                
                del model  # delete model object
                gc.collect()  # force garbage collection



accuracy 0.7813131313131313
precision 0.772189349112426
recall 0.7949238578680203
f1 0.783391695847924


2025/05/28 18:02:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run logisticregression with bow at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1/runs/71aa3a8e85914a1e89fb7fcd272d29c7.
2025/05/28 18:02:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1.


accuracy 0.7843434343434343
precision 0.7751479289940828
recall 0.7979695431472081
f1 0.7863931965982992


2025/05/28 18:02:32 INFO mlflow.tracking._tracking_service.client: 🏃 View run logisticregression with tfidf at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1/runs/337e768330d043e08526ed71f7f8cf2f.
2025/05/28 18:02:32 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1.


accuracy 0.7838383838383839
precision 0.7833163784333672
recall 0.7817258883248731
f1 0.782520325203252


2025/05/28 18:02:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run multinomialnb with bow at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1/runs/a15dbc0ad8c04085a172b16e4c60cc85.
2025/05/28 18:02:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1.


accuracy 0.7752525252525253
precision 0.765748031496063
recall 0.7898477157360406
f1 0.7776111944027986


2025/05/28 18:03:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run multinomialnb with tfidf at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1/runs/d45448c3f7eb47dda3d914391564de85.
2025/05/28 18:03:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1.


accuracy 0.7747474747474747
precision 0.7569113441372736
recall 0.8060913705583757
f1 0.7807276302851524


2025/05/28 18:06:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run randomforest with bow at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1/runs/916023a848a349f5a7f9dbba25cca988.
2025/05/28 18:06:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1.


accuracy 0.7631313131313131
precision 0.7371323529411765
recall 0.8142131979695432
f1 0.773757838880849


2025/05/28 18:09:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run randomforest with tfidf at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1/runs/d8e70cab07294b1e9a2caed0ada2695c.
2025/05/28 18:09:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1.
Parameters: { "use_label_encoder" } are not used.



accuracy 0.7575757575757576
precision 0.726457399103139
recall 0.8223350253807107
f1 0.7714285714285715


2025/05/28 18:10:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run xgboost with bow at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1/runs/9e6cdc02482d47e5b24b8c8c224078c3.
2025/05/28 18:10:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1.
Parameters: { "use_label_encoder" } are not used.



accuracy 0.7484848484848485
precision 0.714160070360598
recall 0.8243654822335026
f1 0.765315739868049


2025/05/28 18:10:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run xgboost with tfidf at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1/runs/fed89e8b1c1640df9ea5fe06d8877fd3.
2025/05/28 18:10:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1.


accuracy 0.7141414141414142
precision 0.6622773044151821
recall 0.868020304568528
f1 0.7513181019332161


2025/05/28 18:11:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run gradientboosting with bow at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1/runs/074ca5636dad41a298bf2cd624e0835c.
2025/05/28 18:11:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1.


accuracy 0.7116161616161616
precision 0.6592307692307692
recall 0.8700507614213198
f1 0.750109409190372


2025/05/28 18:13:08 INFO mlflow.tracking._tracking_service.client: 🏃 View run gradientboosting with tfidf at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1/runs/c9721d8a3a844d33b2c70c87bb3fedbc.
2025/05/28 18:13:08 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1.
2025/05/28 18:13:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run all_experiments at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1/runs/0b4d038dd1c643599271c02131eb96d1.
2025/05/28 18:13:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/mini_project.mlflow/#/experiments/1.
